Example #1
    def test_beam(self):
        x = [
            "( and ( got the walk ) ( got the talk ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
            "( and ( got the walk ) ( got talk the ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
            "( and ( got the walk ) ( got the walk ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
            "( and ( got the talk ) ( got the walk ) ( and ( got thatsmile ) ( got thatstyle ) ) )",
            "( too_bad ( she ( has ( a penis ) ) ) )"
        ]
        D = Vocab()
        for xe in x:
            for xes in xe.split():
                D.add_token(xes, seen=True)
        print(D.D)
        acc = TreeAccuracy(tensor2tree=partial(tensor2tree, D=D),
                           orderless={"and"})
        x = [[D[xes] for xes in xe.split()] for xe in x]
        # equalize dims
        maxlen = max([len(xe) for xe in x])
        x = [xe + [0] * (maxlen - len(xe)) for xe in x]
        x = torch.tensor(x)
        print(x)

        a = acc(None, x[torch.tensor([1, 4, 2, 3, 0])][None, :, :], x[0:1])
        print(a)
        self.assertTrue(a["tree_acc"] == 0)
        self.assertTrue(a["tree_acc_at1"] == 0)
        self.assertTrue(a["tree_acc_at2"] == 0)
        self.assertTrue(a["tree_acc_at3"] == 0)
        self.assertTrue(a["tree_acc_at4"] == 1)
        self.assertTrue(a["tree_acc_at5"] == 1)
        self.assertTrue(a["tree_acc_at_last"] == 1)
Example #2
def build_vocab_from_pcfg(pcfg, min_freq=0, top_k=np.infty)->Vocab:
    vocab = Vocab()
    vocab.add_token("(")
    vocab.add_token(")")
    for rule in pcfg.productions():
        vocab.add_token(str(rule.lhs()))
        for rhse in rule.rhs():
            vocab.add_token(str(rhse))
    vocab.finalize(min_freq=min_freq, top_k=top_k)
    return vocab
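
A hedged usage sketch for the function above, assuming an NLTK-style grammar whose productions() expose lhs()/rhs(), and assuming the Vocab class used throughout these examples is importable from the surrounding project:

import nltk

pcfg = nltk.PCFG.fromstring("""
    S  -> NP VP [1.0]
    NP -> 'she' [1.0]
    VP -> 'walks' [0.5] | 'talks' [0.5]
""")
vocab = build_vocab_from_pcfg(pcfg, min_freq=0)
print(vocab.number_of_ids())   # parentheses plus all grammar symbols and terminals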
Example #3
def tensor_to_trees(x, vocab: Vocab):
    xstrs = [
        vocab.tostr(x[i]).replace("@START@", "") for i in range(len(x))
    ]
    xstrs = [re.sub(r"::\d+", "", xstr) for xstr in xstrs]
    trees = []
    for xstr in xstrs:
        # drop everything after @END@, if present
        xstr = xstr.split("@END@")
        xstr = xstr[0]
        # add an opening parenthesis if it is not there
        xstr = xstr.strip()
        if len(xstr) == 0 or xstr[0] != "(":
            xstr = "(" + xstr
        # balance parentheses
        parenthese_imbalance = xstr.count("(") - xstr.count(")")
        xstr = xstr + ")" * max(0, parenthese_imbalance)   # append missing closing parentheses
        xstr = "(" * -min(0, parenthese_imbalance) + xstr  # prepend missing opening parentheses
        try:
            tree = taglisp_to_tree(xstr)
            if isinstance(tree, tuple) and len(tree) == 2 and tree[0] is None:
                tree = None
        except Exception as e:
            tree = None
        trees.append(tree)
    return trees
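
A short worked illustration of the parenthesis-repair step above on a truncated prediction (plain string operations, independent of the rest of the function):

xstr = "( and ( got the walk ) ( got the talk"
imbalance = xstr.count("(") - xstr.count(")")   # 3 opening vs 1 closing -> 2
xstr = xstr + ")" * max(0, imbalance)           # append the two missing ")"
xstr = "(" * -min(0, imbalance) + xstr          # nothing to prepend in this case
print(xstr)   # ( and ( got the walk ) ( got the talk))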
Example #4
def load_ds(domain="restaurants",
            min_freq=0,
            top_k=np.infty,
            nl_mode="bart-large",
            trainonvalid=False):
    ds = OvernightDatasetLoader(simplify_mode="light").load(
        domain=domain, trainonvalid=trainonvalid)

    seqenc_vocab = Vocab(padid=1, startid=0, endid=2, unkid=UNKID)
    seqenc = SequenceEncoder(vocab=seqenc_vocab,
                             tokenizer=tree_to_lisp_tokens,
                             add_start_token=True,
                             add_end_token=True)
    for example in ds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=example[2] == "train")
    seqenc.finalize_vocab(min_freq=min_freq, top_k=top_k)

    nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode)

    def tokenize(x):
        ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0],
               seqenc.convert(x[1], return_what="tensor"), x[2], x[0], x[1])
        return ret
    tds, vds, xds = ds[(None, None, "train")].map(tokenize), \
                    ds[(None, None, "valid")].map(tokenize), \
                    ds[(None, None, "test")].map(tokenize)
    return tds, vds, xds, nl_tokenizer, seqenc
Example #5
    def __init__(self, dim, vocab:Vocab=None, numlayers:int=6, numheads:int=6,
                 dropout:float=0., maxpos=512, bertname="bert-base-uncased", **kw):
        super(TransformerTagger, self).__init__(**kw)
        self.vocab = vocab
        self.vocabsize = vocab.number_of_ids()
        self.dim = dim
        config = TransformerConfig(vocab_size=self.vocabsize, d_model=self.dim, d_ff=self.dim * 4,
                                   num_layers=numlayers, num_heads=numheads, dropout_rate=dropout)

        decoder_config = deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = RelativePositionTransformer(decoder_config)

        self.out = torch.nn.Linear(self.dim, self.vocabsize)

        vocab_mask = torch.ones(self.vocabsize)
        for excl_token in self.exclude:
            if excl_token in self.vocab:
                vocab_mask[self.vocab[excl_token]] = 0
        self.register_buffer("vocab_mask", vocab_mask)

        self.bertname = bertname
        self.bert_model = BertModel.from_pretrained(self.bertname)
        def set_dropout(m:torch.nn.Module):
            if isinstance(m, torch.nn.Dropout):
                m.p = dropout
        self.bert_model.apply(set_dropout)

        self.adapter = None
        if self.bert_model.config.hidden_size != decoder_config.d_model:
            self.adapter = torch.nn.Linear(self.bert_model.config.hidden_size, decoder_config.d_model, bias=False)

        self.reset_parameters()
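
The vocab_mask buffer above marks tokens that should never be predicted. Below is a standalone sketch of the same pattern with plain torch and hypothetical token ids; adding the log of the mask to the logits is one common way to apply such a mask, not necessarily what this model does in its forward pass.

import torch

vocabsize = 10
exclude_ids = [0, 3]                     # hypothetical: e.g. padding and a reserved token
vocab_mask = torch.ones(vocabsize)
vocab_mask[torch.tensor(exclude_ids)] = 0

logits = torch.randn(2, vocabsize)
masked_logits = logits + torch.log(vocab_mask)   # excluded positions become -inf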
Example #6
    def build_copy_maps(self,
                        inp_vocab: Vocab,
                        str_action_re=re.compile(r"^([^_].*)$")):
        self.inp_vocab = inp_vocab
        self.register_buffer(
            "_inp_to_act",
            torch.zeros(inp_vocab.number_of_ids(), dtype=torch.long))
        self.register_buffer(
            "_act_to_inp",
            torch.zeros(self.out_vocab.number_of_ids(), dtype=torch.long))

        # for COPY, initialize mapping from input node vocab (sgb.vocab) to output action vocab (qgb.vocab_actions)
        self._build_copy_maps(str_action_re=str_action_re)

        # compute action mask from input: actions that are doable using input copy actions are 1, others are 0
        actmask = torch.zeros(self.out_vocab.number_of_ids(),
                              dtype=torch.uint8)
        actmask.index_fill_(0, self._inp_to_act, 1)
        actmask[0] = 0
        self.register_buffer("_inp_actmask", actmask)

        # rare actions
        self.rare_token_ids = self.out_vocab.rare_ids
        self.register_buffer("gen_mask", None)
        if len(self.rare_token_ids) > 0:
            gen_mask = torch.ones(self.out_vocab.number_of_ids())
            for rare_token_id in self.rare_token_ids:
                gen_mask[rare_token_id] = 0
            self.register_buffer("gen_mask", gen_mask)
Example #7
    def __init__(self,
                 dim,
                 vocab: Vocab = None,
                 inpvocab: Vocab = None,
                 numlayers: int = 6,
                 mode="normal",
                 dropout: float = 0.,
                 worddropout: float = 0.,
                 **kw):
        super(GRUDecoderCell, self).__init__(**kw)
        self.vocab = vocab
        self.inpvocab = inpvocab
        self.vocabsize = vocab.number_of_ids()
        self.dim = dim
        self.mode = mode

        self.dec_emb = torch.nn.Embedding(self.vocabsize + 3, self.dim)
        dims = [self.dim + self.dim] + [self.dim for _ in range(numlayers)]
        self.dec_stack = torch.nn.ModuleList(
            [torch.nn.GRUCell(dims[i], dims[i + 1]) for i in range(numlayers)])
        self.dropout = torch.nn.Dropout(dropout)
        self.attn_linQ = None
        self.attn_linK = None
        self.attn_linV = None
        # self.attn_linQ = torch.nn.Linear(self.dim, self.dim)
        # self.attn_linK = torch.nn.Linear(self.dim, self.dim)
        # self.attn_linV = torch.nn.Linear(self.dim, self.dim)

        self.preout = torch.nn.Linear(self.dim + self.dim, self.dim)
        self.out = torch.nn.Linear(self.dim, self.vocabsize + 3)

        inpvocabsize = inpvocab.number_of_ids()
        self.encoder_model = Encoder(inpvocabsize + 5,
                                     self.dim,
                                     int(self.dim / 2),
                                     num_layers=numlayers,
                                     dropout=dropout)

        self.adapter = None
        self.inpworddropout = WordDropout(
            worddropout, self.inpvocab[self.inpvocab.masktoken],
            [self.inpvocab[self.inpvocab.padtoken]])
        self.worddropout = WordDropout(worddropout,
                                       self.vocab[self.vocab.masktoken],
                                       [self.vocab[self.vocab.padtoken]])

        self.reset_parameters()
Example #8
    def _initialize(self, p, xlmr, min_freq:int):
        self.data = {}
        self.xlmr = xlmr
        self.xlmr_vocab = Vocab()
        self.xlmr_vocab.set_dict(xlmr.model.decoder.dictionary.indices)
        self.sentence_encoder = SequenceEncoder(lambda x: f"<s> {xlmr.bpe.encode(x)} </s>".split(), vocab=self.xlmr_vocab)
        trainlines = [x for x in ujson.load(open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))]
        testlines = [x for x in ujson.load(open(os.path.join(p, f"geo-{self.test_lang}.json"), "r"))]
        trainlines = [x for x in trainlines if x["split"] == "train"]
        testlines = [x for x in testlines if x["split"] == "test"]
        if self.cvfolds is None:
            splits = ["train"]*len(trainlines) + ["test"] * len(testlines)
        else:
            cvsplit_len = len(trainlines)/self.cvfolds
            splits = []
            for i in range(0, self.cvfolds):
                splits += [i] * round(cvsplit_len * (i+1) - len(splits))
            random.shuffle(splits)
            splits = ["valid" if x == self.testfold else "train" for x in splits]
            splits = splits + ["test"] * len(testlines)
        questions = [x["nl"] for x in trainlines]
        queries = [x["mrl"] for x in trainlines]
        xquestions = [x["nl"] for x in testlines]
        xqueries = [x["mrl"] for x in testlines]
        questions += xquestions
        queries += xqueries

        # initialize output vocabulary
        outvocab = Vocab()
        # for token, bertid in self.xlmr_vocab.D.items():
        #     outvocab.add_token(token, seen=False)

        self.query_encoder = SequenceEncoder(tokenizer=partial(basic_query_tokenizer, strtok=lambda x: xlmr.bpe.encode(x).split()), vocab=outvocab, add_end_token=True)

        # build vocabularies
        for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
            question_tokens = self.sentence_encoder.convert(question, return_what="tokens")[0]
            for token in question_tokens:
                self.query_encoder.vocab.add_token(token, seen=False)
            self.query_encoder.inc_build_vocab(query, seen=split=="train")
        keeptokens = set(self.xlmr_vocab.D.keys())
        self.query_encoder.finalize_vocab(min_freq=min_freq, keep_tokens=keeptokens)

        token_specs = self.build_token_specs(queries)
        self.token_specs = token_specs

        self.build_data(questions, queries, splits)
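
A worked illustration of the cross-validation fold assignment used above: the running round(...) expression spreads the training examples as evenly as possible over the folds, here 10 examples over 3 folds before shuffling.

cvfolds, n_train = 3, 10
cvsplit_len = n_train / cvfolds            # 3.33...
splits = []
for i in range(cvfolds):
    splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
print(splits)   # [0, 0, 0, 1, 1, 1, 1, 2, 2, 2]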
Example #9
    def __init__(self,
                 dim,
                 vocab: Vocab = None,
                 numlayers: int = 6,
                 numheads: int = 6,
                 dropout: float = 0.,
                 maxpos=512,
                 bertname="bert-base-uncased",
                 baseline=False,
                 **kw):
        super(TransformerTagger, self).__init__(**kw)
        self.vocab = vocab
        self.vocabsize = vocab.number_of_ids()
        self.dim = dim
        self.baseline = baseline
        config = TransformerConfig(vocab_size=self.vocabsize,
                                   d_model=self.dim,
                                   d_ff=self.dim * 4,
                                   num_layers=numlayers,
                                   num_heads=numheads,
                                   dropout_rate=dropout,
                                   use_relative_position=False)

        self.emb = torch.nn.Embedding(config.vocab_size, config.d_model)
        self.posemb = torch.nn.Embedding(maxpos, config.d_model)
        decoder_config = deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.use_causal_mask = baseline
        self.decoder = TransformerStack(decoder_config)

        if baseline:
            self.out = torch.nn.Linear(self.dim, self.vocabsize)
        else:
            self.out = torch.nn.Linear(self.dim * 2, self.vocabsize)
        # self.out = MOS(self.dim, self.vocabsize, K=mosk)

        vocab_mask = torch.ones(self.vocabsize)
        # for excl_token in self.exclude:
        #     if excl_token in self.vocab:
        #         vocab_mask[self.vocab[excl_token]] = 0
        self.register_buffer("vocab_mask", vocab_mask)

        self.bertname = bertname
        self.bert_model = BertModel.from_pretrained(self.bertname)
        # def set_dropout(m:torch.nn.Module):
        #     if isinstance(m, torch.nn.Dropout):
        #         m.p = dropout
        # self.bert_model.apply(set_dropout)

        self.adapter = None
        if self.bert_model.config.hidden_size != decoder_config.d_model:
            self.adapter = torch.nn.Linear(self.bert_model.config.hidden_size,
                                           decoder_config.d_model,
                                           bias=False)

        self.reset_parameters()
Example #10
    def __init__(self, h_dim: int, vocab: Vocab = None, **kw):
        super(_PtrGenOutput, self).__init__(**kw)
        # initialize modules
        self.gen_lin = torch.nn.Linear(h_dim, vocab.number_of_ids(), bias=True)
        self.copy_or_gen = torch.nn.Linear(h_dim, 2, bias=True)
        self.sm = torch.nn.Softmax(-1)
        self.logsm = torch.nn.LogSoftmax(-1)

        self.inp_vocab, self.out_vocab = None, vocab

        self.naningrad = torch.nn.Parameter(torch.zeros(1))
        self.naningrad2 = torch.nn.Parameter(torch.zeros(1))
Example #11
    def __init__(self,
                 h_dim: int,
                 inp_vocab: Vocab = None,
                 out_vocab: Vocab = None,
                 **kw):
        super(SumPtrGenOutputOLD, self).__init__(**kw)
        # initialize modules
        self.gen_lin = torch.nn.Linear(h_dim,
                                       out_vocab.number_of_ids(),
                                       bias=True)
        self.sm = torch.nn.Softmax(-1)
        self.logsm = torch.nn.LogSoftmax(-1)

        self.inp_vocab, self.out_vocab = inp_vocab, out_vocab

        self.register_buffer(
            "_inp_to_act",
            torch.zeros(self.inp_vocab.number_of_ids(), dtype=torch.long))
        self.register_buffer(
            "_act_from_inp",
            torch.zeros(out_vocab.number_of_ids(), dtype=torch.long))

        # for COPY, initialize mapping from input node vocab (sgb.vocab) to output action vocab (qgb.vocab_actions)
        self.build_copy_maps()

        # compute action mask from input: actions that are doable using input copy actions are 1, others are 0
        actmask = torch.zeros(out_vocab.number_of_ids(), dtype=torch.uint8)
        actmask.index_fill_(0, self._inp_to_act, 1)
        self.register_buffer("_inp_actmask", actmask)

        # rare actions
        self.rare_token_ids = out_vocab.rare_ids
        rare_id = 1
        if len(self.rare_token_ids) > 0:
            out_map = torch.arange(self.out_vocab.number_of_ids())
            for rare_token_id in self.rare_token_ids:
                out_map[rare_token_id] = rare_id
            self.register_buffer("out_map", out_map)
        else:
            self.register_buffer("out_map", None)
Example #12
    def _initialize(self, p, bert_tokenizer, min_freq: int):
        self.data = {}
        self.bert_vocab = Vocab()
        self.bert_vocab.set_dict(bert_tokenizer.vocab)
        self.sentence_encoder = SequenceEncoder(
            lambda x: bert_tokenizer.tokenize(f"[CLS] {x} [SEP]"),
            vocab=self.bert_vocab)
        trainlines = [
            x for x in ujson.load(
                open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))
        ]
        testlines = [
            x for x in ujson.load(
                open(os.path.join(p, f"geo-{self.test_lang}.json"), "r"))
        ]
        trainlines = [x for x in trainlines if x["split"] == "train"]
        testlines = [x for x in testlines if x["split"] == "test"]
        if self.cvfolds is None:
            splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
        else:
            cvsplit_len = len(trainlines) / self.cvfolds
            splits = []
            for i in range(0, self.cvfolds):
                splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
            random.shuffle(splits)
            splits = [
                "valid" if x == self.testfold else "train" for x in splits
            ]
            splits = splits + ["test"] * len(testlines)
        questions = [x["nl"] for x in trainlines]
        queries = [x["mrl"] for x in trainlines]
        xquestions = [x["nl"] for x in testlines]
        xqueries = [x["mrl"] for x in testlines]
        questions += xquestions
        queries += xqueries

        # initialize output vocabulary
        outvocab = Vocab()
        for token, bertid in self.bert_vocab.D.items():
            outvocab.add_token(token, seen=False)

        self.query_encoder = SequenceEncoder(tokenizer=partial(
            basic_query_tokenizer, strtok=bert_tokenizer),
                                             vocab=outvocab,
                                             add_end_token=True)

        # build vocabularies
        for i, (question, query,
                split) in enumerate(zip(questions, queries, splits)):
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        keeptokens = set(self.bert_vocab.D.keys())
        self.query_encoder.finalize_vocab(min_freq=min_freq,
                                          keep_tokens=keeptokens)

        token_specs = self.build_token_specs(queries)
        self.token_specs = token_specs

        self.build_data(questions, queries, splits)
Example #13
    def test_normal(self):
        x = [
            "( and ( has service ) ( has money ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
            "( and ( has service ) ( has service ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
            "( and ( has money ) ( has service ) ( and ( got thatsmile ) ( got thatstyle ) ) )"
        ]
        D = Vocab()
        for xe in x:
            for xes in xe.split():
                D.add_token(xes, seen=True)
        print(D.D)
        acc = TreeAccuracy(tensor2tree=partial(tensor2tree, D=D),
                           orderless={"and"})
        x = [[D[xes] for xes in xe.split()] for xe in x]
        x = torch.tensor(x)
        print(x)

        a = acc(None, x[0:1], x[1:2])
        self.assertEqual(a["tree_acc"], 0)
        print(a)
        a = acc(None, x[0:1], x[2:3])
        self.assertEqual(a["tree_acc"], 1.)
        print(a)
Example #14
    def __init__(self,
                 h_dim: int,
                 vocab: Vocab = None,
                 dropout: float = 0.,
                 **kw):
        super(BasicGenOutput, self).__init__(**kw)
        self.gen_lin = torch.nn.Linear(h_dim, vocab.number_of_ids(), bias=True)
        self.sm = torch.nn.Softmax(-1)
        self.logsm = torch.nn.LogSoftmax(-1)
        self.dropout = torch.nn.Dropout(dropout)

        self.vocab = vocab

        # rare output tokens
        self.rare_token_ids = vocab.rare_ids
        if len(self.rare_token_ids) > 0:
            out_mask = torch.ones(self.vocab.number_of_ids())
            for rare_token_id in self.rare_token_ids:
                out_mask[rare_token_id] = 0
            self.register_buffer("out_mask", out_mask)
        else:
            self.register_buffer("out_mask", None)
Example #15
def load_ds(traindomains=("restaurants",),
            testdomain="housing",
            min_freq=1,
            mincoverage=1,
            top_k=np.infty,
            nl_mode="bert-base-uncased",
            fullsimplify=False,
            onlyabstract=False,
            pretrainsetting="all+lex",    # "all", "lex" or "all+lex"
            finetunesetting="lex",        # "lex", "all", "min"
            ):
    """
    :param traindomains:
    :param testdomain:
    :param min_freq:
    :param mincoverage:
    :param top_k:
    :param nl_mode:
    :param fullsimplify:
    :param onlyabstract:
    :param pretrainsetting:     "all": use all examples from every domain
                                "lex": use only lexical examples
                                "all+lex": use both
    :param finetunesetting:     "lex": use lexical examples
                                "all": use all training examples
                                "min": use minimal lexicon-covering set of examples
                            ! Test is always over the same original test set.
                            ! Validation is over a fraction of training data
    :return:
    """
    general_tokens = {
        "(", ")", "arg:~type", "arg:type", "op:and", "SW:concat", "cond:has",
        "arg:<=", "arg:<", "arg:>=", "arg:>", "arg:!=", "arg:=", "SW:superlative",
        "SW:CNT-arg:min", "SW:CNT-arg:<", "SW:CNT-arg:<=", "SW:CNT-arg:>=", "SW:CNT-arg:>",
        "SW:CNT-arg:max", "SW:CNT-arg:=", "arg:max",
    }

    def tokenize_and_add_start(t):
        tokens = tree_to_lisp_tokens(t)
        starttok = "@START@"
        tokens = [starttok] + tokens
        return tokens

    sourceex = []
    for traindomain in traindomains:
        ds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True,
                                    restore_reverse=DATA_RESTORE_REVERSE, validfrac=.10)\
            .load(domain=traindomain)
        sourceex += ds[(None, None, lambda x: x in ("train", "valid", "lexicon"))].map(lambda x: (x[0], x[1], x[2], traindomain)).examples       # don't use test examples

    testds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE)\
        .load(domain=testdomain)

    targetex = testds.map(lambda x: x + (testdomain,)).examples

    pretrainex = []
    if "all" in pretrainsetting.split("+"):
        pretrainex += [(a, tokenize_and_add_start(b), "pretrain", d) for a, b, c, d in sourceex if c == "train"]
    if "lex" in pretrainsetting.split("+"):
        pretrainex += [(a, tokenize_and_add_start(b), "pretrain", d) for a, b, c, d in sourceex if c == "lexicon"]

    pretrainvalidex = [(a, tokenize_and_add_start(b), "pretrainvalid", d) for a, b, c, d in sourceex if c == "valid"]

    if finetunesetting == "all":
        finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in targetex if c == "train"]
    elif finetunesetting == "lex":
        finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in targetex if c == "lexicon"]
    elif finetunesetting == "min":
        finetunetrainex = get_maximum_spanning_examples([(a, b, c, d) for a, b, c, d in targetex if c == "train"],
                                      mincoverage=mincoverage,
                                      loadedex=[e for e in pretrainex if e[2] == "pretrain"])
        finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in finetunetrainex]
    finetunevalidex = [(a, tokenize_and_add_start(b), "ftvalid", d) for a, b, c, d in targetex if c == "valid"]
    finetunetestex = [(a, tokenize_and_add_start(b), "fttest", d) for a, b, c, d in targetex if c == "test"]
    print(f"Using mode \"{finetunesetting}\" for finetuning data: "
          f"\n\t{len(finetunetrainex)} training examples")


    allex = pretrainex + pretrainvalidex + finetunetrainex + finetunevalidex + finetunetestex
    ds = Dataset(allex)

    if onlyabstract:
        et = get_lf_abstract_transform(ds[lambda x: x[3] != testdomain].examples)
        ds = ds.map(lambda x: (x[0], et(x[1]), x[2], x[3]))

    seqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID)
    seqenc = SequenceEncoder(vocab=seqenc_vocab, tokenizer=lambda x: x,
                             add_start_token=False, add_end_token=True)
    for example in ds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=example[2] in ("pretrain", "fttrain"))
    seqenc.finalize_vocab(min_freq=min_freq, top_k=top_k)

    generaltokenmask = torch.zeros(seqenc_vocab.number_of_ids(), dtype=torch.long)
    for token, tokenid in seqenc_vocab.D.items():
        if token in general_tokens:
            generaltokenmask[tokenid] = 1

    nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode)
    def tokenize(x):
        ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0],
               seqenc.convert(x[1], return_what="tensor"),
               x[2],
               x[0], x[1], x[3])
        return ret
    tds, ftds, vds, fvds, xds = ds[(None, None, "pretrain", None)].map(tokenize), \
                          ds[(None, None, "fttrain", None)].map(tokenize), \
                          ds[(None, None, "pretrainvalid", None)].map(tokenize), \
                          ds[(None, None, "ftvalid", None)].map(tokenize), \
                          ds[(None, None, "fttest", None)].map(tokenize)
    return tds, ftds, vds, fvds, xds, nl_tokenizer, seqenc, generaltokenmask
Example #16
def load_ds(domain="restaurants",
            nl_mode="bert-base-uncased",
            trainonvalid=False,
            noreorder=False):
    """
    Creates a dataset of examples which have
    * NL question and tensor
    * original FL tree
    * reduced FL tree with slots (this is randomly generated)
    * tensor corresponding to reduced FL tree with slots
    * mask specifying which elements in reduced FL tree are terminated
    * 2D gold that specifies whether a token/action is in gold for every position (compatibility with MML!)
    """
    orderless = {"op:and", "SW:concat"}  # only use in eval!!

    ds = OvernightDatasetLoader().load(domain=domain,
                                       trainonvalid=trainonvalid)
    ds = ds.map(lambda x: (x[0], ATree("@START@", [x[1]]), x[2]))

    if not noreorder:
        ds = ds.map(lambda x:
                    (x[0], reorder_tree(x[1], orderless=orderless), x[2]))

    vocab = Vocab(padid=0, startid=2, endid=3, unkid=1)
    vocab.add_token("@START@", seen=np.infty)
    vocab.add_token(
        "@CLOSE@", seen=np.infty
    )  # only here for the action of closing an open position, will not be seen at input
    vocab.add_token(
        "@OPEN@", seen=np.infty
    )  # only here for the action of opening a closed position, will not be seen at input
    vocab.add_token(
        "@REMOVE@", seen=np.infty
    )  # only here for deletion operations, won't be seen at input
    vocab.add_token(
        "@REMOVESUBTREE@", seen=np.infty
    )  # only here for deletion operations, won't be seen at input
    vocab.add_token("@SLOT@",
                    seen=np.infty)  # will be seen at input, can't be produced!

    nl_tokenizer = BertTokenizer.from_pretrained(nl_mode)
    # for tok, idd in nl_tokenizer.vocab.items():
    #     vocab.add_token(tok, seen=np.infty)          # all wordpieces are added for possible later generation

    tds, vds, xds = ds[lambda x: x[2] == "train"], \
                    ds[lambda x: x[2] == "valid"], \
                    ds[lambda x: x[2] == "test"]

    seqenc = SequenceEncoder(
        vocab=vocab,
        tokenizer=lambda x: extract_info(x, onlytokens=True),
        add_start_token=False,
        add_end_token=False)
    for example in tds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=True)
    for example in vds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=False)
    for example in xds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=False)
    seqenc.finalize_vocab(min_freq=0)

    def mapper(x):
        nl = x[0]
        fl = x[1]
        fltoks = extract_info(fl, onlytokens=True)
        seq = seqenc.convert(fltoks, return_what="tensor")
        ret = (nl_tokenizer.encode(nl, return_tensors="pt")[0], seq)
        return ret

    tds_seq = tds.map(mapper)
    vds_seq = vds.map(mapper)
    xds_seq = xds.map(mapper)
    return tds_seq, vds_seq, xds_seq, nl_tokenizer, seqenc, orderless
Example #17
class GeoDataset(object):
    def __init__(self,
                 p="../../datasets/geo880_multiling/geoquery/",
                 train_lang="en",
                 test_lang=None,
                 bert_tokenizer=None,
                 min_freq: int = 2,
                 cvfolds=None,
                 testfold=None,
                 **kw):
        super(GeoDataset, self).__init__(**kw)
        self.train_lang = train_lang
        self.test_lang = test_lang if test_lang is not None else train_lang
        self.cvfolds, self.testfold = cvfolds, testfold
        self._initialize(p, bert_tokenizer, min_freq)

    def _initialize(self, p, bert_tokenizer, min_freq: int):
        self.data = {}
        self.bert_vocab = Vocab()
        self.bert_vocab.set_dict(bert_tokenizer.vocab)
        self.sentence_encoder = SequenceEncoder(
            lambda x: bert_tokenizer.tokenize(f"[CLS] {x} [SEP]"),
            vocab=self.bert_vocab)
        trainlines = [
            x for x in ujson.load(
                open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))
        ]
        testlines = [
            x for x in ujson.load(
                open(os.path.join(p, f"geo-{self.test_lang}.json"), "r"))
        ]
        trainlines = [x for x in trainlines if x["split"] == "train"]
        testlines = [x for x in testlines if x["split"] == "test"]
        if self.cvfolds is None:
            splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
        else:
            cvsplit_len = len(trainlines) / self.cvfolds
            splits = []
            for i in range(0, self.cvfolds):
                splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
            random.shuffle(splits)
            splits = [
                "valid" if x == self.testfold else "train" for x in splits
            ]
            splits = splits + ["test"] * len(testlines)
        questions = [x["nl"] for x in trainlines]
        queries = [x["mrl"] for x in trainlines]
        xquestions = [x["nl"] for x in testlines]
        xqueries = [x["mrl"] for x in testlines]
        questions += xquestions
        queries += xqueries

        # initialize output vocabulary
        outvocab = Vocab()
        for token, bertid in self.bert_vocab.D.items():
            outvocab.add_token(token, seen=False)

        self.query_encoder = SequenceEncoder(tokenizer=partial(
            basic_query_tokenizer, strtok=bert_tokenizer),
                                             vocab=outvocab,
                                             add_end_token=True)

        # build vocabularies
        for i, (question, query,
                split) in enumerate(zip(questions, queries, splits)):
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        keeptokens = set(self.bert_vocab.D.keys())
        self.query_encoder.finalize_vocab(min_freq=min_freq,
                                          keep_tokens=keeptokens)

        token_specs = self.build_token_specs(queries)
        self.token_specs = token_specs

        self.build_data(questions, queries, splits)

    def build_token_specs(self, outputs: Iterable[str]):
        token_specs = dict()

        def walk_the_tree(t, _ts):
            l = t.label()
            if l not in _ts:
                _ts[l] = [np.infty, -np.infty]
            minc, maxc = _ts[l]
            _ts[l] = [min(minc, len(t)), max(maxc, len(t))]
            for c in t:
                walk_the_tree(c, _ts)

        for out in outputs:
            out_tokens = self.query_encoder.convert(out,
                                                    return_what="tokens")[0]
            assert (out_tokens[-1] == "@END@")
            out_tokens = out_tokens[:-1]
            out_str = " ".join(out_tokens)
            tree = lisp_to_tree(out_str)
            walk_the_tree(tree, token_specs)

        # token_specs["and"][1] = np.infty

        return token_specs
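
To make the min/max-arity bookkeeping in walk_the_tree concrete, here is an illustrative sketch using an nltk.Tree (the project's lisp_to_tree presumably yields a compatible tree interface; the isinstance guard for string leaves is an addition for this sketch only).

from nltk import Tree
import numpy as np

token_specs = {}

def walk_the_tree(t, _ts):
    l = t.label()
    if l not in _ts:
        _ts[l] = [np.inf, -np.inf]
    minc, maxc = _ts[l]
    _ts[l] = [min(minc, len(t)), max(maxc, len(t))]
    for c in t:
        if isinstance(c, Tree):        # skip string leaves in this sketch
            walk_the_tree(c, _ts)

walk_the_tree(Tree.fromstring("(and (got walk) (got talk) (got style))"), token_specs)
print(token_specs)   # {'and': [3, 3], 'got': [1, 1]}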

    def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
                   splits: Iterable[str]):
        maxlen_in, maxlen_out = 0, 0
        for inp, out, split in zip(inputs, outputs, splits):
            # tokenize both input and output
            inp_tokens = self.sentence_encoder.convert(inp,
                                                       return_what="tokens")[0]
            out_tokens = self.query_encoder.convert(out,
                                                    return_what="tokens")[0]
            # get gold tree
            gold_tree = lisp_to_tree(" ".join(out_tokens[:-1]))
            assert (gold_tree is not None)
            # replace words in output that can't be copied from given input to UNK tokens
            unktoken = self.query_encoder.vocab.unktoken
            inp_tokens_ = set(inp_tokens)
            out_tokens = [
                out_token if out_token in inp_tokens_ or
                (out_token in self.query_encoder.vocab
                 and not out_token in self.query_encoder.vocab.rare_tokens)
                else unktoken for out_token in out_tokens
            ]
            # convert token sequences to ids
            inp_tensor = self.sentence_encoder.convert(inp_tokens,
                                                       return_what="tensor")[0]
            out_tensor = self.query_encoder.convert(out_tokens,
                                                    return_what="tensor")[0]

            state = TreeDecoderState([inp], [gold_tree],
                                     inp_tensor[None, :],
                                     out_tensor[None, :], [inp_tokens],
                                     [out_tokens],
                                     self.sentence_encoder.vocab,
                                     self.query_encoder.vocab,
                                     token_specs=self.token_specs)

            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            maxlen_in = max(maxlen_in, len(inp_tokens))
            maxlen_out = max(maxlen_out, len(out_tensor))
        self.maxlen_input = maxlen_in
        self.maxlen_output = maxlen_out

    def get_split(self, split: str):
        data = []
        for split_e in split.split("+"):
            data += self.data[split_e]
        return DatasetSplitProxy(data)

    @staticmethod
    def collate_fn(data: Iterable):
        goldmaxlen = 0
        inpmaxlen = 0
        data = [state.make_copy(detach=True, deep=True) for state in data]
        for state in data:
            goldmaxlen = max(goldmaxlen, state.gold_tensor.size(1))
            inpmaxlen = max(inpmaxlen, state.inp_tensor.size(1))
        for state in data:
            state.gold_tensor = torch.cat([
                state.gold_tensor,
                state.gold_tensor.new_zeros(
                    1, goldmaxlen - state.gold_tensor.size(1))
            ], 1)
            state.inp_tensor = torch.cat([
                state.inp_tensor,
                state.inp_tensor.new_zeros(
                    1, inpmaxlen - state.inp_tensor.size(1))
            ], 1)
        ret = data[0].merge(data)
        return ret

    def dataloader(self, split: str = None, batsize: int = 5, shuffle=None):
        if split is None:  # return all splits
            ret = {}
            for split in self.data.keys():
                ret[split] = self.dataloader(batsize=batsize,
                                             split=split,
                                             shuffle=shuffle)
            return ret
        else:
            # assert(split in self.data.keys())
            shuffle = shuffle if shuffle is not None else split in (
                "train", "train+valid")
            dl = DataLoader(self.get_split(split),
                            batch_size=batsize,
                            shuffle=shuffle,
                            collate_fn=type(self).collate_fn)
            return dl
Example #18
def load_ds(traindomains=("restaurants", ),
            testdomain="housing",
            min_freq=1,
            mincoverage=1,
            top_k=np.infty,
            nl_mode="bert-base-uncased",
            fullsimplify=False,
            add_domain_start=True,
            useall=False):
    def tokenize_and_add_start(t, _domain):
        tokens = tree_to_lisp_tokens(t)
        starttok = f"@START/{_domain}@" if add_domain_start else "@START@"
        tokens = [starttok] + tokens
        return tokens

    allex = []
    for traindomain in traindomains:
        ds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE, validfrac=.10)\
            .load(domain=traindomain)
        allex += ds[(None, None, lambda x: x in ("train", "valid"))]\
            .map(lambda x: (x[0], x[1], x[2], traindomain)).examples   # don't use test examples

    testds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE)\
        .load(domain=testdomain)
    if useall:
        print("using all training examples")
        sortedexamples = testds[(None, None, "train")].examples
    else:
        sortedexamples = get_maximum_spanning_examples(
            testds[(None, None, "train")].examples,
            mincoverage=mincoverage,
            loadedex=[e for e in allex if e[2] == "train"])

    allex += testds[(
        None, None,
        "valid")].map(lambda x: (x[0], x[1], "ftvalid", testdomain)).examples
    allex += testds[(
        None, None,
        "test")].map(lambda x: (x[0], x[1], x[2], testdomain)).examples
    allex += [(ex[0], ex[1], "fttrain", testdomain) for ex in sortedexamples]

    _ds = Dataset(allex)
    ds = _ds.map(lambda x:
                 (x[0], tokenize_and_add_start(x[1], x[3]), x[2], x[3]))

    et = get_lf_abstract_transform(ds[lambda x: x[3] != testdomain].examples)
    ds = ds.map(lambda x: (x[0], et(x[1]), x[1], x[2], x[3]))

    seqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID)
    absseqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID)
    absseqenc = SequenceEncoder(vocab=seqenc_vocab,
                                tokenizer=lambda x: x,
                                add_start_token=False,
                                add_end_token=True)
    fullseqenc = SequenceEncoder(vocab=absseqenc_vocab,
                                 tokenizer=lambda x: x,
                                 add_start_token=False,
                                 add_end_token=True)
    for example in ds.examples:
        absseqenc.inc_build_vocab(example[1],
                                  seen=example[3] in ("train", "fttrain"))
        fullseqenc.inc_build_vocab(example[2],
                                   seen=example[3] in ("train", "fttrain"))
    absseqenc.finalize_vocab(min_freq=min_freq, top_k=top_k)
    fullseqenc.finalize_vocab(min_freq=min_freq, top_k=top_k)

    nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode)

    def tokenize(x):
        ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0],
               absseqenc.convert(x[1], return_what="tensor"),
               fullseqenc.convert(x[2], return_what="tensor"), x[3], x[0],
               x[1], x[4])
        return ret
    tds, ftds, vds, fvds, xds = ds[(None, None, None, "train", None)].map(tokenize), \
                          ds[(None, None, None, "fttrain", None)].map(tokenize), \
                          ds[(None, None, None, "valid", None)].map(tokenize), \
                          ds[(None, None, None, "ftvalid", None)].map(tokenize), \
                          ds[(None, None, None, "test", None)].map(tokenize)
    return tds, ftds, vds, fvds, xds, nl_tokenizer, fullseqenc, absseqenc
Example #19
    def __init__(self,
                 dim,
                 vocab: Vocab = None,
                 inpvocab: Vocab = None,
                 numlayers: int = 6,
                 numheads: int = 6,
                 userelpos=False,
                 useabspos=True,
                 relposmode="basic",
                 relposrng=10,
                 dropout: float = 0.,
                 sidedrop=0.,
                 maxpos=512,
                 bertname="bert-base-uncased",
                 mode="normal",
                 priorweight=0.,
                 **kw):
        super(SetModel, self).__init__(**kw)
        self.vocab = vocab
        self.inpvocab = inpvocab
        self.vocabsize = vocab.number_of_ids()
        self.dim = dim
        self.userelpos = userelpos
        self.relposrng = relposrng
        self.useabspos = useabspos

        self.out = torch.nn.Linear(self.dim, self.vocabsize)
        self.bertname = bertname
        if self.bertname.startswith("none") or self.bertname == "vanilla":
            self.encrelposemb = None
            if self.userelpos is True:
                if relposmode == "basic":
                    self.encrelposemb = BasicRelPosEmb(self.dim, relposrng)
                # elif relposmode == "mod":
                #     self.relposemb = ModRelPosEmb(self.dim, relposrng, levels=4)
                else:
                    raise Exception(f"Unrecognized relposmode '{relposmode}'")
            bname = "bert" + self.bertname[4:]
            if self.bertname == "vanilla":
                inpvocabsize = inpvocab.number_of_ids()
            else:
                tokenizer = AutoTokenizer.from_pretrained(bname)
                inpvocabsize = tokenizer.vocab_size
            encconfig = TransformerConfig(vocab_size=inpvocabsize,
                                          d_model=self.dim,
                                          d_ff=self.dim * 4,
                                          d_kv=int(self.dim / numheads),
                                          attention_dropout_rate=0.,
                                          num_layers=numlayers,
                                          num_heads=numheads,
                                          dropout_rate=dropout,
                                          sideways_dropout=sidedrop,
                                          vib_att=mode.replace(" ",
                                                               "") == "vibatt")
            encemb = TransformerEmbeddings(encconfig.vocab_size,
                                           encconfig.d_model,
                                           dropout=dropout,
                                           max_position_embeddings=maxpos,
                                           useabspos=useabspos)
            self.encoder_model = TransformerStack(encconfig,
                                                  encemb,
                                                  rel_emb=self.encrelposemb)
        else:
            self.encoder_model = BertModel.from_pretrained(
                self.bertname,
                hidden_dropout_prob=min(dropout, 0.2),
                attention_probs_dropout_prob=min(dropout, 0.1))
        self.adapter = None
        if self.encoder_model.config.hidden_size != self.dim:
            self.adapter = torch.nn.Linear(
                self.encoder_model.config.hidden_size, self.dim, bias=False)

        self.reset_parameters()

        self.bce = torch.nn.BCEWithLogitsLoss(reduction="none")

        self.mode = mode
        self.priorweight = priorweight

        if self.mode == "vib":
            self.vib_lin_mu = torch.nn.Linear(dim, dim)
            self.vib_lin_logvar = torch.nn.Linear(dim, dim)
Example #20
def load_ds(domain="restaurants",
            nl_mode="bert-base-uncased",
            trainonvalid=False,
            noreorder=False,
            numbered=False):
    """
    Creates a dataset of examples which have
    * NL question and tensor
    * original FL tree
    * reduced FL tree with slots (this is randomly generated)
    * tensor corresponding to reduced FL tree with slots
    * mask specifying which elements in reduced FL tree are terminated
    * 2D gold that specifies whether a token/action is in gold for every position (compatibility with MML!)
    """
    # orderless = {"op:and", "SW:concat"}     # only use in eval!!
    orderless = ORDERLESS

    ds = OvernightDatasetLoader(simplify_mode="none").load(
        domain=domain, trainonvalid=trainonvalid)
    # ds contains 3-tuples of (input, output tree, split name)

    if not noreorder:
        ds = ds.map(lambda x:
                    (x[0], reorder_tree(x[1], orderless=orderless), x[2]))
    ds = ds.map(lambda x: (x[0], tree_to_seq(x[1]), x[2]))

    if numbered:
        ds = ds.map(lambda x: (x[0], make_numbered_tokens(x[1]), x[2]))

    vocab = Vocab(padid=0, startid=2, endid=3, unkid=1)
    vocab.add_token("@BOS@", seen=np.infty)
    vocab.add_token("@EOS@", seen=np.infty)
    vocab.add_token("@STOP@", seen=np.infty)

    nl_tokenizer = BertTokenizer.from_pretrained(nl_mode)

    tds, vds, xds = ds[lambda x: x[2] == "train"], \
                    ds[lambda x: x[2] == "valid"], \
                    ds[lambda x: x[2] == "test"]

    seqenc = SequenceEncoder(vocab=vocab,
                             tokenizer=lambda x: x,
                             add_start_token=False,
                             add_end_token=False)
    for example in tds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=True)
    for example in vds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=False)
    for example in xds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=False)
    seqenc.finalize_vocab(min_freq=0)

    def mapper(x):
        seq = seqenc.convert(x[1], return_what="tensor")
        ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], seq)
        return ret

    tds_seq = tds.map(mapper)
    vds_seq = vds.map(mapper)
    xds_seq = xds.map(mapper)
    return tds_seq, vds_seq, xds_seq, nl_tokenizer, seqenc, orderless
Example #21
    def __init__(self,
                 dim,
                 vocab: Vocab = None,
                 numlayers: int = 6,
                 numheads: int = 6,
                 userelpos=False,
                 useabspos=True,
                 relposmode="basic",
                 relposrng=10,
                 dropout: float = 0.,
                 maxpos=512,
                 weightmode="vanilla",
                 **kw):
        super(TransformerEncoder, self).__init__(**kw)
        self.vocab = vocab
        self.vocabsize = vocab.number_of_ids()
        self.dim = dim
        self.userelpos = userelpos
        self.relposrng = relposrng
        self.useabspos = useabspos

        self.weightmode = weightmode
        if self.weightmode.startswith("none") or self.weightmode == "vanilla":
            self.encrelposemb = None
            if self.userelpos is True:
                if relposmode == "basic":
                    self.encrelposemb = BasicRelPosEmb(self.dim, relposrng)
                # elif relposmode == "mod":
                #     self.relposemb = ModRelPosEmb(self.dim, relposrng, levels=4)
                else:
                    raise Exception(f"Unrecognized relposmode '{relposmode}'")
            bname = "bert" + self.weightmode[4:]
            if self.weightmode == "vanilla":
                inpvocabsize = self.vocabsize
            else:
                tokenizer = AutoTokenizer.from_pretrained(bname)
                inpvocabsize = tokenizer.vocab_size
            config = TransformerConfig(vocab_size=inpvocabsize,
                                       d_model=self.dim,
                                       d_ff=self.dim * 4,
                                       d_kv=int(self.dim / numheads),
                                       num_layers=numlayers,
                                       num_heads=numheads,
                                       dropout_rate=dropout)
            encemb = TransformerEmbeddings(config.vocab_size,
                                           config.d_model,
                                           dropout=dropout,
                                           max_position_embeddings=maxpos,
                                           useabspos=useabspos)
            self.encoder_model = TransformerStack(config,
                                                  encemb,
                                                  rel_emb=self.encrelposemb)
        else:
            self.encoder_model = BertModel.from_pretrained(
                self.weightmode,
                hidden_dropout_prob=min(dropout, 0.2),
                attention_probs_dropout_prob=min(dropout, 0.1))
        self.adapter = None
        if self.encoder_model.config.hidden_size != self.dim:
            self.adapter = torch.nn.Linear(
                self.encoder_model.config.hidden_size, self.dim, bias=False)

        self.reset_parameters()
Example #22
    def __init__(self,
                 dim,
                 vocab: Vocab = None,
                 inpvocab: Vocab = None,
                 numlayers: int = 6,
                 numheads: int = 6,
                 userelpos=False,
                 useabspos=True,
                 relposmode="basic",
                 relposrng=10,
                 mode="normal",
                 dropout: float = 0.,
                 worddropout: float = 0.,
                 maxpos=512,
                 bertname="bert-base-uncased",
                 **kw):
        super(TransformerDecoderCell, self).__init__(**kw)
        self.vocab = vocab
        self.inpvocab = inpvocab
        self.vocabsize = vocab.number_of_ids()
        self.dim = dim
        self.userelpos = userelpos
        self.relposrng = relposrng
        self.useabspos = useabspos
        self.mode = mode

        decconfig = TransformerConfig(vocab_size=self.vocabsize,
                                      d_model=self.dim,
                                      d_ff=self.dim * 4,
                                      d_kv=int(self.dim / numheads),
                                      num_layers=numlayers,
                                      num_heads=numheads,
                                      dropout_rate=dropout)

        self.dec_emb = torch.nn.Embedding(self.vocabsize, decconfig.d_model)
        self.slot_emb = torch.nn.Embedding(1, decconfig.d_model)

        self.relposemb = None
        if self.userelpos is True:
            if relposmode == "basic":
                self.relposemb = BasicRelPosEmb(self.dim, relposrng)
            # elif relposmode == "mod":
            #     self.relposemb = ModRelPosEmb(self.dim, relposrng, levels=4)
            else:
                raise Exception(f"Unrecognized relposmode '{relposmode}'")

        self.absposemb = None
        if self.relposemb is None or self.useabspos is True:
            self.absposemb = torch.nn.Embedding(maxpos, decconfig.d_model)

        decoder_config = deepcopy(decconfig)
        decoder_config.is_decoder = True
        decoder_config.use_causal_mask = True
        self.decoder = TransformerStackDecoder(decoder_config,
                                               rel_emb=self.relposemb)

        self.out = torch.nn.Linear(self.dim, self.vocabsize)

        vocab_mask = torch.ones(self.vocabsize)
        # for excl_token in self.exclude:
        #     if excl_token in self.vocab:
        #         vocab_mask[self.vocab[excl_token]] = 0
        self.register_buffer("vocab_mask", vocab_mask)

        self.bertname = bertname
        self.encrelposemb = None
        if self.bertname.startswith("none") or self.bertname == "vanilla":
            if self.userelpos is True:
                if relposmode == "basic":
                    self.encrelposemb = BasicRelPosEmb(self.dim, relposrng)
                # elif relposmode == "mod":
                #     self.relposemb = ModRelPosEmb(self.dim, relposrng, levels=4)
                else:
                    raise Exception(f"Unrecognized relposmode '{relposmode}'")
            bname = "bert" + self.bertname[4:]
            if self.bertname == "vanilla":
                inpvocabsize = inpvocab.number_of_ids()
                self.inpworddropout = WordDropout(
                    worddropout, self.inpvocab[self.inpvocab.masktoken],
                    [self.inpvocab[self.inpvocab.padtoken]])
            else:
                tokenizer = AutoTokenizer.from_pretrained(bname)
                inpvocabsize = tokenizer.vocab_size
                self.inpworddropout = WordDropout(
                    worddropout, self.inpvocab[self.inpvocab.masktoken], [
                        self.inpvocab["[CLS]"], self.inpvocab["[SEP]"],
                        self.inpvocab[self.inpvocab.padtoken]
                    ])
            encconfig = TransformerConfig(vocab_size=inpvocabsize,
                                          d_model=self.dim,
                                          d_ff=self.dim * 4,
                                          d_kv=int(self.dim / numheads),
                                          num_layers=numlayers,
                                          num_heads=numheads,
                                          dropout_rate=dropout)
            encemb = TransformerEmbeddings(encconfig.vocab_size,
                                           encconfig.d_model,
                                           dropout=dropout,
                                           max_position_embeddings=maxpos,
                                           useabspos=useabspos)
            self.encoder_model = TransformerStack(encconfig,
                                                  encemb,
                                                  rel_emb=self.encrelposemb)
        else:
            self.encoder_model = BertModel.from_pretrained(
                self.bertname,
                hidden_dropout_prob=min(dropout, 0.2),
                attention_probs_dropout_prob=min(dropout, 0.1))
            tokenizer = AutoTokenizer.from_pretrained(self.bertname)
            inpvocabsize = tokenizer.vocab_size
            self.inpvocab = Vocab()
            for tok, id in tokenizer.vocab.items():
                self.inpvocab.D[tok] = id
            self.inpvocab.masktoken = "[MASK]"
            self.inpvocab.unktoken = "[UNK]"
            self.inpvocab.padtoken = "[PAD]"
            self.inpworddropout = WordDropout(
                worddropout, self.inpvocab[self.inpvocab.masktoken], [
                    self.inpvocab["[CLS]"], self.inpvocab["[SEP]"],
                    self.inpvocab[self.inpvocab.padtoken]
                ])

        self.adapter = None
        if self.encoder_model.config.hidden_size != decoder_config.d_model:
            self.adapter = torch.nn.Linear(
                self.encoder_model.config.hidden_size,
                decoder_config.d_model,
                bias=False)

        self.worddropout = WordDropout(worddropout,
                                       self.vocab[self.vocab.masktoken],
                                       [self.vocab[self.vocab.padtoken]])

        self.reset_parameters()
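
WordDropout above is constructed with a dropout probability, the mask-token id to substitute, and a list of ids to leave untouched. Below is an illustrative stand-in with the same idea, not the project's WordDropout class.

import torch

def word_dropout(x, p, mask_id, exclude_ids):
    # replace token ids with mask_id with probability p, but never the excluded ids
    drop = torch.rand(x.shape) < p
    for ex in exclude_ids:
        drop = drop & (x != ex)
    return torch.where(drop, torch.full_like(x, mask_id), x)

x = torch.tensor([[5, 6, 7, 0, 0]])          # 0 = pad
print(word_dropout(x, p=0.5, mask_id=1, exclude_ids=[0]))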
Example #23
def load_ds(dataset="scan/random",
            validfrac=0.1,
            recompute=False,
            bertname="bert-base-uncased"):
    tt = q.ticktock("data")
    tt.tick(f"loading '{dataset}'")
    if bertname.startswith("none"):
        bertname = "bert" + bertname[4:]
    if dataset.startswith("cfq/") or dataset.startswith("scan/mcd"):
        key = f"{dataset}|bertname={bertname}"
        print(f"validfrac is ineffective with dataset '{dataset}'")
    else:
        key = f"{dataset}|validfrac={validfrac}|bertname={bertname}"

    shelfname = os.path.basename(__file__) + ".cache.shelve"
    if not recompute:
        tt.tick(f"loading from shelf (key '{key}')")
        with shelve.open(shelfname) as shelf:
            if key not in shelf:
                recompute = True
                tt.tock("couldn't load from shelf")
            else:
                shelved = shelf[key]
                trainex, validex, testex, fldic = shelved["trainex"], shelved[
                    "validex"], shelved["testex"], shelved["fldic"]
                inpdic = shelved["inpdic"] if "inpdic" in shelved else None
                trainds, validds, testds = Dataset(trainex), Dataset(
                    validex), Dataset(testex)
                tt.tock("loaded from shelf")

    if recompute:
        tt.tick("loading data")
        splits = dataset.split("/")
        dataset, splits = splits[0], splits[1:]
        split = "/".join(splits)
        if dataset == "scan":
            ds = SCANDatasetLoader().load(split, validfrac=validfrac)
        elif dataset == "cfq":
            ds = CFQDatasetLoader().load(split + "/modent")
        else:
            raise Exception(f"Unknown dataset: '{dataset}'")
        tt.tock("loaded data")

        tt.tick("creating tokenizer")
        tokenizer = Tokenizer(bertname=bertname)
        tt.tock("created tokenizer")

        print(len(ds))

        tt.tick("dictionaries")
        inpdic = Vocab()
        inplens, outlens = [0], []
        fldic = Vocab()
        for x in ds:
            outtoks = tokenizer.get_out_toks(x[1])
            outlens.append(len(outtoks))
            for tok in outtoks:
                fldic.add_token(tok, seen=x[2] == "train")
            inptoks = tokenizer.get_toks(x[0])
            inplens.append(len(inptoks))
            for tok in inptoks:
                inpdic.add_token(tok, seen=x[2] == "train")
        inpdic.finalize(min_freq=0, top_k=np.infty)
        fldic.finalize(min_freq=0, top_k=np.infty)
        print(
            f"input avg/max length is {np.mean(inplens):.1f}/{max(inplens)}, output avg/max length is {np.mean(outlens):.1f}/{max(outlens)}"
        )
        print(
            f"vocabulary sizes: {len(fldic.D)} at output, {len(inpdic.D)} at input"
        )
        tt.tock()

        tt.tick("tensorizing")
        tokenizer.inpvocab = inpdic
        tokenizer.outvocab = fldic
        trainds = ds.filter(lambda x: x[-1] == "train").map(
            lambda x: x[:-1]).map(
                lambda x: tokenizer.tokenize(x[0], x[1])).cache(True)
        validds = ds.filter(lambda x: x[-1] == "valid").map(
            lambda x: x[:-1]).map(
                lambda x: tokenizer.tokenize(x[0], x[1])).cache(True)
        testds = ds.filter(lambda x: x[-1] == "test").map(
            lambda x: x[:-1]).map(
                lambda x: tokenizer.tokenize(x[0], x[1])).cache(True)
        # ds = ds.map(lambda x: tokenizer.tokenize(x[0], x[1]) + (x[2],)).cache(True)
        tt.tock("tensorized")

        tt.tick("shelving")
        with shelve.open(shelfname) as shelf:
            shelved = {
                "trainex": trainds.examples,
                "validex": validds.examples,
                "testex": testds.examples,
                "fldic": fldic,
                "inpdic": inpdic,
            }
            shelf[key] = shelved
        tt.tock("shelved")

    tt.tock(f"loaded '{dataset}'")
    tt.msg(
        f"#train={len(trainds)}, #valid={len(validds)}, #test={len(testds)}")

    tt.msg("Overlap of validation with train:")
    overlaps = compute_overlaps(trainds, validds)
    print(json.dumps(overlaps, indent=4))

    tt.msg("Overlap of test with train:")
    overlaps = compute_overlaps(trainds, testds)
    print(json.dumps(overlaps, indent=4))

    return trainds, validds, testds, fldic, inpdic
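
The shelve logic above caches the tokenized splits under a key derived from the dataset settings so repeated runs skip recomputation. A minimal generic sketch of the same pattern follows; the key and shelf name here are made up for illustration.

import shelve

def cached(key, compute, shelfname="demo.cache.shelve", recompute=False):
    with shelve.open(shelfname) as shelf:
        if not recompute and key in shelf:
            return shelf[key]          # cache hit: load the pickled result
        result = compute()             # cache miss: compute and store
        shelf[key] = result
        return result

data = cached("scan/random|bertname=bert-base-uncased", lambda: list(range(5)))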
Example #24
    def __init__(self,
                 dim,
                 vocab: Vocab = None,
                 inpvocab: Vocab = None,
                 numlayers: int = 2,
                 numtmlayers=6,
                 mode="normal",
                 dropout: float = 0.,
                 worddropout: float = 0.,
                 numheads=6,
                 noencoder=False,
                 **kw):
        super(DecoderCell, self).__init__(**kw)
        self.vocab = vocab
        self.inpvocab = inpvocab
        self.vocabsize = vocab.number_of_ids()
        self.dim = dim
        self.mode = mode
        self.noencoder = noencoder
        self.numlayers = numlayers
        self.numtmlayers = numtmlayers

        self.dec_emb = torch.nn.Embedding(self.vocabsize + 3, self.dim)
        dims = [self.dim + self.dim] + [self.dim for _ in range(numlayers)]
        self.dec_stack = torch.nn.ModuleList(
            [torch.nn.GRUCell(dims[i], dims[i + 1]) for i in range(numlayers)])
        self.dropout = torch.nn.Dropout(dropout)
        self.attn_linQ = None
        self.attn_linK = None
        self.attn_linV = None
        # self.attn_linQ = torch.nn.Linear(self.dim, self.dim)
        # self.attn_linK = torch.nn.Linear(self.dim, self.dim)
        # self.attn_linV = torch.nn.Linear(self.dim, self.dim)

        self.preout = torch.nn.Linear(self.dim + self.dim, self.dim)
        self.preoutnonlin = torch.nn.CELU()
        if self.mode == "cont":
            pass
        else:
            self.out = torch.nn.Linear(self.dim, self.vocabsize + 3)

        inpvocabsize = inpvocab.number_of_ids()
        if not self.noencoder:
            encconfig = TransformerConfig(vocab_size=inpvocabsize,
                                          d_model=self.dim,
                                          d_ff=self.dim * 4,
                                          d_kv=int(self.dim / numheads),
                                          num_layers=self.numtmlayers,
                                          num_heads=numheads,
                                          dropout_rate=dropout)
            encemb = TransformerEmbeddings(encconfig.vocab_size,
                                           encconfig.d_model,
                                           dropout=dropout,
                                           max_position_embeddings=1000,
                                           useabspos=True)
            self.encoder_model = TransformerStack(encconfig, encemb)
            # self.encoder_model = Encoder(inpvocabsize+5, self.dim, int(self.dim/2), num_layers=numlayers, dropout=dropout)

        self.adapter = None
        self.inpworddropout = WordDropout(
            worddropout, self.inpvocab[self.inpvocab.masktoken],
            [self.inpvocab[self.inpvocab.padtoken]])
        self.worddropout = WordDropout(worddropout,
                                       self.vocab[self.vocab.masktoken],
                                       [self.vocab[self.vocab.padtoken]])

        self.lenlin = torch.nn.Linear(self.dim * 2, self.dim)
        self.lennonlin = torch.nn.CELU()
        self.lenbias = torch.nn.Linear(self.dim, 1)
        self.lenscale = torch.nn.Linear(self.dim, 1)

        self.reset_parameters()