def test_beam(self):
    x = [
        "( and ( got the walk ) ( got the talk ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
        "( and ( got the walk ) ( got talk the ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
        "( and ( got the walk ) ( got the walk ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
        "( and ( got the talk ) ( got the walk ) ( and ( got thatsmile ) ( got thatstyle ) ) )",
        "( too_bad ( she ( has ( a penis ) ) ) )",
    ]
    D = Vocab()
    for xe in x:
        for xes in xe.split():
            D.add_token(xes, seen=True)
    print(D.D)
    acc = TreeAccuracy(tensor2tree=partial(tensor2tree, D=D), orderless={"and"})
    x = [[D[xes] for xes in xe.split()] for xe in x]
    # equalize dims by right-padding all sequences to the longest one
    maxlen = max(len(xe) for xe in x)
    x = [xe + [0] * (maxlen - len(xe)) for xe in x]
    x = torch.tensor(x)
    print(x)
    # beam order (1, 4, 2, 3, 0): the first candidate matching the gold
    # (up to reordering of "and" children) sits at rank 4
    a = acc(None, x[torch.tensor([1, 4, 2, 3, 0])][None, :, :], x[0:1])
    print(a)
    self.assertTrue(a["tree_acc"] == 0)
    self.assertTrue(a["tree_acc_at1"] == 0)
    self.assertTrue(a["tree_acc_at2"] == 0)
    self.assertTrue(a["tree_acc_at3"] == 0)
    self.assertTrue(a["tree_acc_at4"] == 1)
    self.assertTrue(a["tree_acc_at5"] == 1)
    self.assertTrue(a["tree_acc_at_last"] == 1)
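# Worked example (hedged): the beam test above relies on an accuracy-at-k
# convention where acc@k is 1 iff any of the top-k candidates matches the
# gold tree (here, up to reordering of "and" children). With the beam order
# (1, 4, 2, 3, 0), the first match appears at rank 4, so acc@1..3 are 0 and
# acc@4, acc@5 and acc@last are 1. The helper below is a hypothetical
# illustration, not the TreeAccuracy API.
def _acc_at_k(beam_is_correct, k):
    return 1 if any(beam_is_correct[:k]) else 0

_beam = [False, False, False, True, True]   # match flags, best-first
assert _acc_at_k(_beam, 3) == 0
assert _acc_at_k(_beam, 4) == 1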
def build_vocab_from_pcfg(pcfg, min_freq=0, top_k=np.infty) -> Vocab:
    vocab = Vocab()
    vocab.add_token("(")
    vocab.add_token(")")
    for rule in pcfg.productions():
        vocab.add_token(str(rule.lhs()))
        for rhse in rule.rhs():
            vocab.add_token(str(rhse))
    vocab.finalize(min_freq=min_freq, top_k=top_k)
    return vocab
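# Hedged usage sketch for build_vocab_from_pcfg, using a toy NLTK grammar.
# nltk's PCFG.fromstring, productions(), lhs() and rhs() are standard; the
# grammar itself is purely illustrative.
from nltk import PCFG

toy_pcfg = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> 'john' [0.5] | 'mary' [0.5]
    VP -> 'runs' [1.0]
""")
toy_vocab = build_vocab_from_pcfg(toy_pcfg)
# toy_vocab now contains "(", ")", "S", "NP", "VP", "john", "mary" and "runs"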
def tensor_to_trees(x, vocab: Vocab):
    xstrs = [vocab.tostr(x[i]).replace("@START@", "") for i in range(len(x))]
    xstrs = [re.sub(r"::\d+", "", xstr) for xstr in xstrs]
    trees = []
    for xstr in xstrs:
        # drop everything after @END@, if present
        xstr = xstr.split("@END@")[0]
        # add an opening parenthesis if it is missing
        xstr = xstr.strip()
        if len(xstr) == 0 or xstr[0] != "(":
            xstr = "(" + xstr
        # balance parentheses
        paren_imbalance = xstr.count("(") - xstr.count(")")
        xstr = xstr + ")" * max(0, paren_imbalance)    # append missing closing parentheses
        xstr = "(" * -min(0, paren_imbalance) + xstr   # prepend missing opening parentheses
        try:
            tree = taglisp_to_tree(xstr)
            if isinstance(tree, tuple) and len(tree) == 2 and tree[0] is None:
                tree = None
        except Exception:
            tree = None
        trees.append(tree)
    return trees
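# Standalone illustration (hedged) of the repair logic above: a truncated
# decode gets its parentheses rebalanced before parsing is attempted.
pred = "( and ( got the walk"                      # 2 "(" vs 0 ")"
imbalance = pred.count("(") - pred.count(")")
pred = pred + ")" * max(0, imbalance)              # append missing ")"
pred = "(" * -min(0, imbalance) + pred             # prepend missing "("
assert pred == "( and ( got the walk))"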
def load_ds(domain="restaurants", min_freq=0, top_k=np.infty, nl_mode="bart-large", trainonvalid=False): ds = OvernightDatasetLoader(simplify_mode="light").load( domain=domain, trainonvalid=trainonvalid) seqenc_vocab = Vocab(padid=1, startid=0, endid=2, unkid=UNKID) seqenc = SequenceEncoder(vocab=seqenc_vocab, tokenizer=tree_to_lisp_tokens, add_start_token=True, add_end_token=True) for example in ds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=example[2] == "train") seqenc.finalize_vocab(min_freq=min_freq, top_k=top_k) nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode) def tokenize(x): ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], seqenc.convert(x[1], return_what="tensor"), x[2], x[0], x[1]) return ret tds, vds, xds = ds[(None, None, "train")].map(tokenize), \ ds[(None, None, "valid")].map(tokenize), \ ds[(None, None, "test")].map(tokenize) return tds, vds, xds, nl_tokenizer, seqenc
def __init__(self, dim, vocab: Vocab = None, numlayers: int = 6, numheads: int = 6,
             dropout: float = 0., maxpos=512, bertname="bert-base-uncased", **kw):
    super(TransformerTagger, self).__init__(**kw)
    self.vocab = vocab
    self.vocabsize = vocab.number_of_ids()
    self.dim = dim
    config = TransformerConfig(vocab_size=self.vocabsize, d_model=self.dim,
                               d_ff=self.dim * 4, num_layers=numlayers,
                               num_heads=numheads, dropout_rate=dropout)

    decoder_config = deepcopy(config)
    decoder_config.is_decoder = True
    self.decoder = RelativePositionTransformer(decoder_config)

    self.out = torch.nn.Linear(self.dim, self.vocabsize)

    vocab_mask = torch.ones(self.vocabsize)
    for excl_token in self.exclude:
        if excl_token in self.vocab:
            vocab_mask[self.vocab[excl_token]] = 0
    self.register_buffer("vocab_mask", vocab_mask)

    self.bertname = bertname
    self.bert_model = BertModel.from_pretrained(self.bertname)

    def set_dropout(m: torch.nn.Module):
        if isinstance(m, torch.nn.Dropout):
            m.p = dropout
    self.bert_model.apply(set_dropout)

    self.adapter = None
    if self.bert_model.config.hidden_size != decoder_config.d_model:
        self.adapter = torch.nn.Linear(self.bert_model.config.hidden_size,
                                       decoder_config.d_model, bias=False)

    self.reset_parameters()
def build_copy_maps(self, inp_vocab: Vocab, str_action_re=re.compile(r"^([^_].*)$")):
    self.inp_vocab = inp_vocab
    self.register_buffer("_inp_to_act",
                         torch.zeros(inp_vocab.number_of_ids(), dtype=torch.long))
    self.register_buffer("_act_to_inp",
                         torch.zeros(self.out_vocab.number_of_ids(), dtype=torch.long))

    # for COPY, initialize mapping from input node vocab (sgb.vocab)
    # to output action vocab (qgb.vocab_actions)
    self._build_copy_maps(str_action_re=str_action_re)

    # compute action mask from input: actions that are doable using
    # input copy actions are 1, others are 0
    actmask = torch.zeros(self.out_vocab.number_of_ids(), dtype=torch.uint8)
    actmask.index_fill_(0, self._inp_to_act, 1)
    actmask[0] = 0
    self.register_buffer("_inp_actmask", actmask)

    # rare actions
    self.rare_token_ids = self.out_vocab.rare_ids
    self.register_buffer("gen_mask", None)
    if len(self.rare_token_ids) > 0:
        gen_mask = torch.ones(self.out_vocab.number_of_ids())
        for rare_token_id in self.rare_token_ids:
            gen_mask[rare_token_id] = 0
        self.register_buffer("gen_mask", gen_mask)
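# Illustration (hedged) of what the copy maps encode: for a token shared by
# both vocabularies, _inp_to_act maps its input id to the id of the matching
# output action, so a COPY over an input position selects that action.
# Plain dicts stand in for the Vocab objects here.
inp_ids = {"@PAD@": 0, "paris": 7, "city": 8}
act_ids = {"@PAD@": 0, "paris": 3, "city": 5}
inp_to_act = {inp_ids[t]: act_ids[t] for t in inp_ids if t in act_ids}
assert inp_to_act[7] == 3    # copying input "paris" emits output action "paris"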
def __init__(self, dim, vocab: Vocab = None, inpvocab: Vocab = None,
             numlayers: int = 6, mode="normal",
             dropout: float = 0., worddropout: float = 0., **kw):
    super(GRUDecoderCell, self).__init__(**kw)
    self.vocab = vocab
    self.inpvocab = inpvocab
    self.vocabsize = vocab.number_of_ids()
    self.dim = dim
    self.mode = mode

    self.dec_emb = torch.nn.Embedding(self.vocabsize + 3, self.dim)
    dims = [self.dim + self.dim] + [self.dim for _ in range(numlayers)]
    self.dec_stack = torch.nn.ModuleList(
        [torch.nn.GRUCell(dims[i], dims[i + 1]) for i in range(numlayers)])
    self.dropout = torch.nn.Dropout(dropout)

    self.attn_linQ = None
    self.attn_linK = None
    self.attn_linV = None
    # self.attn_linQ = torch.nn.Linear(self.dim, self.dim)
    # self.attn_linK = torch.nn.Linear(self.dim, self.dim)
    # self.attn_linV = torch.nn.Linear(self.dim, self.dim)

    self.preout = torch.nn.Linear(self.dim + self.dim, self.dim)
    self.out = torch.nn.Linear(self.dim, self.vocabsize + 3)

    inpvocabsize = inpvocab.number_of_ids()
    self.encoder_model = Encoder(inpvocabsize + 5, self.dim, int(self.dim / 2),
                                 num_layers=numlayers, dropout=dropout)
    self.adapter = None

    self.inpworddropout = WordDropout(worddropout,
                                      self.inpvocab[self.inpvocab.masktoken],
                                      [self.inpvocab[self.inpvocab.padtoken]])
    self.worddropout = WordDropout(worddropout,
                                   self.vocab[self.vocab.masktoken],
                                   [self.vocab[self.vocab.padtoken]])

    self.reset_parameters()
def _initialize(self, p, xlmr, min_freq: int):
    self.data = {}
    self.xlmr = xlmr
    self.xlmr_vocab = Vocab()
    self.xlmr_vocab.set_dict(xlmr.model.decoder.dictionary.indices)
    self.sentence_encoder = SequenceEncoder(
        lambda x: f"<s> {xlmr.bpe.encode(x)} </s>".split(),
        vocab=self.xlmr_vocab)

    trainlines = [x for x in ujson.load(
        open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))]
    testlines = [x for x in ujson.load(
        open(os.path.join(p, f"geo-{self.test_lang}.json"), "r"))]
    trainlines = [x for x in trainlines if x["split"] == "train"]
    testlines = [x for x in testlines if x["split"] == "test"]

    if self.cvfolds is None:
        splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
    else:
        cvsplit_len = len(trainlines) / self.cvfolds
        splits = []
        for i in range(0, self.cvfolds):
            splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
        random.shuffle(splits)
        splits = ["valid" if x == self.testfold else "train" for x in splits]
        splits = splits + ["test"] * len(testlines)

    questions = [x["nl"] for x in trainlines]
    queries = [x["mrl"] for x in trainlines]
    xquestions = [x["nl"] for x in testlines]
    xqueries = [x["mrl"] for x in testlines]
    questions += xquestions
    queries += xqueries

    # initialize output vocabulary
    outvocab = Vocab()
    # for token, bertid in self.xlmr_vocab.D.items():
    #     outvocab.add_token(token, seen=False)

    self.query_encoder = SequenceEncoder(
        tokenizer=partial(basic_query_tokenizer,
                          strtok=lambda x: xlmr.bpe.encode(x).split()),
        vocab=outvocab, add_end_token=True)

    # build vocabularies
    for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
        question_tokens = self.sentence_encoder.convert(question, return_what="tokens")[0]
        for token in question_tokens:
            self.query_encoder.vocab.add_token(token, seen=False)
        self.query_encoder.inc_build_vocab(query, seen=split == "train")
    keeptokens = set(self.xlmr_vocab.D.keys())
    self.query_encoder.finalize_vocab(min_freq=min_freq, keep_tokens=keeptokens)

    token_specs = self.build_token_specs(queries)
    self.token_specs = token_specs

    self.build_data(questions, queries, splits)
def __init__(self, dim, vocab: Vocab = None, numlayers: int = 6, numheads: int = 6,
             dropout: float = 0., maxpos=512, bertname="bert-base-uncased",
             baseline=False, **kw):
    super(TransformerTagger, self).__init__(**kw)
    self.vocab = vocab
    self.vocabsize = vocab.number_of_ids()
    self.dim = dim
    self.baseline = baseline
    config = TransformerConfig(vocab_size=self.vocabsize, d_model=self.dim,
                               d_ff=self.dim * 4, num_layers=numlayers,
                               num_heads=numheads, dropout_rate=dropout,
                               use_relative_position=False)

    self.emb = torch.nn.Embedding(config.vocab_size, config.d_model)
    self.posemb = torch.nn.Embedding(maxpos, config.d_model)

    decoder_config = deepcopy(config)
    decoder_config.is_decoder = True
    decoder_config.use_causal_mask = baseline
    self.decoder = TransformerStack(decoder_config)

    if baseline:
        self.out = torch.nn.Linear(self.dim, self.vocabsize)
    else:
        self.out = torch.nn.Linear(self.dim * 2, self.vocabsize)
    # self.out = MOS(self.dim, self.vocabsize, K=mosk)

    vocab_mask = torch.ones(self.vocabsize)
    # for excl_token in self.exclude:
    #     if excl_token in self.vocab:
    #         vocab_mask[self.vocab[excl_token]] = 0
    self.register_buffer("vocab_mask", vocab_mask)

    self.bertname = bertname
    self.bert_model = BertModel.from_pretrained(self.bertname)
    # def set_dropout(m: torch.nn.Module):
    #     if isinstance(m, torch.nn.Dropout):
    #         m.p = dropout
    # self.bert_model.apply(set_dropout)

    self.adapter = None
    if self.bert_model.config.hidden_size != decoder_config.d_model:
        self.adapter = torch.nn.Linear(self.bert_model.config.hidden_size,
                                       decoder_config.d_model, bias=False)

    self.reset_parameters()
def __init__(self, h_dim: int, vocab: Vocab = None, **kw):
    super(_PtrGenOutput, self).__init__(**kw)
    # initialize modules
    self.gen_lin = torch.nn.Linear(h_dim, vocab.number_of_ids(), bias=True)
    self.copy_or_gen = torch.nn.Linear(h_dim, 2, bias=True)
    self.sm = torch.nn.Softmax(-1)
    self.logsm = torch.nn.LogSoftmax(-1)
    self.inp_vocab, self.out_vocab = None, vocab
    self.naningrad = torch.nn.Parameter(torch.zeros(1))
    self.naningrad2 = torch.nn.Parameter(torch.zeros(1))
def __init__(self, h_dim: int, inp_vocab: Vocab = None, out_vocab: Vocab = None, **kw):
    super(SumPtrGenOutputOLD, self).__init__(**kw)
    # initialize modules
    self.gen_lin = torch.nn.Linear(h_dim, out_vocab.number_of_ids(), bias=True)
    self.sm = torch.nn.Softmax(-1)
    self.logsm = torch.nn.LogSoftmax(-1)

    self.inp_vocab, self.out_vocab = inp_vocab, out_vocab

    self.register_buffer("_inp_to_act",
                         torch.zeros(self.inp_vocab.number_of_ids(), dtype=torch.long))
    self.register_buffer("_act_from_inp",
                         torch.zeros(out_vocab.number_of_ids(), dtype=torch.long))

    # for COPY, initialize mapping from input node vocab (sgb.vocab)
    # to output action vocab (qgb.vocab_actions)
    self.build_copy_maps()

    # compute action mask from input: actions that are doable using
    # input copy actions are 1, others are 0
    actmask = torch.zeros(out_vocab.number_of_ids(), dtype=torch.uint8)
    actmask.index_fill_(0, self._inp_to_act, 1)
    self.register_buffer("_inp_actmask", actmask)

    # rare actions
    self.rare_token_ids = out_vocab.rare_ids
    rare_id = 1
    if len(self.rare_token_ids) > 0:
        out_map = torch.arange(self.out_vocab.number_of_ids())
        for rare_token_id in self.rare_token_ids:
            out_map[rare_token_id] = rare_id
        self.register_buffer("out_map", out_map)
    else:
        self.register_buffer("out_map", None)
def test_normal(self):
    x = [
        "( and ( has service ) ( has money ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
        "( and ( has service ) ( has service ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
        "( and ( has money ) ( has service ) ( and ( got thatsmile ) ( got thatstyle ) ) )",
    ]
    D = Vocab()
    for xe in x:
        for xes in xe.split():
            D.add_token(xes, seen=True)
    print(D.D)
    acc = TreeAccuracy(tensor2tree=partial(tensor2tree, D=D), orderless={"and"})
    x = [[D[xes] for xes in xe.split()] for xe in x]
    x = torch.tensor(x)
    print(x)
    a = acc(None, x[0:1], x[1:2])
    self.assertEqual(a["tree_acc"], 0)
    print(a)
    a = acc(None, x[0:1], x[2:3])
    self.assertEqual(a["tree_acc"], 1.)
    print(a)
def __init__(self, h_dim: int, vocab: Vocab = None, dropout: float = 0., **kw):
    super(BasicGenOutput, self).__init__(**kw)
    self.gen_lin = torch.nn.Linear(h_dim, vocab.number_of_ids(), bias=True)
    self.sm = torch.nn.Softmax(-1)
    self.logsm = torch.nn.LogSoftmax(-1)
    self.dropout = torch.nn.Dropout(dropout)

    self.vocab = vocab

    # rare output tokens
    self.rare_token_ids = vocab.rare_ids
    if len(self.rare_token_ids) > 0:
        out_mask = torch.ones(self.vocab.number_of_ids())
        for rare_token_id in self.rare_token_ids:
            out_mask[rare_token_id] = 0
        self.register_buffer("out_mask", out_mask)
    else:
        self.register_buffer("out_mask", None)
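# Hedged sketch of how a buffer like out_mask is typically consumed in the
# forward pass: rare-token logits are pushed to -inf before normalization,
# so they get zero probability. This mirrors the buffer registered above
# but is not the module's actual forward code.
import torch
logits = torch.randn(2, 6)
out_mask = torch.tensor([1., 1., 0., 1., 1., 1.])    # token 2 is rare
probs = torch.softmax(logits + torch.log(out_mask), -1)
assert torch.allclose(probs[:, 2], torch.zeros(2))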
def load_ds(traindomains=("restaurants",), testdomain="housing", min_freq=1, mincoverage=1, top_k=np.infty, nl_mode="bert-base-uncased", fullsimplify=False, onlyabstract=False, pretrainsetting="all+lex", # "all", "lex" or "all+lex" finetunesetting="lex", # "lex", "all", "min" ): """ :param traindomains: :param testdomain: :param min_freq: :param mincoverage: :param top_k: :param nl_mode: :param fullsimplify: :param add_domain_start: :param onlyabstract: :param pretrainsetting: "all": use all examples from every domain "lex": use only lexical examples "all+lex": use both :param finetunesetting: "lex": use lexical examples "all": use all training examples "min": use minimal lexicon-covering set of examples ! Test is always over the same original test set. ! Validation is over a fraction of training data :return: """ general_tokens = { "(", ")", "arg:~type", "arg:type", "op:and", "SW:concat", "cond:has", "arg:<=", "arg:<", "arg:>=", "arg:>", "arg:!=", "arg:=", "SW:superlative", "SW:CNT-arg:min", "SW:CNT-arg:<", "SW:CNT-arg:<=", "SW:CNT-arg:>=", "SW:CNT-arg:>", "SW:CNT-arg:max", "SW:CNT-arg:=", "arg:max", } def tokenize_and_add_start(t): tokens = tree_to_lisp_tokens(t) starttok = "@START@" tokens = [starttok] + tokens return tokens sourceex = [] for traindomain in traindomains: ds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE, validfrac=.10)\ .load(domain=traindomain) sourceex += ds[(None, None, lambda x: x in ("train", "valid", "lexicon"))].map(lambda x: (x[0], x[1], x[2], traindomain)).examples # don't use test examples testds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE)\ .load(domain=testdomain) targetex = testds.map(lambda x: x + (testdomain,)).examples pretrainex = [] if "all" in pretrainsetting.split("+"): pretrainex += [(a, tokenize_and_add_start(b), "pretrain", d) for a, b, c, d in sourceex if c == "train"] if "lex" in pretrainsetting.split("+"): pretrainex += [(a, tokenize_and_add_start(b), "pretrain", d) for a, b, c, d in sourceex if c == "lexicon"] pretrainvalidex = [(a, tokenize_and_add_start(b), "pretrainvalid", d) for a, b, c, d in sourceex if c == "valid"] if finetunesetting == "all": finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in targetex if c == "train"] elif finetunesetting == "lex": finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in targetex if c == "lexicon"] elif finetunesetting == "min": finetunetrainex = get_maximum_spanning_examples([(a, b, c, d) for a, b, c, d in targetex if c == "train"], mincoverage=mincoverage, loadedex=[e for e in pretrainex if e[2] == "pretrain"]) finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in finetunetrainex] finetunevalidex = [(a, tokenize_and_add_start(b), "ftvalid", d) for a, b, c, d in targetex if c == "valid"] finetunetestex = [(a, tokenize_and_add_start(b), "fttest", d) for a, b, c, d in targetex if c == "test"] print(f"Using mode \"{finetunesetting}\" for finetuning data: " f"\n\t{len(finetunetrainex)} training examples") allex = pretrainex + pretrainvalidex + finetunetrainex + finetunevalidex + finetunetestex ds = Dataset(allex) if onlyabstract: et = get_lf_abstract_transform(ds[lambda x: x[3] != testdomain].examples) ds = ds.map(lambda x: (x[0], et(x[1]), x[2], x[3])) seqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID) seqenc = 
SequenceEncoder(vocab=seqenc_vocab, tokenizer=lambda x: x, add_start_token=False, add_end_token=True) for example in ds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=example[2] in ("pretrain", "fttrain")) seqenc.finalize_vocab(min_freq=min_freq, top_k=top_k) generaltokenmask = torch.zeros(seqenc_vocab.number_of_ids(), dtype=torch.long) for token, tokenid in seqenc_vocab.D.items(): if token in general_tokens: generaltokenmask[tokenid] = 1 nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode) def tokenize(x): ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], seqenc.convert(x[1], return_what="tensor"), x[2], x[0], x[1], x[3]) return ret tds, ftds, vds, fvds, xds = ds[(None, None, "pretrain", None)].map(tokenize), \ ds[(None, None, "fttrain", None)].map(tokenize), \ ds[(None, None, "pretrainvalid", None)].map(tokenize), \ ds[(None, None, "ftvalid", None)].map(tokenize), \ ds[(None, None, "fttest", None)].map(tokenize) return tds, ftds, vds, fvds, xds, nl_tokenizer, seqenc, generaltokenmask
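# Hedged usage sketch for the loader above; the domain names are examples of
# Overnight domains and the settings mirror the documented options.
tds, ftds, vds, fvds, xds, nltok, seqenc, generaltokenmask = load_ds(
    traindomains=("restaurants", "publications"),
    testdomain="housing",
    pretrainsetting="all+lex",
    finetunesetting="min")
print(len(tds), len(ftds), len(xds))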
def load_ds(domain="restaurants", nl_mode="bert-base-uncased", trainonvalid=False, noreorder=False): """ Creates a dataset of examples which have * NL question and tensor * original FL tree * reduced FL tree with slots (this is randomly generated) * tensor corresponding to reduced FL tree with slots * mask specifying which elements in reduced FL tree are terminated * 2D gold that specifies whether a token/action is in gold for every position (compatibility with MML!) """ orderless = {"op:and", "SW:concat"} # only use in eval!! ds = OvernightDatasetLoader().load(domain=domain, trainonvalid=trainonvalid) ds = ds.map(lambda x: (x[0], ATree("@START@", [x[1]]), x[2])) if not noreorder: ds = ds.map(lambda x: (x[0], reorder_tree(x[1], orderless=orderless), x[2])) vocab = Vocab(padid=0, startid=2, endid=3, unkid=1) vocab.add_token("@START@", seen=np.infty) vocab.add_token( "@CLOSE@", seen=np.infty ) # only here for the action of closing an open position, will not be seen at input vocab.add_token( "@OPEN@", seen=np.infty ) # only here for the action of opening a closed position, will not be seen at input vocab.add_token( "@REMOVE@", seen=np.infty ) # only here for deletion operations, won't be seen at input vocab.add_token( "@REMOVESUBTREE@", seen=np.infty ) # only here for deletion operations, won't be seen at input vocab.add_token("@SLOT@", seen=np.infty) # will be seen at input, can't be produced! nl_tokenizer = BertTokenizer.from_pretrained(nl_mode) # for tok, idd in nl_tokenizer.vocab.items(): # vocab.add_token(tok, seen=np.infty) # all wordpieces are added for possible later generation tds, vds, xds = ds[lambda x: x[2] == "train"], \ ds[lambda x: x[2] == "valid"], \ ds[lambda x: x[2] == "test"] seqenc = SequenceEncoder( vocab=vocab, tokenizer=lambda x: extract_info(x, onlytokens=True), add_start_token=False, add_end_token=False) for example in tds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=True) for example in vds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) for example in xds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) seqenc.finalize_vocab(min_freq=0) def mapper(x): nl = x[0] fl = x[1] fltoks = extract_info(fl, onlytokens=True) seq = seqenc.convert(fltoks, return_what="tensor") ret = (nl_tokenizer.encode(nl, return_tensors="pt")[0], seq) return ret tds_seq = tds.map(mapper) vds_seq = vds.map(mapper) xds_seq = xds.map(mapper) return tds_seq, vds_seq, xds_seq, nl_tokenizer, seqenc, orderless
class GeoDataset(object):
    def __init__(self,
                 p="../../datasets/geo880_multiling/geoquery/",
                 train_lang="en",
                 test_lang=None,
                 bert_tokenizer=None,
                 min_freq: int = 2,
                 cvfolds=None,
                 testfold=None,
                 **kw):
        super(GeoDataset, self).__init__(**kw)
        self.train_lang = train_lang
        self.test_lang = test_lang if test_lang is not None else train_lang
        self.cvfolds, self.testfold = cvfolds, testfold
        self._initialize(p, bert_tokenizer, min_freq)

    def _initialize(self, p, bert_tokenizer, min_freq: int):
        self.data = {}
        self.bert_vocab = Vocab()
        self.bert_vocab.set_dict(bert_tokenizer.vocab)
        self.sentence_encoder = SequenceEncoder(
            lambda x: bert_tokenizer.tokenize(f"[CLS] {x} [SEP]"),
            vocab=self.bert_vocab)

        trainlines = [x for x in ujson.load(
            open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))]
        # load test examples from the test-language file (not the train language)
        testlines = [x for x in ujson.load(
            open(os.path.join(p, f"geo-{self.test_lang}.json"), "r"))]
        trainlines = [x for x in trainlines if x["split"] == "train"]
        testlines = [x for x in testlines if x["split"] == "test"]

        if self.cvfolds is None:
            splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
        else:
            cvsplit_len = len(trainlines) / self.cvfolds
            splits = []
            for i in range(0, self.cvfolds):
                splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
            random.shuffle(splits)
            splits = ["valid" if x == self.testfold else "train" for x in splits]
            splits = splits + ["test"] * len(testlines)

        questions = [x["nl"] for x in trainlines]
        queries = [x["mrl"] for x in trainlines]
        xquestions = [x["nl"] for x in testlines]
        xqueries = [x["mrl"] for x in testlines]
        questions += xquestions
        queries += xqueries

        # initialize output vocabulary
        outvocab = Vocab()
        for token, bertid in self.bert_vocab.D.items():
            outvocab.add_token(token, seen=False)

        self.query_encoder = SequenceEncoder(
            tokenizer=partial(basic_query_tokenizer, strtok=bert_tokenizer),
            vocab=outvocab, add_end_token=True)

        # build vocabularies
        for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        keeptokens = set(self.bert_vocab.D.keys())
        self.query_encoder.finalize_vocab(min_freq=min_freq, keep_tokens=keeptokens)

        token_specs = self.build_token_specs(queries)
        self.token_specs = token_specs

        self.build_data(questions, queries, splits)

    def build_token_specs(self, outputs: Iterable[str]):
        token_specs = dict()

        def walk_the_tree(t, _ts):
            l = t.label()
            if l not in _ts:
                _ts[l] = [np.infty, -np.infty]
            minc, maxc = _ts[l]
            _ts[l] = [min(minc, len(t)), max(maxc, len(t))]
            for c in t:
                walk_the_tree(c, _ts)

        for out in outputs:
            out_tokens = self.query_encoder.convert(out, return_what="tokens")[0]
            assert (out_tokens[-1] == "@END@")
            out_tokens = out_tokens[:-1]
            out_str = " ".join(out_tokens)
            tree = lisp_to_tree(out_str)
            walk_the_tree(tree, token_specs)
        # token_specs["and"][1] = np.infty
        return token_specs

    def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
                   splits: Iterable[str]):
        maxlen_in, maxlen_out = 0, 0
        for inp, out, split in zip(inputs, outputs, splits):
            # tokenize both input and output
            inp_tokens = self.sentence_encoder.convert(inp, return_what="tokens")[0]
            out_tokens = self.query_encoder.convert(out, return_what="tokens")[0]

            # get gold tree
            gold_tree = lisp_to_tree(" ".join(out_tokens[:-1]))
            assert (gold_tree is not None)

            # replace words in output that can't be copied from given input with UNK tokens
            unktoken = self.query_encoder.vocab.unktoken
            inp_tokens_ = set(inp_tokens)
            out_tokens = [out_token if out_token in inp_tokens_
                          or (out_token in self.query_encoder.vocab
                              and out_token not in self.query_encoder.vocab.rare_tokens)
                          else unktoken
                          for out_token in out_tokens]

            # convert token sequences to ids
            inp_tensor = self.sentence_encoder.convert(inp_tokens, return_what="tensor")[0]
            out_tensor = self.query_encoder.convert(out_tokens, return_what="tensor")[0]

            state = TreeDecoderState([inp], [gold_tree],
                                     inp_tensor[None, :], out_tensor[None, :],
                                     [inp_tokens], [out_tokens],
                                     self.sentence_encoder.vocab,
                                     self.query_encoder.vocab,
                                     token_specs=self.token_specs)
            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            maxlen_in = max(maxlen_in, len(inp_tokens))
            maxlen_out = max(maxlen_out, len(out_tensor))
        self.maxlen_input = maxlen_in
        self.maxlen_output = maxlen_out

    def get_split(self, split: str):
        data = []
        for split_e in split.split("+"):
            data += self.data[split_e]
        return DatasetSplitProxy(data)

    @staticmethod
    def collate_fn(data: Iterable):
        goldmaxlen = 0
        inpmaxlen = 0
        data = [state.make_copy(detach=True, deep=True) for state in data]
        for state in data:
            goldmaxlen = max(goldmaxlen, state.gold_tensor.size(1))
            inpmaxlen = max(inpmaxlen, state.inp_tensor.size(1))
        for state in data:
            state.gold_tensor = torch.cat([
                state.gold_tensor,
                state.gold_tensor.new_zeros(1, goldmaxlen - state.gold_tensor.size(1))], 1)
            state.inp_tensor = torch.cat([
                state.inp_tensor,
                state.inp_tensor.new_zeros(1, inpmaxlen - state.inp_tensor.size(1))], 1)
        ret = data[0].merge(data)
        return ret

    def dataloader(self, split: str = None, batsize: int = 5, shuffle=None):
        if split is None:    # return all splits
            ret = {}
            for split in self.data.keys():
                ret[split] = self.dataloader(batsize=batsize, split=split, shuffle=shuffle)
            return ret
        else:
            # assert(split in self.data.keys())
            shuffle = shuffle if shuffle is not None else split in ("train", "train+valid")
            dl = DataLoader(self.get_split(split), batch_size=batsize,
                            shuffle=shuffle, collate_fn=type(self).collate_fn)
            return dl
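# Hedged usage sketch for GeoDataset; the tokenizer choice follows the
# bert-base defaults used elsewhere in this code, and the data path is
# the constructor default.
from transformers import BertTokenizer
bert_tok = BertTokenizer.from_pretrained("bert-base-uncased")
geo = GeoDataset(train_lang="en", bert_tokenizer=bert_tok)
train_dl = geo.dataloader("train", batsize=16)    # shuffled by default
all_dls = geo.dataloader(batsize=16)              # dict of DataLoaders, one per split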
def load_ds(traindomains=("restaurants", ), testdomain="housing", min_freq=1, mincoverage=1, top_k=np.infty, nl_mode="bert-base-uncased", fullsimplify=False, add_domain_start=True, useall=False): def tokenize_and_add_start(t, _domain): tokens = tree_to_lisp_tokens(t) starttok = f"@START/{_domain}@" if add_domain_start else "@START@" tokens = [starttok] + tokens return tokens allex = [] for traindomain in traindomains: ds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE, validfrac=.10)\ .load(domain=traindomain) allex += ds[(None, None, lambda x: x in ("train", "valid"))].map(lambda x: (x[0], x[1], x[ 2], traindomain)).examples # don't use test examples testds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE)\ .load(domain=testdomain) if useall: print("using all training examples") sortedexamples = testds[(None, None, "train")].examples else: sortedexamples = get_maximum_spanning_examples( testds[(None, None, "train")].examples, mincoverage=mincoverage, loadedex=[e for e in allex if e[2] == "train"]) allex += testds[( None, None, "valid")].map(lambda x: (x[0], x[1], "ftvalid", testdomain)).examples allex += testds[( None, None, "test")].map(lambda x: (x[0], x[1], x[2], testdomain)).examples allex += [(ex[0], ex[1], "fttrain", testdomain) for ex in sortedexamples] _ds = Dataset(allex) ds = _ds.map(lambda x: (x[0], tokenize_and_add_start(x[1], x[3]), x[2], x[3])) et = get_lf_abstract_transform(ds[lambda x: x[3] != testdomain].examples) ds = ds.map(lambda x: (x[0], et(x[1]), x[1], x[2], x[3])) seqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID) absseqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID) absseqenc = SequenceEncoder(vocab=seqenc_vocab, tokenizer=lambda x: x, add_start_token=False, add_end_token=True) fullseqenc = SequenceEncoder(vocab=absseqenc_vocab, tokenizer=lambda x: x, add_start_token=False, add_end_token=True) for example in ds.examples: absseqenc.inc_build_vocab(example[1], seen=example[3] in ("train", "fttrain")) fullseqenc.inc_build_vocab(example[2], seen=example[3] in ("train", "fttrain")) absseqenc.finalize_vocab(min_freq=min_freq, top_k=top_k) fullseqenc.finalize_vocab(min_freq=min_freq, top_k=top_k) nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode) def tokenize(x): ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], absseqenc.convert(x[1], return_what="tensor"), fullseqenc.convert(x[2], return_what="tensor"), x[3], x[0], x[1], x[4]) return ret tds, ftds, vds, fvds, xds = ds[(None, None, None, "train", None)].map(tokenize), \ ds[(None, None, None, "fttrain", None)].map(tokenize), \ ds[(None, None, None, "valid", None)].map(tokenize), \ ds[(None, None, None, "ftvalid", None)].map(tokenize), \ ds[(None, None, None, "test", None)].map(tokenize) return tds, ftds, vds, fvds, xds, nl_tokenizer, fullseqenc, absseqenc
def __init__(self, dim, vocab: Vocab = None, inpvocab: Vocab = None, numlayers: int = 6, numheads: int = 6, userelpos=False, useabspos=True, relposmode="basic", relposrng=10, dropout: float = 0., sidedrop=0., maxpos=512, bertname="bert-base-uncased", mode="normal", priorweight=0., **kw): super(SetModel, self).__init__(**kw) self.vocab = vocab self.inpvocab = inpvocab self.vocabsize = vocab.number_of_ids() self.dim = dim self.userelpos = userelpos self.relposrng = relposrng self.useabspos = useabspos self.out = torch.nn.Linear(self.dim, self.vocabsize) self.bertname = bertname if self.bertname.startswith("none") or self.bertname == "vanilla": self.encrelposemb = None if self.userelpos is True: if relposmode == "basic": self.encrelposemb = BasicRelPosEmb(self.dim, relposrng) # elif relposmode == "mod": # self.relposemb = ModRelPosEmb(self.dim, relposrng, levels=4) else: raise Exception(f"Unrecognized relposmode '{relposmode}'") bname = "bert" + self.bertname[4:] if self.bertname == "vanilla": inpvocabsize = inpvocab.number_of_ids() else: tokenizer = AutoTokenizer.from_pretrained(bname) inpvocabsize = tokenizer.vocab_size encconfig = TransformerConfig(vocab_size=inpvocabsize, d_model=self.dim, d_ff=self.dim * 4, d_kv=int(self.dim / numheads), attention_dropout_rate=0., num_layers=numlayers, num_heads=numheads, dropout_rate=dropout, sideways_dropout=sidedrop, vib_att=mode.replace(" ", "") == "vibatt") encemb = TransformerEmbeddings(encconfig.vocab_size, encconfig.d_model, dropout=dropout, max_position_embeddings=maxpos, useabspos=useabspos) self.encoder_model = TransformerStack(encconfig, encemb, rel_emb=self.encrelposemb) else: self.encoder_model = BertModel.from_pretrained( self.bertname, hidden_dropout_prob=min(dropout, 0.2), attention_probs_dropout_prob=min(dropout, 0.1)) self.adapter = None if self.encoder_model.config.hidden_size != self.dim: self.adapter = torch.nn.Linear( self.encoder_model.config.hidden_size, self.dim, bias=False) self.reset_parameters() self.bce = torch.nn.BCEWithLogitsLoss(reduction="none") self.mode = mode self.priorweight = priorweight if self.mode == "vib": self.vib_lin_mu = torch.nn.Linear(dim, dim) self.vib_lin_logvar = torch.nn.Linear(dim, dim)
def load_ds(domain="restaurants", nl_mode="bert-base-uncased", trainonvalid=False, noreorder=False, numbered=False): """ Creates a dataset of examples which have * NL question and tensor * original FL tree * reduced FL tree with slots (this is randomly generated) * tensor corresponding to reduced FL tree with slots * mask specifying which elements in reduced FL tree are terminated * 2D gold that specifies whether a token/action is in gold for every position (compatibility with MML!) """ # orderless = {"op:and", "SW:concat"} # only use in eval!! orderless = ORDERLESS ds = OvernightDatasetLoader(simplify_mode="none").load( domain=domain, trainonvalid=trainonvalid) # ds contains 3-tuples of (input, output tree, split name) if not noreorder: ds = ds.map(lambda x: (x[0], reorder_tree(x[1], orderless=orderless), x[2])) ds = ds.map(lambda x: (x[0], tree_to_seq(x[1]), x[2])) if numbered: ds = ds.map(lambda x: (x[0], make_numbered_tokens(x[1]), x[2])) vocab = Vocab(padid=0, startid=2, endid=3, unkid=1) vocab.add_token("@BOS@", seen=np.infty) vocab.add_token("@EOS@", seen=np.infty) vocab.add_token("@STOP@", seen=np.infty) nl_tokenizer = BertTokenizer.from_pretrained(nl_mode) tds, vds, xds = ds[lambda x: x[2] == "train"], \ ds[lambda x: x[2] == "valid"], \ ds[lambda x: x[2] == "test"] seqenc = SequenceEncoder(vocab=vocab, tokenizer=lambda x: x, add_start_token=False, add_end_token=False) for example in tds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=True) for example in vds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) for example in xds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) seqenc.finalize_vocab(min_freq=0) def mapper(x): seq = seqenc.convert(x[1], return_what="tensor") ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], seq) return ret tds_seq = tds.map(mapper) vds_seq = vds.map(mapper) xds_seq = xds.map(mapper) return tds_seq, vds_seq, xds_seq, nl_tokenizer, seqenc, orderless
def __init__(self, dim, vocab: Vocab = None, numlayers: int = 6, numheads: int = 6,
             userelpos=False, useabspos=True, relposmode="basic", relposrng=10,
             dropout: float = 0., maxpos=512, weightmode="vanilla", **kw):
    super(TransformerEncoder, self).__init__(**kw)
    self.vocab = vocab
    self.vocabsize = vocab.number_of_ids()
    self.dim = dim
    self.userelpos = userelpos
    self.relposrng = relposrng
    self.useabspos = useabspos
    self.weightmode = weightmode

    if self.weightmode.startswith("none") or self.weightmode == "vanilla":
        self.encrelposemb = None
        if self.userelpos is True:
            if relposmode == "basic":
                self.encrelposemb = BasicRelPosEmb(self.dim, relposrng)
            # elif relposmode == "mod":
            #     self.relposemb = ModRelPosEmb(self.dim, relposrng, levels=4)
            else:
                raise Exception(f"Unrecognized relposmode '{relposmode}'")
        bname = "bert" + self.weightmode[4:]
        if self.weightmode == "vanilla":
            inpvocabsize = self.vocabsize
        else:
            tokenizer = AutoTokenizer.from_pretrained(bname)
            inpvocabsize = tokenizer.vocab_size
        config = TransformerConfig(vocab_size=inpvocabsize, d_model=self.dim,
                                   d_ff=self.dim * 4, d_kv=int(self.dim / numheads),
                                   num_layers=numlayers, num_heads=numheads,
                                   dropout_rate=dropout)
        encemb = TransformerEmbeddings(config.vocab_size, config.d_model,
                                       dropout=dropout,
                                       max_position_embeddings=maxpos,
                                       useabspos=useabspos)
        self.encoder_model = TransformerStack(config, encemb,
                                              rel_emb=self.encrelposemb)
    else:
        self.encoder_model = BertModel.from_pretrained(
            self.weightmode,
            hidden_dropout_prob=min(dropout, 0.2),
            attention_probs_dropout_prob=min(dropout, 0.1))

    self.adapter = None
    if self.encoder_model.config.hidden_size != self.dim:
        self.adapter = torch.nn.Linear(self.encoder_model.config.hidden_size,
                                       self.dim, bias=False)

    self.reset_parameters()
def __init__(self, dim, vocab: Vocab = None, inpvocab: Vocab = None, numlayers: int = 6, numheads: int = 6, userelpos=False, useabspos=True, relposmode="basic", relposrng=10, mode="normal", dropout: float = 0., worddropout: float = 0., maxpos=512, bertname="bert-base-uncased", **kw): super(TransformerDecoderCell, self).__init__(**kw) self.vocab = vocab self.inpvocab = inpvocab self.vocabsize = vocab.number_of_ids() self.dim = dim self.userelpos = userelpos self.relposrng = relposrng self.useabspos = useabspos self.mode = mode decconfig = TransformerConfig(vocab_size=self.vocabsize, d_model=self.dim, d_ff=self.dim * 4, d_kv=int(self.dim / numheads), num_layers=numlayers, num_heads=numheads, dropout_rate=dropout) self.dec_emb = torch.nn.Embedding(self.vocabsize, decconfig.d_model) self.slot_emb = torch.nn.Embedding(1, decconfig.d_model) self.relposemb = None if self.userelpos is True: if relposmode == "basic": self.relposemb = BasicRelPosEmb(self.dim, relposrng) # elif relposmode == "mod": # self.relposemb = ModRelPosEmb(self.dim, relposrng, levels=4) else: raise Exception(f"Unrecognized relposmode '{relposmode}'") self.absposemb = None if self.relposemb is None or self.useabspos is True: self.absposemb = torch.nn.Embedding(maxpos, decconfig.d_model) decoder_config = deepcopy(decconfig) decoder_config.is_decoder = True decoder_config.use_causal_mask = True self.decoder = TransformerStackDecoder(decoder_config, rel_emb=self.relposemb) self.out = torch.nn.Linear(self.dim, self.vocabsize) vocab_mask = torch.ones(self.vocabsize) # for excl_token in self.exclude: # if excl_token in self.vocab: # vocab_mask[self.vocab[excl_token]] = 0 self.register_buffer("vocab_mask", vocab_mask) self.bertname = bertname self.encrelposemb = None if self.bertname.startswith("none") or self.bertname == "vanilla": if self.userelpos is True: if relposmode == "basic": self.encrelposemb = BasicRelPosEmb(self.dim, relposrng) # elif relposmode == "mod": # self.relposemb = ModRelPosEmb(self.dim, relposrng, levels=4) else: raise Exception(f"Unrecognized relposmode '{relposmode}'") bname = "bert" + self.bertname[4:] if self.bertname == "vanilla": inpvocabsize = inpvocab.number_of_ids() self.inpworddropout = WordDropout( worddropout, self.inpvocab[self.inpvocab.masktoken], [self.inpvocab[self.inpvocab.padtoken]]) else: tokenizer = AutoTokenizer.from_pretrained(bname) inpvocabsize = tokenizer.vocab_size self.inpworddropout = WordDropout( worddropout, self.inpvocab[self.inpvocab.masktoken], [ self.inpvocab["[CLS]"], self.inpvocab["[SEP]"], self.inpvocab[self.inpvocab.padtoken] ]) encconfig = TransformerConfig(vocab_size=inpvocabsize, d_model=self.dim, d_ff=self.dim * 4, d_kv=int(self.dim / numheads), num_layers=numlayers, num_heads=numheads, dropout_rate=dropout) encemb = TransformerEmbeddings(encconfig.vocab_size, encconfig.d_model, dropout=dropout, max_position_embeddings=maxpos, useabspos=useabspos) self.encoder_model = TransformerStack(encconfig, encemb, rel_emb=self.encrelposemb) else: self.encoder_model = BertModel.from_pretrained( self.bertname, hidden_dropout_prob=min(dropout, 0.2), attention_probs_dropout_prob=min(dropout, 0.1)) tokenizer = AutoTokenizer.from_pretrained(self.bertname) inpvocabsize = tokenizer.vocab_size self.inpvocab = Vocab() for tok, id in tokenizer.vocab.items(): self.inpvocab.D[tok] = id self.inpvocab.masktoken = "[MASK]" self.inpvocab.unktoken = "[UNK]" self.inpvocab.padtoken = "[PAD]" self.inpworddropout = WordDropout( worddropout, self.inpvocab[self.inpvocab.masktoken], [ 
self.inpvocab["[CLS]"], self.inpvocab["[SEP]"], self.inpvocab[self.inpvocab.padtoken] ]) self.adapter = None if self.encoder_model.config.hidden_size != decoder_config.d_model: self.adapter = torch.nn.Linear( self.encoder_model.config.hidden_size, decoder_config.d_model, bias=False) self.worddropout = WordDropout(worddropout, self.vocab[self.vocab.masktoken], [self.vocab[self.vocab.padtoken]]) self.reset_parameters()
def load_ds(dataset="scan/random", validfrac=0.1, recompute=False, bertname="bert-base-uncased"): tt = q.ticktock("data") tt.tick(f"loading '{dataset}'") if bertname.startswith("none"): bertname = "bert" + bertname[4:] if dataset.startswith("cfq/") or dataset.startswith("scan/mcd"): key = f"{dataset}|bertname={bertname}" print(f"validfrac is ineffective with dataset '{dataset}'") else: key = f"{dataset}|validfrac={validfrac}|bertname={bertname}" shelfname = os.path.basename(__file__) + ".cache.shelve" if not recompute: tt.tick(f"loading from shelf (key '{key}')") with shelve.open(shelfname) as shelf: if key not in shelf: recompute = True tt.tock("couldn't load from shelf") else: shelved = shelf[key] trainex, validex, testex, fldic = shelved["trainex"], shelved[ "validex"], shelved["testex"], shelved["fldic"] inpdic = shelved["inpdic"] if "inpdic" in shelved else None trainds, validds, testds = Dataset(trainex), Dataset( validex), Dataset(testex) tt.tock("loaded from shelf") if recompute: tt.tick("loading data") splits = dataset.split("/") dataset, splits = splits[0], splits[1:] split = "/".join(splits) if dataset == "scan": ds = SCANDatasetLoader().load(split, validfrac=validfrac) elif dataset == "cfq": ds = CFQDatasetLoader().load(split + "/modent") else: raise Exception(f"Unknown dataset: '{dataset}'") tt.tock("loaded data") tt.tick("creating tokenizer") tokenizer = Tokenizer(bertname=bertname) tt.tock("created tokenizer") print(len(ds)) tt.tick("dictionaries") inpdic = Vocab() inplens, outlens = [0], [] fldic = Vocab() for x in ds: outtoks = tokenizer.get_out_toks(x[1]) outlens.append(len(outtoks)) for tok in outtoks: fldic.add_token(tok, seen=x[2] == "train") inptoks = tokenizer.get_toks(x[0]) for tok in inptoks: inpdic.add_token(tok, seen=x[2] == "train") inpdic.finalize(min_freq=0, top_k=np.infty) fldic.finalize(min_freq=0, top_k=np.infty) print( f"input avg/max length is {np.mean(inplens):.1f}/{max(inplens)}, output avg/max length is {np.mean(outlens):.1f}/{max(outlens)}" ) print( f"output vocabulary size: {len(fldic.D)} at output, {len(inpdic.D)} at input" ) tt.tock() tt.tick("tensorizing") tokenizer.inpvocab = inpdic tokenizer.outvocab = fldic trainds = ds.filter(lambda x: x[-1] == "train").map( lambda x: x[:-1]).map( lambda x: tokenizer.tokenize(x[0], x[1])).cache(True) validds = ds.filter(lambda x: x[-1] == "valid").map( lambda x: x[:-1]).map( lambda x: tokenizer.tokenize(x[0], x[1])).cache(True) testds = ds.filter(lambda x: x[-1] == "test").map( lambda x: x[:-1]).map( lambda x: tokenizer.tokenize(x[0], x[1])).cache(True) # ds = ds.map(lambda x: tokenizer.tokenize(x[0], x[1]) + (x[2],)).cache(True) tt.tock("tensorized") tt.tick("shelving") with shelve.open(shelfname) as shelf: shelved = { "trainex": trainds.examples, "validex": validds.examples, "testex": testds.examples, "fldic": fldic, "inpdic": inpdic, } shelf[key] = shelved tt.tock("shelved") tt.tock(f"loaded '{dataset}'") tt.msg( f"#train={len(trainds)}, #valid={len(validds)}, #test={len(testds)}") tt.msg("Overlap of validation with train:") overlaps = compute_overlaps(trainds, validds) print(json.dumps(overlaps, indent=4)) tt.msg("Overlap of test with train:") overlaps = compute_overlaps(trainds, testds) print(json.dumps(overlaps, indent=4)) return trainds, validds, testds, fldic, inpdic
def __init__(self, dim, vocab: Vocab = None, inpvocab: Vocab = None,
             numlayers: int = 2, numtmlayers=6, mode="normal",
             dropout: float = 0., worddropout: float = 0., numheads=6,
             noencoder=False, **kw):
    super(DecoderCell, self).__init__(**kw)
    self.vocab = vocab
    self.inpvocab = inpvocab
    self.vocabsize = vocab.number_of_ids()
    self.dim = dim
    self.mode = mode
    self.noencoder = noencoder
    self.numlayers = numlayers
    self.numtmlayers = numtmlayers

    self.dec_emb = torch.nn.Embedding(self.vocabsize + 3, self.dim)
    dims = [self.dim + self.dim] + [self.dim for _ in range(numlayers)]
    self.dec_stack = torch.nn.ModuleList(
        [torch.nn.GRUCell(dims[i], dims[i + 1]) for i in range(numlayers)])
    self.dropout = torch.nn.Dropout(dropout)

    self.attn_linQ = None
    self.attn_linK = None
    self.attn_linV = None
    # self.attn_linQ = torch.nn.Linear(self.dim, self.dim)
    # self.attn_linK = torch.nn.Linear(self.dim, self.dim)
    # self.attn_linV = torch.nn.Linear(self.dim, self.dim)

    self.preout = torch.nn.Linear(self.dim + self.dim, self.dim)
    self.preoutnonlin = torch.nn.CELU()
    if self.mode == "cont":
        pass
    else:
        self.out = torch.nn.Linear(self.dim, self.vocabsize + 3)

    inpvocabsize = inpvocab.number_of_ids()
    if not self.noencoder:
        encconfig = TransformerConfig(vocab_size=inpvocabsize, d_model=self.dim,
                                      d_ff=self.dim * 4, d_kv=int(self.dim / numheads),
                                      num_layers=self.numtmlayers,
                                      num_heads=numheads, dropout_rate=dropout)
        encemb = TransformerEmbeddings(encconfig.vocab_size, encconfig.d_model,
                                       dropout=dropout,
                                       max_position_embeddings=1000,
                                       useabspos=True)
        self.encoder_model = TransformerStack(encconfig, encemb)
        # self.encoder_model = Encoder(inpvocabsize + 5, self.dim, int(self.dim / 2),
        #                              num_layers=numlayers, dropout=dropout)
    self.adapter = None

    self.inpworddropout = WordDropout(worddropout,
                                      self.inpvocab[self.inpvocab.masktoken],
                                      [self.inpvocab[self.inpvocab.padtoken]])
    self.worddropout = WordDropout(worddropout,
                                   self.vocab[self.vocab.masktoken],
                                   [self.vocab[self.vocab.padtoken]])

    self.lenlin = torch.nn.Linear(self.dim * 2, self.dim)
    self.lennonlin = torch.nn.CELU()
    self.lenbias = torch.nn.Linear(self.dim, 1)
    self.lenscale = torch.nn.Linear(self.dim, 1)

    self.reset_parameters()