def load_questions(p=defaultqp):
    tt = q.ticktock("question loader")
    tt.tick("loading questions")
    questions, queries = q.StringMatrix(), q.StringMatrix()
    xquestions, xqueries = q.StringMatrix(), q.StringMatrix()
    queries.tokenize = lambda x: x.split()
    xqueries.tokenize = lambda x: x.split()
    with open(p + ".train.butd") as f:
        for line in f:
            qid, question, query, replacements = line.split("\t")
            questions.add(question)
            queries.add(query)
    questions.finalize()
    queries.finalize()
    with open(p + ".test.butd") as f:
        for line in f:
            qid, question, query, replacements = line.split("\t")
            xquestions.add(question)
            xqueries.add(query)
    xquestions.finalize()
    xqueries.finalize()
    tt.tock("loaded questions")
    return (questions, queries), (xquestions, xqueries)
def load_questions_inone(p=defaultqp):
    tt = q.ticktock("question loader")
    tt.tick("loading questions")
    questions, queries = q.StringMatrix(), q.StringMatrix()
    qids = []
    queries.tokenize = lambda x: x.split()
    with open(p + ".train.butd") as f:
        for line in f:
            qid, question, query, replacements = line.split("\t")
            questions.add(question)
            queries.add(query)
            qids.append(qid)
    tx_sep = len(qids)
    with open(p + ".test.butd") as f:
        for line in f:
            qid, question, query, replacements = line.split("\t")
            questions.add(question)
            queries.add(query)
            qids.append(qid)
    questions.finalize()
    queries.finalize()
    return questions, queries, qids, tx_sep
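# --- Hedged usage sketch (not part of the original code) ---------------------
# Illustrates how the two loaders above might be driven. It assumes qelos is
# importable as `q`, that `defaultqp` points at existing *.train.butd /
# *.test.butd files, and that q.StringMatrix supports `.D` and integer
# indexing as used elsewhere in this repo. `_example_load_questions` is a
# hypothetical helper name, for illustration only.
def _example_load_questions():
    (questions, queries), (xquestions, xqueries) = load_questions()
    print(questions[0])        # first training question, as stored
    print(len(queries.D))      # size of the query-token vocabulary

    allq, allqu, qids, tx_sep = load_questions_inone()
    print("train examples:", tx_sep)
    print("test examples:", len(qids) - tx_sep)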
def load_data(p="../../../data/lcquad-fql/"):
    trainp = os.path.join(p, "train.json")
    testp = os.path.join(p, "test.json")
    print(f"Loading data from: '{trainp}' (train) and '{testp}' (test)")
    traindata = json.load(open(trainp))
    print(f"Number of training examples: {len(traindata)}")
    testdata = json.load(open(testp))
    print(f"Number of test examples: {len(testdata)}")

    # process logical forms
    # parse to trees, replace entities with placeholders in queries
    tdata = [(example["question"], ent2placeholder(fql2tree(example["logical_form"])))
             for example in traindata]
    xdata = [(example["question"], ent2placeholder(fql2tree(example["logical_form"])))
             for example in testdata]

    # get node types that have children
    parentnodes = set()
    for (_, e) in tdata + xdata:
        que = [e]
        while len(que) > 0:
            head = que.pop(0)
            if head.children is not None:
                assert len(head.children) > 0
                parentnodes.add(head.name)
                que += head.children
    print(f"Types of nodes that have children ({len(parentnodes)}): \n{parentnodes}")

    # build string matrices
    teststart = len(tdata)
    xsm = q.StringMatrix(indicate_start_end=True)
    ysm = q.StringMatrix(indicate_start=True)
    ysm.tokenize = lambda x: x.split()
    for question, l in tdata + xdata:
        xsm.add(question)
        ysm.add(l.to_transitions() + " <MASK>")
    xsm.finalize()
    ysm.finalize()

    tok2act = {}
    for tok in ysm.D:
        if tok == "<RED>":
            tok2act[ysm.D[tok]] = 2
        elif tok in parentnodes:
            tok2act[ysm.D[tok]] = 1
        else:
            tok2act[ysm.D[tok]] = 0

    return xsm, ysm, teststart, tok2act
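# --- Hedged usage sketch (not part of the original code) ---------------------
# Shows how the matrices returned above might be split into train/test parts
# and how tok2act maps output-token ids to tree actions (0 = leaf, 1 = node
# with children, 2 = the reduce token "<RED>"). The default data path and the
# availability of fql2tree / ent2placeholder are assumed from this module.
# `_example_lcquad_split` is a hypothetical helper name.
def _example_lcquad_split():
    xsm, ysm, teststart, tok2act = load_data()
    train_x, test_x = xsm.matrix[:teststart], xsm.matrix[teststart:]
    train_y, test_y = ysm.matrix[:teststart], ysm.matrix[teststart:]
    print(len(train_x), len(test_x))
    # number of output tokens that open a subtree:
    print(sum(1 for v in tok2act.values() if v == 1))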
def load_word_mat(origp="../../data/buboqa/data/processed_simplequestions_dataset/"):
    outp = os.path.join(origp, "all.pkl")
    generate = True   # set to False to load the cached pickle instead of rebuilding
    if generate:
        trainp = os.path.join(origp, "train.txt")
        validp = os.path.join(origp, "valid.txt")
        testp = os.path.join(origp, "test.txt")
        trainlines = open(trainp, encoding="utf8").readlines()
        validlines = open(validp, encoding="utf8").readlines()
        testlines = open(testp, encoding="utf8").readlines()
        sm = q.StringMatrix()
        sm.tokenize = lambda x: x.split()
        i = 0
        for line in tqdm(trainlines):
            sm.add(line.split("\t")[5])
            i += 1
        devstart = i
        for line in tqdm(validlines):
            sm.add(line.split("\t")[5])
            i += 1
        teststart = i
        for line in tqdm(testlines):
            sm.add(line.split("\t")[5])
        sm.finalize()
        print(len(sm.D))
        print(sm[0])
        pkl.dump((sm.matrix, sm.D, (devstart, teststart)), open(outp, "wb"))
        wordmat, wordD = sm.matrix, sm.D   # expose the freshly built matrix for the return below
    else:
        wordmat, wordD, (devstart, teststart) = pkl.load(open(outp, "rb"))
    return wordmat, wordD, (devstart, teststart)
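# --- Hedged usage sketch (not part of the original code) ---------------------
# Splits the word matrix into train/dev/test slices using the returned offsets.
# Assumes the processed SimpleQuestions files exist at the default path and
# that the matrix is a 2-D integer array. `_example_word_mat_split` is a
# hypothetical helper name.
def _example_word_mat_split():
    wordmat, wordD, (devstart, teststart) = load_word_mat()
    trainmat = wordmat[:devstart]
    devmat = wordmat[devstart:teststart]
    testmat = wordmat[teststart:]
    print(trainmat.shape, devmat.shape, testmat.shape)
    print("vocab size:", len(wordD))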
def build_entity_matrices(info):
    tt = q.ticktock("entity matrix builder")
    tt.tick("building")
    # build
    ids = []
    names = q.StringMatrix()
    names.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    nameschars = q.StringMatrix()
    aliases = q.StringMatrix()
    aliases.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    nameschars.tokenize = lambda x: " ".join(q.tokenize(x)) if x != "<RARE>" else [x]
    typenames = q.StringMatrix()
    typenames.tokenize = lambda x: q.tokenize(x, preserve_patterns=['<[A-Z]+>']) if x != "<RARE>" else [x]
    types = q.StringMatrix()
    types.tokenize = lambda x: x
    notabletypenames = q.StringMatrix()
    notabletypenames.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    for key, val in info.items():
        ids.append(key)
        name = list(val["name"])[0] if val["name"] is not None else "<RARE>"
        names.add(name)
        nameschars.add(name)
        alias = " <SEP> ".join(list(val["aliases"])) if val["aliases"] is not None else "<RARE>"
        aliases.add(alias)
        typename = " <SEP> ".join(list(val["typenames"])) if val["typenames"] is not None else "<RARE>"
        typenames.add(typename)
        typ = list(val["types"]) if val["types"] is not None else ["<RARE>"]
        types.add(typ)
        notabletypename = list(val["notabletypenames"])[0] if val["notabletypenames"] is not None else "<RARE>"
        notabletypenames.add(notabletypename)
    tt.tock("built")
    tt.tick("finalizing")
    names.finalize()
    nameschars.finalize()
    aliases.finalize()
    typenames.finalize()
    notabletypenames.finalize()
    types.finalize()
    tt.tock("finalized")
    edic = dict(zip(ids, range(len(ids))))
    return edic, names, nameschars, aliases, typenames, notabletypenames, types
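# --- Hedged usage sketch (not part of the original code) ---------------------
# Feeds a tiny, made-up `info` dict into the builder above. Its shape mirrors
# what the function expects: each value holds optional sets for "name",
# "aliases", "typenames", "types" and "notabletypenames", with None falling
# back to <RARE>. The ids and labels are illustrative only, and qelos
# (imported as `q`) must be available for q.tokenize.
def _example_entity_matrices():   # hypothetical helper, for illustration only
    toy_info = {
        "m.0xyz": {
            "name": {"example entity"},
            "aliases": {"an alias", "another alias"},
            "typenames": {"example type"},
            "types": {"type.example"},
            "notabletypenames": {"example type"},
        },
        "m.0abc": {
            "name": None,              # missing fields fall back to <RARE>
            "aliases": None,
            "typenames": None,
            "types": None,
            "notabletypenames": None,
        },
    }
    edic, names, nameschars, aliases, typenames, notabletypenames, types = \
        build_entity_matrices(toy_info)
    print(edic)                        # entity id -> row index
    print(names[edic["m.0xyz"]])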
def load_data(p="../../data/buboqa/data/processed_simplequestions_dataset/",
              relp="../../data/buboqa/data/rels.txt",
              typep="../../data/buboqa/data/ent2type.pkl",
              outp="../../data/buboqa/data/bertified_dataset_v2",
              ):
    tt = q.ticktock("dataloader")
    tt.tick("loading files")
    trainlines = open(p + "train.txt", encoding="utf8").readlines()
    devlines = open(p + "valid.txt", encoding="utf8").readlines()
    testlines = open(p + "test.txt", encoding="utf8").readlines()
    allrels = [x.strip() for x in open(relp).readlines()]
    ent2type = pkl.load(open(typep, "rb"))
    tt.tock("files loaded")

    tt.tick("splitting")
    trainlines = [line.strip().split("\t") for line in trainlines]
    devlines = [line.strip().split("\t") for line in devlines]
    testlines = [line.strip().split("\t") for line in testlines]
    tt.tock("split")

    tt.tick("doing some stats")
    stt = q.ticktock("datastats")
    trainrels = set([line[3] for line in trainlines])
    devrels = set([line[3] for line in devlines])
    testrels = set([line[3] for line in testlines])
    unkrels = set()
    for line in testlines:
        if line[3] not in trainrels:
            unkrels.add(line[0])
    stt.msg("{}/{} unique rels in test not in train ({})"
            .format(len(testrels - trainrels), len(testrels), len(trainrels)))
    stt.msg("{}/{} unique rels in dev not in train ({})"
            .format(len(devrels - trainrels), len(devrels), len(trainrels)))
    stt.msg("{} unique rels".format(len(trainrels | devrels | testrels)))
    stt.msg("{}/{} unkrel cases in test".format(len(unkrels), len(testlines)))
    # print(trainlines[3])

    tt.tick("creating word matrix")
    sm = q.StringMatrix(specialtoks=["<ENT>"], indicate_end=True)
    sm.tokenize = lambda x: x.split()
    wordborders = np.zeros((len(trainlines) + len(devlines) + len(testlines), 2), dtype="int64")

    def do_line(line_, i_):
        try:
            sm.add(line_[5])
            previo = "O"
            ioline = line_[6]
            if "[" in ioline or "I" not in ioline:
                print(ioline)
            ioline = ioline.replace("'", "").replace("[", "").replace("]", "").replace(",", "")
            io = ioline.split() + ["O"]
            k = 0
            for j in range(len(io)):
                if io[j] != previo:
                    if k > 1:
                        print(line_)
                    wordborders[i_, k] = j
                    previo = io[j]
                    k += 1
        except Exception as e:
            print(e)
            print(line_)

    i = 0
    for line in tqdm(trainlines):
        do_line(line, i)
        i += 1
    word_devstart = i
    for line in tqdm(devlines):
        do_line(line, i)
        i += 1
    word_teststart = i
    for line in tqdm(testlines):
        do_line(line, i)
        i += 1
    sm.finalize()
    print(len(sm.D))
    print(sm[0])
    tt.tock("created word matrix")

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def bertify(line):
        try:
            subj = line[1]
            subjtype = ent2type[subj] if subj in ent2type else "none"
            rel = line[3]
            sent = "[CLS] {} [SEP]".format(line[5].lower())
            span = "O {} O".format(line[6]).split()
            bertsent = []   # tokenizer.basic_tokenizer.tokenize(sent)
            unberter = []
            sent = sent.split()
            bertspan = []
            for i, (token, io) in enumerate(zip(sent, span)):
                berttokens = tokenizer.tokenize(token)
                bertsent += berttokens
                bertspan += [io] * len(berttokens)
                unberter += [i] * len(berttokens)
        except Exception as e:
            print(e)
            print(line)
            # raise e
        return bertsent, bertspan, rel, unberter, subjtype

    k = 1331
    ret = bertify(trainlines[k])
    print(tabulate(ret[0:2]))
    print(ret[2])
    print(tabulate([trainlines[k][5].split(), trainlines[k][6].split()]))

    tt.tick("bertifying")
    bert_tokens_train, bert_io_train, bert_rel_train, unberter_train, bert_type_train = \
        zip(*[bertify(line) for line in trainlines])
    bert_tokens_dev, bert_io_dev, bert_rel_dev, unberter_dev, bert_type_dev = \
        zip(*[bertify(line) for line in devlines])
    bert_tokens_test, bert_io_test, bert_rel_test, unberter_test, bert_type_test = \
        zip(*[bertify(line) for line in testlines])
    tt.tock("bertified")
    print(tabulate([bert_tokens_train[3], bert_io_train[3], unberter_train[3]]))
    print(bert_rel_train[3])
    # count of train subjects whose entity type is unknown
    print("{} entities in train have 'none' type".format(
        sum(1 for t in bert_type_train if t == "none")))

    # construct numpy matrix with ids in bert vocabulary
    # and also, numpy matrix with spans
    # and also, numpy vector of relations and dictionary
    tt.tick("creating token matrix")
    assert tokenizer.convert_tokens_to_ids(["[PAD]"]) == [0]
    maxlen = max([max([len(x) for x in bert_toks])
                  for bert_toks in [bert_tokens_train, bert_tokens_dev, bert_tokens_test]])
    print(maxlen)
    tokmat = np.zeros((len(bert_tokens_train) + len(bert_tokens_dev) + len(bert_tokens_test), maxlen),
                      dtype="int32")
    i = 0
    for bert_toks in [bert_tokens_train, bert_tokens_dev, bert_tokens_test]:
        for x in bert_toks:
            xids = tokenizer.convert_tokens_to_ids(x)
            tokmat[i, :len(xids)] = xids
            i += 1
    devstart = len(bert_tokens_train)
    teststart = len(bert_tokens_train) + len(bert_tokens_dev)
    assert word_devstart == devstart and word_teststart == teststart
    print(tokmat.shape)
    tt.tock("token matrix created")

    tt.tick("creating io matrix")
    iomat = np.zeros_like(tokmat)
    iobordersmat = np.zeros((tokmat.shape[0], 2), dtype="int32")
    i = 0
    for bert_io in [bert_io_train, bert_io_dev, bert_io_test]:
        for x in bert_io:
            xids = [1 if xe == "O" else 2 for xe in x]
            iomat[i, :len(xids)] = xids
            ioborders = []
            for j in range(1, len(xids)):
                if xids[j] != xids[j - 1]:
                    ioborders.append(j)
            iobordersmat[i, :len(ioborders)] = ioborders
            i += 1
    tt.tock("io matrix created")

    # unbert mat
    unbertmat = np.zeros_like(tokmat)
    i = 0
    for unberter in [unberter_train, unberter_dev, unberter_test]:
        for unbert_i in unberter:
            unbertmat[i, :len(unbert_i)] = [xe + 1 for xe in unbert_i]
            i += 1

    tt.tick("testing")
    test_i = 1331
    test_tokids = [xe for xe in tokmat[test_i] if xe != 0]
    test_ios = iomat[test_i, :len(test_tokids)]
    test_tokens = tokenizer.convert_ids_to_tokens(test_tokids)
    print(tabulate([test_tokens, test_ios]))
    print(iobordersmat[test_i])
    tt.tock("tested")

    tt.tick("doing relations")
    bert_rel_all = bert_rel_train + bert_rel_dev + bert_rel_test
    allrelwcounts = dict(zip(allrels, [0] * len(allrels)))
    for rel in bert_rel_train:
        allrelwcounts[rel] += 1
    allrelwcounts = sorted(allrelwcounts.items(), key=lambda x: x[1], reverse=True)
    print(allrelwcounts[0])
    tt.msg("{} total unique rels".format(len(allrelwcounts)))
    relD = dict(zip([rel for rel in allrels], range(len(allrels))))
    rels = [relD[xe] for xe in bert_rel_all]
    rels = np.array(rels).astype("int32")
    relcounts = [rel[1] for rel in allrelwcounts]
    relcounts = np.array(relcounts).astype("int32")
    tt.tock("done relations")

    np.savez(outp, wordmat=sm.matrix, worddic=sm.D, wordborders=wordborders,
             tokmat=tokmat, iomat=iomat, tokborders=iobordersmat,
             rels=rels, relD=relD, relcounts=relcounts, unbertmat=unbertmat,
             devstart=devstart, teststart=teststart)

    threshold = 2
    stt.msg("{} unique rels more than {} time(s) in train data".format(
        len([xe for xe in allrelwcounts if xe[1] > threshold]), threshold))
    rarerels = set([xe[0] for xe in allrelwcounts if xe[1] <= threshold])
    testrarecount = 0
    for rel in bert_rel_test:
        if rel in rarerels:
            testrarecount += 1
    stt.msg("{}/{} test examples affected by rare rel".format(testrarecount, len(bert_rel_test)))

    tt.tick("reload")
    reloaded = np.load(open(outp + ".npz", "rb"), allow_pickle=True)   # pickled dicts (worddic, relD) need allow_pickle
    _relD = reloaded["relD"].item()
    _tokmat = reloaded["tokmat"]
    print(reloaded["devstart"])
    tt.tock("reloaded")
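# --- Hedged usage sketch (not part of the original code) ---------------------
# Shows how the archive written by load_data() above might be read back and
# split, mirroring the "reload" block at the end of that function. The output
# path is the function's default; allow_pickle is needed to recover the
# pickled dictionaries (worddic, relD). `_example_load_bertified` is a
# hypothetical helper name.
def _example_load_bertified(outp="../../data/buboqa/data/bertified_dataset_v2"):
    import numpy as np
    d = np.load(outp + ".npz", allow_pickle=True)
    tokmat, iomat, rels = d["tokmat"], d["iomat"], d["rels"]
    devstart, teststart = int(d["devstart"]), int(d["teststart"])
    relD = d["relD"].item()
    print(tokmat.shape, iomat.shape, rels.shape)
    print("train/dev/test sizes:", devstart, teststart - devstart, len(tokmat) - teststart)
    print("number of relations:", len(relD))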
def gen_datasets(which="geo"):
    pprefix = "../data/"
    if which == "geo":
        pprefix = pprefix + "geoqueries/dong2016/"
        trainp = pprefix + "train.txt"
        validp = pprefix + "test.txt"
        testp = pprefix + "test.txt"
    elif which == "atis":
        pprefix += "atis/dong2016/"
        trainp = pprefix + "train.txt"
        validp = pprefix + "dev.txt"
        testp = pprefix + "test.txt"
    elif which == "jobs":
        pprefix += "jobqueries/dong2016/"
        trainp = pprefix + "train.txt"
        validp = pprefix + "test.txt"
        testp = pprefix + "test.txt"
    else:
        raise q.SumTingWongException("unknown dataset")

    nlsm = q.StringMatrix(indicate_start_end=True)
    nlsm.tokenize = lambda x: x.split()
    flsm = q.StringMatrix(indicate_start_end=True if which == "jobs" else False)
    flsm.tokenize = lambda x: x.split()

    devstart, teststart, i = 0, 0, 0
    with open(trainp) as tf, open(validp) as vf, open(testp) as xf:
        for line in tf:
            line_nl, line_fl = line.strip().split("\t")
            line_nl = " ".join(line_nl.split(" ")[::-1])   # reverse the NL input sequence
            nlsm.add(line_nl)
            flsm.add(line_fl)
            i += 1
        devstart = i
        for line in vf:
            line_nl, line_fl = line.strip().split("\t")
            line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            i += 1
        teststart = i
        for line in xf:
            line_nl, line_fl = line.strip().split("\t")
            line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            i += 1
    nlsm.finalize()
    flsm.finalize()

    nlmat = torch.tensor(nlsm.matrix).long()
    flmat = torch.tensor(flsm.matrix).long()
    gold = torch.tensor(flsm.matrix[:, 1:]).long()
    gold = torch.cat([gold, torch.zeros_like(gold[:, 0:1])], 1)

    tds = torch.utils.data.TensorDataset(nlmat[:devstart], flmat[:devstart], gold[:devstart])
    vds = torch.utils.data.TensorDataset(nlmat[devstart:teststart], flmat[devstart:teststart],
                                         gold[devstart:teststart])
    xds = torch.utils.data.TensorDataset(nlmat[teststart:], flmat[teststart:], gold[teststart:])
    return (tds, vds, xds), nlsm.D, flsm.D
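# --- Hedged usage sketch (not part of the original code) ---------------------
# Wraps the TensorDatasets returned above in standard PyTorch DataLoaders.
# Batch size and shuffling are illustrative choices, not taken from this repo.
# `_example_geo_loaders` is a hypothetical helper name.
def _example_geo_loaders():
    (tds, vds, xds), nlD, flD = gen_datasets("geo")
    trainloader = torch.utils.data.DataLoader(tds, batch_size=20, shuffle=True)
    validloader = torch.utils.data.DataLoader(vds, batch_size=20, shuffle=False)
    for nl_batch, fl_batch, gold_batch in trainloader:
        print(nl_batch.shape, fl_batch.shape, gold_batch.shape)
        break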
def gen_datasets(which="geo"):
    pprefix = "../data/"
    if which == "geo":
        pprefix = pprefix + "geoqueries/jia2016/"
        trainp = pprefix + "train.txt"
        validp = pprefix + "test.txt"
        testp = pprefix + "test.txt"
    elif which == "atis":
        pprefix += "atis/jia2016/"
        trainp = pprefix + "train.txt"
        validp = pprefix + "dev.txt"
        testp = pprefix + "test.txt"
    elif which == "jobs":
        assert False   # jia didn't do jobs
        pprefix += "jobqueries"
        trainp = pprefix + "train.txt"
        validp = pprefix + "test.txt"
        testp = pprefix + "test.txt"
    else:
        raise q.SumTingWongException("unknown dataset")

    nlsm = q.StringMatrix(indicate_start_end=True)
    nlsm.tokenize = lambda x: x.split()
    flsm = q.StringMatrix(indicate_start_end=True if which == "jobs" else False)
    flsm.tokenize = lambda x: x.split()

    devstart, teststart, i = 0, 0, 0
    trainwords = set()
    trainwordcounts = {}
    testwords = set()
    trainwords_fl = set()
    trainwordcounts_fl = {}
    testwords_fl = set()
    with open(trainp) as tf, open(validp) as vf, open(testp) as xf:
        for line in tf:
            line_nl, line_fl = line.strip().split("\t")
            line_fl = line_fl.replace("' ", "")
            # line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            trainwords |= set(line_nl.split())
            for word in set(line_nl.split()):
                if word not in trainwordcounts:
                    trainwordcounts[word] = 0
                trainwordcounts[word] += 1
            trainwords_fl |= set(line_fl.split())
            for word in set(line_fl.split()):
                if word not in trainwordcounts_fl:
                    trainwordcounts_fl[word] = 0
                trainwordcounts_fl[word] += 1
            i += 1
        devstart = i
        for line in vf:
            line_nl, line_fl = line.strip().split("\t")
            line_fl = line_fl.replace("' ", "")
            # line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            i += 1
        teststart = i
        for line in xf:
            line_nl, line_fl = line.strip().split("\t")
            line_fl = line_fl.replace("' ", "")
            # line_nl = " ".join(line_nl.split(" ")[::-1])
            nlsm.add(line_nl)
            flsm.add(line_fl)
            testwords |= set(line_nl.split())
            testwords_fl |= set(line_fl.split())
            i += 1
    nlsm.finalize()
    flsm.finalize()

    # region get gate sup
    gatesups = torch.zeros(flsm.matrix.shape[0], flsm.matrix.shape[1] + 1, dtype=torch.long)
    for i in range(nlsm.matrix.shape[0]):
        nl_sent = nlsm[i].split()
        fl_sent = flsm[i].split()
        inid = False
        for j, fl_sent_token in enumerate(fl_sent):
            if re.match(r"_\w+id", fl_sent_token):
                inid = True
            elif fl_sent_token == ")":
                inid = False
            elif fl_sent_token == "(":
                pass
            else:
                if inid:
                    if fl_sent_token in nl_sent:
                        gatesups[i, j] = 1
    # endregion

    # region print analysis
    print("{} unique words in train, {} unique words in test, {} in test but not in train"
          .format(len(trainwords), len(testwords), len(testwords - trainwords)))
    print(testwords - trainwords)
    trainwords_once = set([k for k, v in trainwordcounts.items() if v < 2])
    print("{} unique words in train that occur only once ({} of them are in test)"
          .format(len(trainwords_once), len(trainwords_once & testwords)))
    print(trainwords_once)
    trainwords_twice = set([k for k, v in trainwordcounts.items() if v < 3])
    print("{} unique words in train that occur at most twice ({} of them are in test)"
          .format(len(trainwords_twice), len(trainwords_twice & testwords)))
    rarerep = trainwords_once | (testwords - trainwords)
    print("{} unique rare representation words".format(len(rarerep)))
    print(rarerep)
    trainwords_fl_once = set([k for k, v in trainwordcounts_fl.items() if v < 2])
    rarerep_fl = trainwords_fl_once | (testwords_fl - trainwords_fl)
    print("{} unique rare rep words in logical forms".format(len(rarerep_fl)))
    print(rarerep_fl)
    # endregion

    # region create datasets
    nlmat = torch.tensor(nlsm.matrix).long()
    flmat = torch.tensor(flsm.matrix).long()
    gold = torch.tensor(flsm.matrix[:, 1:]).long()
    gold = torch.cat([gold, torch.zeros_like(gold[:, 0:1])], 1)
    tds = torch.utils.data.TensorDataset(nlmat[:devstart], flmat[:devstart], gold[:devstart],
                                         gatesups[:devstart][:, 1:])
    vds = torch.utils.data.TensorDataset(nlmat[devstart:teststart], flmat[devstart:teststart],
                                         gold[devstart:teststart])
    xds = torch.utils.data.TensorDataset(nlmat[teststart:], flmat[teststart:], gold[teststart:])
    # endregion
    return (tds, vds, xds), nlsm.D, flsm.D, rarerep, rarerep_fl
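# --- Hedged usage sketch (not part of the original code) ---------------------
# Illustrates consuming the jia2016-style datasets returned above: the train
# split carries an extra gate-supervision tensor aligned with the gold output,
# while dev/test carry only (nl, fl, gold). Batch size is an illustrative
# choice. `_example_jia_loaders` is a hypothetical helper name.
def _example_jia_loaders():
    (tds, vds, xds), nlD, flD, rarerep, rarerep_fl = gen_datasets("geo")
    print("rare NL words:", len(rarerep), "rare LF tokens:", len(rarerep_fl))
    trainloader = torch.utils.data.DataLoader(tds, batch_size=16, shuffle=True)
    for nl_batch, fl_batch, gold_batch, gate_batch in trainloader:
        print(nl_batch.shape, gold_batch.shape, gate_batch.shape)
        break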
def build_relation_matrices(info):
    tt = q.ticktock("relation matrix builder")
    tt.tick("building")
    # build
    ids = []
    names = q.StringMatrix()
    names.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    domains = q.StringMatrix()
    domains.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    ranges = q.StringMatrix()
    ranges.tokenize = lambda x: q.tokenize(x) if x != "<RARE>" else [x]
    urlwords = q.StringMatrix()
    urlwords.tokenize = lambda x: q.tokenize(x, preserve_patterns=['<[A-Z]+>']) if x != "<RARE>" else [x]
    urltokens = q.StringMatrix()
    urltokens.tokenize = lambda x: x
    domainids = q.StringMatrix()
    domainids.tokenize = lambda x: x
    rangeids = q.StringMatrix()
    rangeids.tokenize = lambda x: x
    for key, val in info.items():
        ids.append(key)
        name = list(val["name"])[0] if val["name"] is not None else "<RARE>"
        names.add(name)
        domain = list(val["domainname"])[0] if val["domainname"] is not None else "<RARE>"
        domains.add(domain)
        rangename = list(val["rangename"])[0] if val["rangename"] is not None else "<RARE>"
        ranges.add(rangename)
        rangeid = list(val["range"]) if val["range"] is not None else ["<RARE>"]
        rangeids.add(rangeid)
        domainid = list(val["domain"]) if val["domain"] is not None else ["<RARE>"]
        domainids.add(domainid)
        splits = key[1:].split(".")
        if splits[0] == "user":
            splits = splits[2:]
        while len(splits) < 3:
            splits = ["default"] + splits
        url = ".".join(splits)
        urlword = " <SEP> ".join(splits)
        urlwords.add(urlword)
        urltoken = [".".join(splits[:-2]), splits[-2], splits[-1]]
        urltokens.add(urltoken)
    tt.tock("built")
    tt.tick("finalizing")
    names.finalize()
    domains.finalize()
    ranges.finalize()
    rangeids.finalize()
    domainids.finalize()
    urlwords.finalize()
    urltokens.finalize()
    tt.tock("finalized")
    rdic = dict(zip(ids, range(len(ids))))
    return rdic, names, domains, ranges, domainids, rangeids, urlwords, urltokens
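# --- Hedged usage sketch (not part of the original code) ---------------------
# Feeds a tiny, made-up relation `info` dict into the builder above. Keys are
# relation identifiers whose first character is stripped (key[1:]) before the
# dotted path is split; values hold optional sets for "name", "domainname",
# "rangename", "domain" and "range". The identifier and labels below are
# illustrative only, and qelos (imported as `q`) must be available.
def _example_relation_matrices():   # hypothetical helper, for illustration only
    toy_info = {
        "/film.film.directed_by": {
            "name": {"directed by"},
            "domainname": {"film"},
            "rangename": {"film director"},
            "domain": {"film.film"},
            "range": {"film.director"},
        },
    }
    rdic, names, domains, ranges, domainids, rangeids, urlwords, urltokens = \
        build_relation_matrices(toy_info)
    print(rdic)                      # relation id -> row index
    print(names[rdic["/film.film.directed_by"]])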