def load_data(p="../../../datasets/semparse/", which=None, devfrac=0.1, devfracrandom=False): tt = q.ticktock("dataloader") tt.tick("loading data") assert(which is not None) which = {"geo": "geoquery", "atis": "atis", "jobs": "jobs"}[which] trainp = os.path.join(p, which, "train.txt") testp = os.path.join(p, which, "test.txt") devp = os.path.join(p, which, "dev.txt") trainlines = open(trainp).readlines() testlines = open(testp).readlines() if not os.path.exists(devp): tt.msg("no dev file, taking {} from training data".format(devfrac)) splitidx = round(len(trainlines)*devfrac) trainlines = trainlines[:-splitidx] devlines = trainlines[-splitidx:] else: devlines = open(devp).readlines() tt.msg("{} examples in training set".format(len(trainlines))) tt.msg("{} examples in dev set".format(len(devlines))) tt.msg("{} examples in test set".format(len(testlines))) nlsm = q.StringMatrix(freqcutoff=1) nlsm.tokenize = lambda x: x.strip().split() qlsm = q.StringMatrix(indicate_start_end=True, freqcutoff=1) qlsm.tokenize = lambda x: x.strip().split() i = 0 for line in trainlines: nl, ql = line.split("\t") nlsm.add(nl) qlsm.add(ql) i += 1 nlsm.unseen_mode = True qlsm.unseen_mode = True devstart = i for line in devlines: nl, ql = line.split("\t") nlsm.add(nl) qlsm.add(ql) i += 1 teststart = i for line in testlines: nl, ql = line.split("\t") nlsm.add(nl) qlsm.add(ql) nlsm.finalize() qlsm.finalize() tt.tock("data loaded") return nlsm, qlsm, (devstart, teststart)
def run(p1=DATA_PATH + "valid_dialogues.json",
        p2=DATA_PATH + "valid_dialogues.json",   # change the file paths to use train and valid (so the ids are shared)
        maxwords=int(1e9),
        rarefreq=0):
    """ Saves in DATA_PATH, see code for exact paths
    :param p1:        path to train json
    :param p2:        path to valid json
    :param maxwords:  maximum number of words in vocab
    :param rarefreq:  word frequency for rare words
    :return:
    """
    sm = q.StringMatrix(topnwords=maxwords, freqcutoff=rarefreq)
    sm.tokenize = lambda x: x.split()
    out_struct1, sm, us = load_datafile(p1, sm)
    sm.unseen_mode = True
    out_struct2, sm, us2 = load_datafile(p2, sm, uniquestrings=us)
    sm.finalize()       # !!! dictionary is in sm.D, numpy array is in sm.matrix
    assert(us == us2)
    print("done: {} unique strings \n\n".format(len(us)))

    json.dump(out_struct1, open(DATA_PATH + "train_dialogues.struct.json", "w"))
    json.dump(out_struct2, open(DATA_PATH + "valid_dialogues.struct.json", "w"))
    json.dump(sm.D, open(DATA_PATH + "dialogues.strings.dict", "w"))
    np.save(DATA_PATH + "dialogues.strings.mat", sm.matrix)
    print("saved")
    return out_struct1, out_struct2, sm
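# Hedged usage sketch (not part of the original file): reloading the artifacts that
# run() saves. Assumes np.save appended its standard ".npy" suffix to the matrix path.
def _example_reload_dialogue_artifacts():
    train_struct = json.load(open(DATA_PATH + "train_dialogues.struct.json"))
    valid_struct = json.load(open(DATA_PATH + "valid_dialogues.struct.json"))
    worddic = json.load(open(DATA_PATH + "dialogues.strings.dict"))
    stringmat = np.load(DATA_PATH + "dialogues.strings.mat.npy")
    return train_struct, valid_struct, worddic, stringmat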
def run_toy(lr=0.001,
            seqlen=8,
            batsize=10,
            epochs=1000,
            embdim=32,
            innerdim=64,
            z_dim=32,
            noaccumulate=False,
            usebase=False,
            ):
    # generate some toy data
    N = 1000
    data, vocab = gen_toy_data(N, seqlen=seqlen, mode="copymiddlefixed")
    datasm = q.StringMatrix()
    datasm.set_dictionary(vocab)
    datasm.tokenize = lambda x: list(x)
    for data_e in data:
        datasm.add(data_e)
    datasm.finalize()

    real_data = q.dataset(datasm.matrix)
    gen_data_d = q.gan.gauss_dataset(z_dim, len(real_data))
    disc_data = q.datacat([real_data, gen_data_d], 1)
    gen_data = q.gan.gauss_dataset(z_dim)
    disc_data = q.dataload(disc_data, batch_size=batsize, shuffle=True)
    gen_data = q.dataload(gen_data, batch_size=batsize, shuffle=True)

    discriminator = Discriminator(datasm.D, embdim, innerdim)
    generator = Decoder(datasm.D, embdim, z_dim, "<START>", innerdim, maxtime=seqlen)

    SeqGAN = SeqGAN_Base if usebase else SeqGAN_DCL
    disc_model = SeqGAN(discriminator, generator, gan_mode=q.gan.GAN.DISC_TRAIN, accumulate=not noaccumulate)
    gen_model = SeqGAN(discriminator, generator, gan_mode=q.gan.GAN.GEN_TRAIN, accumulate=not noaccumulate)

    disc_optim = torch.optim.Adam(q.params_of(discriminator), lr=lr)
    gen_optim = torch.optim.Adam(q.params_of(generator), lr=lr)

    disc_trainer = q.trainer(disc_model).on(disc_data).optimizer(disc_optim).loss(q.no_losses(2))
    gen_trainer = q.trainer(gen_model).on(gen_data).optimizer(gen_optim).loss(q.no_losses(2))

    gan_trainer = q.gan.GANTrainer(disc_trainer, gen_trainer)
    gan_trainer.run(epochs, disciters=5, geniters=1, burnin=500)

    # print some predictions:
    with torch.no_grad():
        rvocab = {v: k for k, v in vocab.items()}
        q.batch_reset(generator)
        eval_z = torch.randn(50, z_dim)
        eval_y, _ = generator(eval_z)
        for i in range(len(eval_y)):
            prow = "".join([rvocab[mij] for mij in eval_y[i].numpy()])
            print(prow)

    print("done")
def run_cond_toy(lr=0.001,
                 seqlen=8,
                 batsize=10,
                 epochs=1000,
                 embdim=5,
                 innerdim=32,
                 z_dim=5,
                 usebase=False,
                 nrexamples=1000):
    data, vocab = gen_toy_data(nrexamples, seqlen=seqlen, mode="twointerleaveboth")
    datasm = q.StringMatrix()
    datasm.set_dictionary(vocab)
    datasm.tokenize = lambda x: list(x)
    for data_e in data:
        datasm.add(data_e)
    datasm.finalize()

    real_data = q.dataset(datasm.matrix)
    shuffled_datasm_matrix = datasm.matrix + 0
    np.random.shuffle(shuffled_datasm_matrix)
    fake_data = q.dataset(shuffled_datasm_matrix)
    disc_data = q.datacat([real_data, fake_data], 1)
    gen_data = q.dataset(datasm.matrix)

    disc_data = q.dataload(disc_data, batch_size=batsize, shuffle=True)
    gen_data = q.dataload(gen_data, batch_size=batsize, shuffle=True)

    discr = Discriminator(datasm.D, embdim, innerdim)
    decoder = Decoder_Cond(datasm.D, embdim, z_dim, "<START>", innerdim)

    disc_model = SeqGAN_Cond(discr, decoder, gan_mode=q.gan.GAN.DISC_TRAIN)
    gen_model = SeqGAN_Cond(discr, decoder, gan_mode=q.gan.GAN.GEN_TRAIN)

    disc_optim = torch.optim.Adam(q.params_of(discr), lr=lr)
    gen_optim = torch.optim.Adam(q.params_of(decoder), lr=lr)

    disc_trainer = q.trainer(disc_model).on(disc_data).optimizer(disc_optim).loss(q.no_losses(2))
    gen_trainer = q.trainer(gen_model).on(gen_data).optimizer(gen_optim).loss(q.no_losses(2))

    gan_trainer = q.gan.GANTrainer(disc_trainer, gen_trainer)
    gan_trainer.run(epochs, disciters=5, geniters=1, burnin=500)

    with torch.no_grad():
        rvocab = {v: k for k, v in vocab.items()}
        q.batch_reset(decoder)
        eval_z = torch.tensor(datasm.matrix[:50])
        eval_y, _, _, _ = decoder(eval_z)
        for i in range(len(eval_y)):
            prow = "".join([rvocab[mij] for mij in eval_y[i].numpy()])
            print(prow)

    print("done")
def load_data(p="../../datasets/simplequestions/"): tt = q.ticktock("dataloader") tt.tick("loading") questions, subjects, subject_names, relations, spans, (start_valid, start_test) \ = load_questions(p) generate_candidates(p) tt.tock("{} questions loaded".format(len(questions))) tt.tick("generating matrices") qsm = q.StringMatrix(freqcutoff=2) qsm.tokenize = lambda x: x.split() for question in tqdm.tqdm(questions[:start_valid]): qsm.add(question) qsm.unseen_mode = True for question in tqdm.tqdm(questions[start_valid:]): qsm.add(question) tt.msg("finalizing") qsm.finalize() print(qsm[0]) q.embed() tt.tock("matrices generated")
def run_classify(lr=0.001,
                 seqlen=6,
                 numex=500,
                 epochs=25,
                 batsize=10,
                 test=True,
                 cuda=False,
                 gpu=0):
    device = torch.device("cpu")
    if cuda:
        device = torch.device("cuda", gpu)

    # region construct data
    colors = "red blue green magenta cyan orange yellow grey salmon pink purple teal".split()
    D = dict(zip(colors, range(len(colors))))
    inpseqs = []
    targets = []
    for i in range(numex):
        inpseq = list(np.random.choice(colors, seqlen, replace=False))
        target = np.random.choice(range(len(inpseq)), 1)[0]
        target_class = D[inpseq[target]]
        inpseq[target] = "${}$".format(inpseq[target])
        inpseqs.append("".join(inpseq))
        targets.append(target_class)

    sm = q.StringMatrix()
    sm.tokenize = lambda x: list(x)
    for inpseq in inpseqs:
        sm.add(inpseq)
    sm.finalize()
    print(sm[0])
    print(sm.D)
    targets = np.asarray(targets)

    data = q.dataload(sm.matrix[:-100], targets[:-100], batch_size=batsize)
    valid_data = q.dataload(sm.matrix[-100:], targets[-100:], batch_size=batsize)
    # endregion

    # region model
    embdim = 20
    enc2inpdim = 45
    encdim = 20
    outdim = 20
    emb = q.WordEmb(embdim, worddic=sm.D)       # sm dictionary (characters)
    out = q.WordLinout(outdim, worddic=D)       # target dictionary
    # encoders:
    enc1 = q.RNNEncoder(embdim, encdim, bidir=True)
    enc2 = q.RNNCellEncoder(enc2inpdim, outdim // 2, bidir=True)

    # model
    class Model(torch.nn.Module):
        def __init__(self, dim, _emb, _out, _enc1, _enc2, **kw):
            super(Model, self).__init__(**kw)
            self.dim, self.emb, self.out, self.enc1, self.enc2 = dim, _emb, _out, _enc1, _enc2
            self.score = torch.nn.Sequential(
                torch.nn.Linear(dim, 1, bias=False),
                torch.nn.Sigmoid())
            self.emb_expander = ExpandVecs(embdim, enc2inpdim, 2)
            self.enc_expander = ExpandVecs(encdim * 2, enc2inpdim, 2)

        def forward(self, x, with_att=False):
            # embed and encode
            xemb, xmask = self.emb(x)
            xenc = self.enc1(xemb, mask=xmask)
            # compute attention
            xatt = self.score(xenc).squeeze(2) * xmask.float()[:, :xenc.size(1)]
            # encode again
            _xemb = self.emb_expander(xemb[:, :xenc.size(1)])
            _xenc = self.enc_expander(xenc)
            _, xenc2 = self.enc2(_xemb, gate=xatt, mask=xmask[:, :xenc.size(1)], ret_states=True)
            scores = self.out(xenc2.view(xenc.size(0), -1))
            if with_att:
                return scores, xatt
            else:
                return scores

    model = Model(40, emb, out, enc1, enc2)
    # endregion

    # region test
    if test:
        inps = torch.tensor(sm.matrix[0:2])
        outs = model(inps)
    # endregion

    # region train
    optimizer = torch.optim.Adam(q.params_of(model), lr=lr)
    trainer = q.trainer(model).on(data).loss(torch.nn.CrossEntropyLoss(), q.Accuracy())\
        .optimizer(optimizer).hook(q.ClipGradNorm(5.)).device(device)
    validator = q.tester(model).on(valid_data).loss(q.Accuracy()).device(device)
    q.train(trainer, validator).run(epochs=epochs)
    # endregion

    # region check attention     # TODO
    # feed a batch
    inpd = torch.tensor(sm.matrix[400:410])
    outd, att = model(inpd, with_att=True)
    outd = torch.max(outd, 1)[1].cpu().detach().numpy()
    inpd = inpd.cpu().detach().numpy()
    att = att.cpu().detach().numpy()
    rD = {v: k for k, v in sm.D.items()}
    roD = {v: k for k, v in D.items()}
    for i in range(len(att)):
        inpdi = " ".join([rD[x] for x in inpd[i]])
        outdi = roD[outd[i]]
        print("input:      {}\nattention:  {}\nprediction: {}".format(
            inpdi,
            " ".join(["{:.1f}".format(x) for x in att[i]]),
            outdi))
    # endregion
def run_words(lr=0.001,
              seqlen=8,
              batsize=50,
              epochs=1000,
              embdim=64,
              innerdim=128,
              z_dim=64,
              usebase=True,
              noaccumulate=False,
              ):
    # get some words
    N = 1000
    glove = q.PretrainedWordEmb(50, vocabsize=N+2)
    words = list(glove.D.keys())[2:]
    datasm = q.StringMatrix()
    datasm.tokenize = lambda x: list(x)
    for word in words:
        datasm.add(word)
    datasm.finalize()
    datamat = datasm.matrix[:, :seqlen]
    # replace <mask> with <end>
    datamat = datamat + (datamat == datasm.D["<MASK>"]) * (datasm.D["<END>"] - datasm.D["<MASK>"])

    real_data = q.dataset(datamat)
    gen_data_d = q.gan.gauss_dataset(z_dim, len(real_data))
    disc_data = q.datacat([real_data, gen_data_d], 1)
    gen_data = q.gan.gauss_dataset(z_dim)
    disc_data = q.dataload(disc_data, batch_size=batsize, shuffle=True)
    gen_data = q.dataload(gen_data, batch_size=batsize, shuffle=True)

    discriminator = Discriminator(datasm.D, embdim, innerdim)
    generator = Decoder(datasm.D, embdim, z_dim, "<START>", innerdim, maxtime=seqlen)

    SeqGAN = SeqGAN_Base if usebase else SeqGAN_DCL
    disc_model = SeqGAN(discriminator, generator, gan_mode=q.gan.GAN.DISC_TRAIN, accumulate=not noaccumulate)
    gen_model = SeqGAN(discriminator, generator, gan_mode=q.gan.GAN.GEN_TRAIN, accumulate=not noaccumulate)

    disc_optim = torch.optim.Adam(q.params_of(discriminator), lr=lr)
    gen_optim = torch.optim.Adam(q.params_of(generator), lr=lr)

    disc_trainer = q.trainer(disc_model).on(disc_data).optimizer(disc_optim).loss(q.no_losses(2))
    gen_trainer = q.trainer(gen_model).on(gen_data).optimizer(gen_optim).loss(q.no_losses(2))

    gan_trainer = q.gan.GANTrainer(disc_trainer, gen_trainer)
    gan_trainer.run(epochs, disciters=5, geniters=1, burnin=500)

    # print some predictions:
    with torch.no_grad():
        rvocab = {v: k for k, v in datasm.D.items()}
        q.batch_reset(generator)
        eval_z = torch.randn(50, z_dim)
        eval_y, _ = generator(eval_z)
        for i in range(len(eval_y)):
            prow = "".join([rvocab[mij] for mij in eval_y[i].numpy()])
            print(prow)

    print("done")
def load_jsons(datap="../../../datasets/lcquad/newdata.json",
               relp="../../../datasets/lcquad/nrelations.json",
               mode="flat"):
    tt = q.ticktock("data loader")
    tt.tick("loading jsons")
    data = json.load(open(datap))
    rels = json.load(open(relp))
    tt.tock("jsons loaded")

    tt.tick("extracting data")
    questions = []
    goldchains = []
    badchains = []
    for dataitem in data:
        questions.append(dataitem["parsed-data"]["corrected_question"])
        goldchain = []
        for x in dataitem["parsed-data"]["path_id"]:
            goldchain += [x[0], int(x[1:])]
        goldchains.append(goldchain)
        badchainses = []
        goldfound = False
        for badchain in dataitem["uri"]["hop-1-properties"] + dataitem["uri"]["hop-2-properties"]:
            if goldchain == badchain:
                goldfound = True
            else:
                if len(badchain) == 2:
                    badchain += [-1, -1]
                badchainses.append(badchain)
        badchains.append(badchainses)
    tt.tock("extracted data")
    tt.msg("mode: {}".format(mode))

    if mode == "flat":
        tt.tick("flattening")

        def flatten_chain(chainspec):
            flatchainspec = []
            for x in chainspec:
                if x in ("+", "-"):
                    flatchainspec.append(x)
                elif x > -1:
                    relwords = rels[str(x)]
                    flatchainspec += relwords
                elif x == -1:
                    pass
                else:
                    raise q.SumTingWongException("unexpected symbol in chain")
            return " ".join(flatchainspec)

        goldchainids = []
        badchainsids = []
        uniquechainids = {}

        qsm = q.StringMatrix()
        csm = q.StringMatrix()
        csm.tokenize = lambda x: x.lower().strip().split()

        def get_ensure_chainid(flatchain):
            if flatchain not in uniquechainids:
                uniquechainids[flatchain] = len(uniquechainids)
                csm.add(flatchain)
                assert(len(csm) == len(uniquechainids))
            return uniquechainids[flatchain]

        eid = 0
        numchains = 0
        for question, goldchain, badchainses in zip(questions, goldchains, badchains):
            qsm.add(question)
            # flatten gold chain
            flatgoldchain = flatten_chain(goldchain)
            chainid = get_ensure_chainid(flatgoldchain)
            goldchainids.append(chainid)
            badchainsids.append([])
            numchains += 1
            for badchain in badchainses:
                flatbadchain = flatten_chain(badchain)
                chainid = get_ensure_chainid(flatbadchain)
                badchainsids[eid].append(chainid)
                numchains += 1
            eid += 1
            tt.live("{}".format(eid))
        assert(len(badchainsids) == len(questions))
        tt.stoplive()
        tt.msg("{} unique chains from {} total".format(len(csm), numchains))
        qsm.finalize()
        csm.finalize()
        tt.tock("flattened")
        csm.tokenize = None
        return qsm, csm, goldchainids, badchainsids
    else:
        raise q.SumTingWongException("unsupported mode: {}".format(mode))
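# Hedged usage sketch (not part of the original file): pairing the outputs of
# load_jsons() in "flat" mode into (question, gold chain, negative chains) triples
# for ranking, assuming q.StringMatrix exposes the finalized array as .matrix.
def _example_lcquad_ranking_pairs():
    qsm, csm, goldchainids, badchainsids = load_jsons()
    for i in range(len(goldchainids)):
        question = qsm.matrix[i]
        goldchain = csm.matrix[goldchainids[i]]
        negativechains = [csm.matrix[j] for j in badchainsids[i]]
        yield question, goldchain, negativechains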