def test_auto_mask_within_seq2vec(self):
    """Verify mask propagation through Seq2Vec: once the padded
    (maskid == -1) sequence positions are reached, the encoder state
    must stop changing, i.e. every padded step repeats the previous
    state exactly."""
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    batsize, seqlen, seqblank = 11, 3, 2
    wordlen, wordblank = 3, 2
    numchars, numwords = 20, 100
    encdim, embdim, innerdim = 4, 50, 2
    # word-id channel: real ids, then -1 padding along the sequence axis
    wordids = np.random.randint(0, numwords, (batsize, seqlen, 1))
    wordpad = np.zeros((batsize, seqblank, 1)).astype("int32") - 1
    wordids = np.concatenate([wordids, wordpad], axis=1)
    # char-id channels: pad inside each word, then append fully blank words
    charids = np.random.randint(0, numchars, (batsize, seqlen, wordlen))
    charpad = np.zeros((batsize, seqlen, wordblank)).astype("int32") - 1
    charids = np.concatenate([charids, charpad], axis=2)
    blankwords = np.zeros((batsize, seqblank, wordlen + wordblank)).astype("int32") - 1
    charids = np.concatenate([charids, blankwords], axis=1)
    # column 0 is the word id, the rest are its characters
    data = np.concatenate([wordids, charids], axis=2)
    wordemb = WordEncoderPlusGlove(numchars=numchars, numwords=numwords,
                                   encdim=encdim, embdim=embdim, maskid=-1,
                                   embtrainfrac=0)
    rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim, innerdim, bidir=False)
    enc = Seq2Vec(wordemb, rnn, maskid=-1)
    enc.enc.with_outputs()  # expose per-step states alongside the final vector
    finalpred, pred = enc.predict(data)
    #print pred.shape, finalpred.shape
    #print pred[0], finalpred[0]
    # consecutive states may only be (all-)equal over the padded tail
    for t in range(1, pred.shape[1]):
        self.assertEqual(np.allclose(pred[:, t - 1, :], pred[:, t, :]),
                         t >= seqlen)
def test_auto_mask_within_seq2vec(self):
    """Verify mask propagation through Seq2Vec: once the padded
    (maskid == -1) sequence positions are reached, the encoder state
    must stop changing, i.e. every padded step repeats the previous
    state exactly.

    Fix: the original statement ``enc.enc.with_outputs`` only accessed
    the attribute without calling it — a no-op — so per-step outputs
    were never enabled; the sibling variant of this test calls
    ``with_outputs()``.
    """
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    batsize = 11
    seqlen = 3
    seqblank = 2
    wordlen = 3
    wordblank = 2
    numchars = 20
    numwords = 100
    encdim = 4
    embdim = 50
    innerdim = 2
    # word-id channel: real ids, then -1 padding along the sequence axis
    worddata = np.random.randint(0, numwords, (batsize, seqlen, 1))
    worddatablank = np.zeros((batsize, seqblank, 1)).astype("int32") - 1
    worddata = np.concatenate([worddata, worddatablank], axis=1)
    # char-id channels: pad inside each word, then append fully blank words
    chardata = np.random.randint(0, numchars, (batsize, seqlen, wordlen))
    charblank = np.zeros((batsize, seqlen, wordblank)).astype("int32") - 1
    chardata = np.concatenate([chardata, charblank], axis=2)
    charblankblank = np.zeros((batsize, seqblank, wordlen + wordblank)).astype("int32") - 1
    chardata = np.concatenate([chardata, charblankblank], axis=1)
    # column 0 is the word id, the rest are its characters
    data = np.concatenate([worddata, chardata], axis=2)
    wordemb = WordEncoderPlusGlove(numchars=numchars, numwords=numwords,
                                   encdim=encdim, embdim=embdim, maskid=-1,
                                   embtrainfrac=0)
    rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim, innerdim, bidir=False)
    enc = Seq2Vec(wordemb, rnn, maskid=-1)
    # BUGFIX: was `enc.enc.with_outputs` (bare attribute, no effect);
    # must be called to expose per-step states alongside the final vector
    enc.enc.with_outputs()
    finalpred, pred = enc.predict(data)
    #print pred.shape, finalpred.shape
    #print pred[0], finalpred[0]
    # consecutive states may only be (all-)equal over the padded tail
    i = 1
    while i < pred.shape[1]:
        self.assertEqual(np.allclose(pred[:, i - 1, :], pred[:, i, :]),
                         i >= seqlen)
        i += 1
def run(epochs=10, numbats=100, numsam=10000, lr=0.1, datap="../../../data/simplequestions/datamat.wordchar.pkl", embdim=50, encdim=50, innerdim=200, wreg=0.00005, bidir=False, keepmincount=5, sameenc=False, memaddr="dot", memattdim=100, layers=1, embtrainfrac=0.0, mem=False, membidir=False, memlayers=1, sharedwordenc=False): """ Memory match-based glove-based word-level relation classification """ (traindata, traingold), (validdata, validgold), (testdata, testgold), worddic, chardic, entdic\ = readdata(datap) # get words from relation names, update word dic memdata = getmemdata(entdic, worddic, chardic) # get glove and transform word mats to glove index space d2g, newdic, glove = getdic2glove(worddic, dim=embdim, trainfrac=embtrainfrac) traindata, validdata, testdata, memdata = \ [np.concatenate([np.vectorize(d2g)(x[..., 0]).reshape(x.shape[:2] + (1,)), x[..., 1:]], axis=2) for x in [traindata, validdata, testdata, memdata]] print traindata.shape, testdata.shape #embed() numwords = max(worddic.values()) + 1 # don't use this, use glove numchars = max(chardic.values()) + 1 numrels = max(entdic.values()) + 1 if bidir: encinnerdim = [innerdim / 2] * layers else: encinnerdim = [innerdim] * layers wordemb = WordEncoderPlusGlove(numchars=numchars, encdim=encdim, embdim=embdim, maskid=-1, embtrainfrac=embtrainfrac) rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim, encinnerdim, bidir=bidir) enc = Seq2Vec(wordemb, rnn, maskid=-1) if mem: memembdim = embdim memencdim = encdim if membidir: innerdim = [innerdim / 2] * memlayers else: innerdim = [innerdim] * memlayers if not sharedwordenc: memwordemb = WordEncoderPlusGlove(numchars=numchars, encdim=encdim, embdim=embdim, maskid=-1, embtrainfrac=embtrainfrac) else: memwordemb = wordemb memrnn, memlastdim = SimpleSeq2Vec.makernu(memembdim + memencdim, innerdim, bidir=membidir) memenc = Seq2Vec(memwordemb, memrnn, maskid=-1) if memaddr is None or memaddr == "dot": memaddr = DotMemAddr elif memaddr == "lin": memaddr = 
LinearGateMemAddr dec = MemVec2Idx(memenc, memdata, memdim=innerdim, memaddr=memaddr, memattdim=memattdim) else: dec = SimpleVec2Idx(indim=innerdim, outdim=numrels) m = Seq2Idx(enc, dec) m = m.train([traindata], traingold).adagrad(lr=lr).l2(wreg).grad_total_norm(1.0).cross_entropy()\ .validate_on([validdata], validgold).accuracy().cross_entropy().takebest()\ .train(numbats=numbats, epochs=epochs) pred = m.predict(testdata) print pred.shape evalres = evaluate(np.argmax(pred, axis=1), testgold) print str(evalres) + "%"
def run( epochs=10, numbats=100, numsam=10000, lr=0.1, datap="../../../data/simplequestions/datamat.wordchar.pkl", embdim=50, encdim=50, innerdim=200, wreg=0.00005, bidir=False, keepmincount=5, sameenc=False, memaddr="dot", memattdim=100, layers=1, embtrainfrac=0.0, mem=False, membidir=False, memlayers=1, sharedwordenc=False ): """ Memory match-based glove-based word-level relation classification """ (traindata, traingold), (validdata, validgold), (testdata, testgold), worddic, chardic, entdic\ = readdata(datap) # get words from relation names, update word dic memdata = getmemdata(entdic, worddic, chardic) # get glove and transform word mats to glove index space d2g, newdic, glove = getdic2glove(worddic, dim=embdim, trainfrac=embtrainfrac) traindata, validdata, testdata, memdata = \ [np.concatenate([np.vectorize(d2g)(x[..., 0]).reshape(x.shape[:2] + (1,)), x[..., 1:]], axis=2) for x in [traindata, validdata, testdata, memdata]] print traindata.shape, testdata.shape #embed() numwords = max(worddic.values()) + 1 # don't use this, use glove numchars = max(chardic.values()) + 1 numrels = max(entdic.values()) + 1 if bidir: encinnerdim = [innerdim/2]*layers else: encinnerdim = [innerdim]*layers wordemb = WordEncoderPlusGlove(numchars=numchars, encdim=encdim, embdim=embdim, maskid=-1, embtrainfrac=embtrainfrac) rnn, lastdim = SimpleSeq2Vec.makernu(embdim+encdim, encinnerdim, bidir=bidir) enc = Seq2Vec(wordemb, rnn, maskid=-1) if mem: memembdim = embdim memencdim = encdim if membidir: innerdim = [innerdim/2]*memlayers else: innerdim = [innerdim]*memlayers if not sharedwordenc: memwordemb = WordEncoderPlusGlove(numchars=numchars, encdim=encdim, embdim=embdim, maskid=-1, embtrainfrac=embtrainfrac) else: memwordemb = wordemb memrnn, memlastdim = SimpleSeq2Vec.makernu(memembdim+memencdim, innerdim, bidir=membidir) memenc = Seq2Vec(memwordemb, memrnn, maskid=-1) if memaddr is None or memaddr == "dot": memaddr = DotMemAddr elif memaddr == "lin": memaddr = LinearGateMemAddr dec = 
MemVec2Idx(memenc, memdata, memdim=innerdim, memaddr=memaddr, memattdim=memattdim) else: dec = SimpleVec2Idx(indim=innerdim, outdim=numrels) m = Seq2Idx(enc, dec) m = m.train([traindata], traingold).adagrad(lr=lr).l2(wreg).grad_total_norm(1.0).cross_entropy()\ .validate_on([validdata], validgold).accuracy().cross_entropy().takebest()\ .train(numbats=numbats, epochs=epochs) pred = m.predict(testdata) print pred.shape evalres = evaluate(np.argmax(pred, axis=1), testgold) print str(evalres) + "%"