def readdata_word(trainp, testp, maxlen=100, masksym=-1):
    tt = ticktock("data reader")

    def readdataset(p, wdic, maxlen=100):
        dataret = []
        goldret = []
        toolong = 0
        realmaxlen = 0
        with open(p) as f:
            data = csv.reader(f, delimiter=",")
            for row in data:
                rowelems = tokenize(row[2])
                realmaxlen = max(realmaxlen, len(rowelems))
                if len(rowelems) > maxlen:
                    toolong += 1
                for rowelem in set(rowelems):
                    if rowelem not in wdic:
                        wdic[rowelem] = len(wdic)
                dataret.append([wdic[x] for x in rowelems])
                goldret.append(row[0])
        print "{} comments were too long".format(toolong)
        maxlen = min(maxlen, realmaxlen)
        datamat = np.ones((len(dataret) - 1, maxlen)).astype("int32") * masksym
        for i in range(1, len(dataret)):    # first row is the CSV header, skip it
            datamat[i - 1, :min(len(dataret[i]), maxlen)] = dataret[i][:min(len(dataret[i]), maxlen)]
        return datamat, np.asarray(goldret[1:], dtype="int32"), wdic

    tt.tick("reading data")
    traindata, traingold, wdic = readdataset(trainp, {}, maxlen=maxlen)
    testdata, testgold, wdic = readdataset(testp, wdic=wdic, maxlen=maxlen)
    tt.tock("data read")
    return (traindata, traingold), (testdata, testgold), wdic
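# Minimal usage sketch for readdata_word (hypothetical file names; assumes CSVs with a
# header row, the label in column 0 and the comment text in column 2, as read above):
#
#   (traindata, traingold), (testdata, testgold), wdic = \
#       readdata_word("train.csv", "test.csv", maxlen=100, masksym=-1)
#   print traindata.shape, len(wdic)    # (numexamples, maxlen), vocabulary size
#
# The same word dictionary is threaded through both calls, so test-set tokens extend
# the training vocabulary instead of clashing with it.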
def buildsamplespace(entmat, wd, maskid=-1):
    tt = ticktock("samplespace")
    tt.tick("making sample space")
    #rwd = {v: k for k, v in wd.items()}
    entmatm = sparse.dok_matrix((entmat.shape[0], np.max(entmat) + 1))
    posblacklist = {0: {wd["base"], wd["user"]}}
    blacklist = set([wd[x] for x in "default domain of by the in at s this for with type".split()])
    #revin = {k: set() for k in np.unique(entmat)}
    #revinm = sparse.dok_matrix((np.max(entmat), entmat.shape[0]))
    samdic = {k: set() for k in range(entmat.shape[0])}     # from ent ids to sets of ent ids
    #samdic = np.zeros((entmat.shape[0], entmat.shape[0]))
    for i in range(entmat.shape[0]):
        for j in range(entmat.shape[1]):
            w = entmat[i, j]
            if w == maskid:     # beginning of padding (was hardcoded -1; use the maskid param)
                break
            if j in posblacklist:
                if w in posblacklist[j]:
                    continue
            if w in blacklist:
                continue
            entmatm[i, w] = 1
            #for oe in revin[w]:    # other entities already in revind
            #    samdic[oe].add(i)
            #    samdic[i].add(oe)
            #revin[w].add(i)
            #revinm[w, i] = 1
    samdicm = entmatm.dot(entmatm.T)
    for i in range(samdicm.shape[0]):
        samdic[i] = list(np.argwhere(samdicm[i, :])[:, 1])
    tt.tock("made sample space")
    return samdic, entmatm.T
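# Usage sketch for buildsamplespace (hypothetical toy inputs): entmat holds one entity
# label per row as word ids, padded with maskid; wd must contain the blacklisted tokens
# used above. Entities that share a non-blacklisted label word end up in each other's
# sample sets:
#
#   wd = {w: i for i, w in enumerate(("base user default domain of by the in at s this "
#                                     "for with type obama barack michelle").split())}
#   entmat = np.array([[wd["barack"], wd["obama"], -1],
#                      [wd["michelle"], wd["obama"], -1]], dtype="int32")
#   samdic, entmatm = buildsamplespace(entmat, wd)
#   # samdic[0] contains 1 (and vice versa) because both labels contain "obama"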
def __init__(self, model, canenc, scorer, agg, relstarts=0, *buildargs, **kw):
    super(CustomRankSearch, self).__init__(**kw)
    self.model = model
    self.scorer = scorer
    self.canenc = canenc
    self.agg = agg
    self.tt = ticktock("RankSearch")
    self.ott = ticktock("RankSearch")
    self.relstarts = relstarts
def load(p):
    tt = ticktock("SubjectSearch")
    tt.tick("loading")
    d = {}
    l = []
    k = None
    with open(p) as f:
        for line in f:
            if line[:2] == "::":
                if k is None:
                    assert (l == [])
                else:
                    d[k] = l
                    l = []
                k = line[2:-1]
            else:
                splits = line[:-1].split("\t")
                le = dict(zip("fb_id triplecount type_id type_name".split(),
                              [splits[0], int(splits[1])] + splits[2:]))
                l.append(le)
        d[k] = l
    tt.tock("loaded")
    ret = SubjectSearch(subjinfop=d, revind=SubjectSearch.buildrevindex(d))
    return ret
def build(self, p):
    i = 0
    tt = ticktock("builder")
    tt.tick("building")
    for line in open(p):
        sline = line[:-1].split("\t")
        fb_id = sline[0]
        triplecount = int(sline[1]) + int(sline[2])
        name = self.processor.processline(sline[3])
        type_id = sline[4]
        type_id = type_id if type_id != "<UNK>" else None
        type_name = " ".join(tokenize(sline[5]))
        type_name = type_name if type_name != " ".join(tokenize("<UNK>")) else None
        if name not in self.indexdict:
            self.indexdict[name] = []
        self.indexdict[name].append({"fb_id": fb_id,
                                     "triplecount": triplecount,
                                     "type_id": type_id,
                                     "type_name": type_name})
        i += 1
        if i % 1000 == 0:
            tt.live("{}k".format(i // 1000))
    tt.tock("built")
def readdata_char(trainp, testp, maxlen=1000, masksym=-1):
    tt = ticktock("data reader")

    def readdataset(p):
        dataret = []
        goldret = []
        toolong = 0
        with open(p) as f:
            data = csv.reader(f, delimiter=",")
            for row in data:
                if len(row[2]) > maxlen:
                    toolong += 1
                dataret.append([ord(x) for x in row[2]])
                goldret.append(row[0])
        print "{} comments were too long".format(toolong)
        datamat = np.ones((len(dataret) - 1, maxlen)).astype("int32") * masksym
        for i in range(1, len(dataret)):    # first row is the CSV header, skip it
            datamat[i - 1, :min(len(dataret[i]), maxlen)] = dataret[i][:min(len(dataret[i]), maxlen)]
        return datamat, np.asarray(goldret[1:], dtype="int32")

    tt.tick("reading data")
    traindata, traingold = readdataset(trainp)
    testdata, testgold = readdataset(testp)
    allchars = set(list(np.unique(traindata))).union(set(list(np.unique(testdata))))
    chardic = dict(zip(list(allchars), range(len(allchars))))
    chardic[masksym] = masksym
    traindata = np.vectorize(lambda x: chardic[x])(traindata)
    testdata = np.vectorize(lambda x: chardic[x])(testdata)
    chardic = {chr(k): v for k, v in chardic.items() if k != masksym}
    tt.tock("data read")
    return (traindata, traingold), (testdata, testgold), chardic
def wordmat2charmat(wordmat, worddic=None, rwd=None, maxlen=100, raretoken="<RARE>", maskid=-1):
    assert (worddic is not None or rwd is not None)
    assert (not (worddic is not None and rwd is not None))
    tt = ticktock("wordmat2charmat")
    tt.tick("transforming word mat to char mat")
    toolong = 0
    charmat = maskid * np.ones((wordmat.shape[0], maxlen), dtype="int32")
    if rwd is None:
        rwd = {v: (k if k != raretoken else " ") for k, v in worddic.items()}
    else:
        rwd = dict([(k, (v if v != raretoken else " ")) for k, v in rwd.items()])
    realmaxlen = 0
    for i in range(wordmat.shape[0]):
        s = wordids2string(wordmat[i], rwd, maskid=maskid)
        if len(s) > maxlen:     # count before truncating (the original checked after, so it never fired)
            toolong += 1
        s = s[:min(len(s), maxlen)]
        realmaxlen = max(len(s), realmaxlen)
        charmat[i, :len(s)] = [ord(ch) for ch in s]
        tt.progress(i, wordmat.shape[0], live=True)
    if realmaxlen < maxlen:
        charmat = charmat[:, :realmaxlen]
    if toolong > 0:
        print "{} too long".format(toolong)
    tt.tock("transformed")
    return charmat
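# Usage sketch for wordmat2charmat (hypothetical inputs; assumes wordids2string joins
# words with spaces, as it is used elsewhere in this codebase). Converts a word-id
# matrix into a character-id (ASCII) matrix, truncated at maxlen characters:
#
#   worddic = {"what": 0, "is": 1, "this": 2}
#   wordmat = np.array([[0, 1, 2], [2, 1, -1]], dtype="int32")
#   charmat = wordmat2charmat(wordmat, worddic=worddic, maxlen=20)
#   # charmat[0] starts with [ord("w"), ord("h"), ord("a"), ord("t"), ord(" "), ...]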
def loadsubjinfo(entinfp, entdic, cachep=None):     #"subjinfo.cache.pkl"
    tt = ticktock("subjinfoloader")

    def make():
        tt.tick("making subject info from file")
        subjinf = {}
        c = 0
        for line in open(entinfp):
            subjuri, subjc, objc, subjname, typuri, typname = line[:-1].split("\t")
            subjinf[entdic[subjuri]] = (subjname, typname.lower().split(), typuri, subjc, objc)
            if c % 1000 == 0:
                tt.live(str(c))
            c += 1
        tt.tock("made subject info from file")
        return subjinf

    if cachep is not None:
        if os.path.isfile(cachep):      # load
            tt.tick("loading cached subject info")
            subjinfo = pickle.load(open(cachep))
            tt.tock("loaded cached subject info")
        else:                           # make and dump
            subjinfo = make()
            tt.tick("dumping subject info in cache")
            pickle.dump(subjinfo, open(cachep, "w"))
            tt.tock("dumped subject info in cache")
    else:                               # just make
        subjinfo = make()
    return subjinfo
def loadlexdata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars):
    tt = ticktock("fblexdataloader")
    tt.tick()
    gd, vocnumwords = getglovedict(glovepath, offset=wordoffset)
    tt.tock("loaded %d worddic" % len(gd)).tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()
    indata = FreebaseEntFeedsMaker(fblexpath, gd, ed, numwords=numwords,
                                   numchars=numchars, unkwordid=wordoffset - 1)
    datanuments = max(indata.goldfeed) + 1
    tt.tick()
    print "max entity id+1: %d" % datanuments
    indata.trainfeed[0:9000]    # touch the feed to trigger transformation; result discarded
    tt.tock("transformed")
    #embed()
    traindata = indata.trainfeed
    golddata = indata.goldfeed
    return traindata, golddata, vocnuments, vocnumwords, datanuments
def loaddata(numtestcans=5):
    tt = ticktock("dataloader")
    tt.tick("loading data")
    p = "../../../../data/simplequestions/clean/datamat.word.fb2m.pkl"
    entinfp = "../../../../data/simplequestions/clean/subjs-counts-labels-types.fb2m.tsv"
    x = pickle.load(open(p))
    tt.tock("datamat loaded")
    worddic = x["worddic"]
    entdic = x["entdic"]
    entmat = x["entmat"]
    numents = x["numents"]
    traindata, traingold = x["train"]
    validdata, validgold = x["valid"]
    testdata, testgold = x["test"]
    traingold[:, 1] -= numents
    validgold[:, 1] -= numents
    testgold[:, 1] -= numents
    rwd = {v: k for k, v in worddic.items()}
    subjdic = {k: v for k, v in entdic.items() if v < numents}
    reldic = {k: v - numents for k, v in entdic.items() if v >= numents}
    subjinfo = loadsubjinfo(entinfp, subjdic)
    testsubjcans = loadsubjtestcans(numcans=numtestcans)
    testrelcans, relspersubj = loadreltestcans(testgold, subjdic, reldic)
    return testgold, testsubjcans, relspersubj
def gencans(data, top=50, exact=True, rwd=None, ed=None, host=None, index=None):
    idx = SimpleQuestionsLabelIndex(host=host, index=index)
    # transform data using worddic and search
    sentences = []
    cans = []
    tt = ticktock("candidate generator")
    tt.tick("generating cans")
    for i in range(data.shape[0]):
        sentence = " ".join(map(lambda x: rwd[x],
                                filter(lambda x: x in rwd, data[i, :])))
        sentences.append(sentence)
        searchres = idx.searchsentence(sentence, exact=exact, top=top)
        scans = map(lambda (x, (y, z)): ed[x], searchres.items())
        if i % 10 == 0:
            tt.live("%d of %d" % (i, data.shape[0]))
        cans.append(scans)
    tt.stoplive()
    tt.tock("generated cans")
    return cans
def loadreltestcans(testgold, subjdic, reldic,
                    relsperentp="../../../../data/simplequestions/allrelsperent.dmp"):
    tt = ticktock("test rel can loader")
    testsubjs = testgold[:, 0]
    relsperent = {}     #{k: ([], []) for k in set(list(testsubjs))}
    tt.tick("loading rel test cans")
    for line in open(relsperentp):
        subj, relsout, relsin = line[:-1].split("\t")
        if subj in subjdic:
            relsperent[subjdic[subj]] = (
                [reldic[x] for x in relsout.split(" ")] if relsout != "" else [],
                [reldic[x] for x in relsin.split(" ")] if relsin != "" else [])
        #if subj in subjdic and subjdic[subj] in relsoftestsubjs:
        #    relsoftestsubjs[subjdic[subj]] = (
        #        [reldic[x] for x in relsout.split(" ")] if relsout != "" else [],
        #        [reldic[x] for x in relsin.split(" ")] if relsin != "" else []
        #    )
    tt.tock("test cans loaded")
    relsoftestexamples = [(relsperent[x][0], relsperent[x][1]) for x in testsubjs]
    return relsoftestexamples, relsperent
def __init__(self, model, canenc, scorer, agg, beamsize=1, *buildargs, **kw):
    super(SeqEncDecRankSearch, self).__init__(**kw)
    self.model = model
    self.beamsize = beamsize
    self.mu = SeqEncDecPredictor(model, *buildargs)
    self.scorer = scorer
    self.canenc = canenc
    self.agg = agg
    self.tt = ticktock("RankSearch")
    self.ott = ticktock("RankSearch")
def loadsubjtestcans(p="../../../../data/simplequestions/clean/testcans{}.pkl", numcans=None):
    tt = ticktock("test subjects candidate loader")
    tt.tick("loading candidates")
    p = p.format("{}c".format(numcans)) if numcans is not None else p.format("")
    ret = pickle.load(open(p))
    tt.tock("candidates loaded")
    return ret
def oldpredict(self, data, entcans, relsperent):
    tt = ticktock("predictor")
    tt.tick("computing question encodings")
    qencodings = self.qenc.predict(data)    # (numsam, encdim)
    tt.tock("computed question encodings")
    tt.tick("predicting")
    ret = np.zeros((data.shape[0], 2), dtype="int32")
    if self.mode == "concat":
        mid = qencodings.shape[1] / 2
        qencforent = qencodings[:, :mid]
        qencforrel = qencodings[:, mid:]
    elif self.mode == "seq":
        qencforent = qencodings[:, :]
        qencforrel = qencodings[:, :]
    else:
        raise Exception("unrecognized mode")
    for i in range(qencodings.shape[0]):
        # predict subject
        if len(entcans[i]) == 0:
            bestsubj = -1
        elif len(entcans[i]) == 1:
            bestsubj = entcans[i][0]
        else:
            entembs = self.eenc.predict.transform(self.enttrans)(entcans[i])
            entscoresi = np.tensordot(qencforent[i], entembs, axes=(0, 1))
            scoredentcans = sorted(zip(entcans[i], entscoresi),
                                   key=lambda (x, y): y, reverse=True)
            bestsubj = scoredentcans[0][0]
            if self.debug:
                embed()
        ret[i, 0] = bestsubj
        # predict relation
        relcans = relsperent[ret[i, 0]][0] if ret[i, 0] in relsperent else []
        if len(relcans) == 0:
            bestrel = -1
        elif len(relcans) == 1:
            bestrel = relcans[0]
        else:
            if self.debug:
                embed()
            relembs = self.renc.predict.transform(self.reltrans)(relcans)
            relscoresi = np.tensordot(qencforrel[i], relembs, axes=(0, 1))
            scoredrelcans = sorted(zip(relcans, relscoresi),
                                   key=lambda (x, y): y, reverse=True)
            bestrel = scoredrelcans[0][0]
        ret[i, 1] = bestrel
        if self.debug:
            embed()
        tt.progress(i, qencodings.shape[0], live=True)
    tt.tock("predicted")
    return ret
def run(epochs=10,
        batsize=100,
        lr=0.1,
        embdim=200,
        encdim=300,
        layers=1,
        type="rnn",     # rnn or cnn
        clean=False,
        rarefreq=4,
        p="../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        ):
    # load data for classification
    tt = ticktock("script")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        entdic, entmat, worddic, numents = readdata(p, clean=clean, rarefreq=rarefreq)
    numrels = len(entdic)
    traingold = np_utils.to_categorical(traingold, nb_classes=numrels)
    validgold = np_utils.to_categorical(validgold, nb_classes=numrels)
    testgold = np_utils.to_categorical(testgold, nb_classes=numrels)
    tt.tock("loaded data")
    # model
    tt.tick("building model")
    m = Sequential()
    m.add(Embedding(len(worddic) + 1, embdim, mask_zero=True))
    if type == "rnn":
        print("doing RNN")
        for i in range(layers - 1):
            m.add(GRU(encdim, return_sequences=True))
        m.add(GRU(encdim, return_sequences=False))
    elif type == "cnn":
        print("doing CNN")
        for i in range(layers):
            m.add(Convolution1D(encdim, encdim))
        m.add(GlobalMaxPooling1D())
    m.add(Dense(len(entdic)))
    m.add(Activation("softmax"))
    m.compile(loss="categorical_crossentropy",
              optimizer=Adadelta(lr=lr),
              metrics=["accuracy"])
    tt.tock("built model")
    tt.tick("training")
    m.fit(traindata, traingold, nb_epoch=epochs, batch_size=batsize,
          validation_data=(validdata, validgold))
    tt.tock("trained")
    tt.tick("testing")
    score, acc = m.evaluate(testdata, testgold, batch_size=batsize)
    print("Score: {}\nAccuracy: {}".format(score, acc))
    tt.tock("tested")
def eval(self, data, gold, transform=None, savep=None):
    # data: wordidx^(batsize, seqlen), gold: entidx^(batsize)
    # generate candidates
    if os.path.isfile("testcans.pkl"):
        cans = self.loadcans("testcans.pkl")
    else:
        cans = gencans(data, host=self.host, index=self.index,
                       rwd=self.rwd, ed=self.ed)    # list of lists of entidx
        pickle.dump(cans, open("testcans.pkl", "w"))
    assert len(cans) == data.shape[0] == gold.shape[0]
    # embed()
    predictor = self.scorer.predict.transform(transform)
    tt = ticktock("evaluator")
    tt.tick("evaluating...")
    nocans = 0
    nogoldcan = 0
    tosave = {}
    for i in range(data.shape[0]):
        numcans = len(cans[i])
        if gold[i] not in cans[i]:
            nogoldcan += 1
        predinp = [np.repeat(np.expand_dims(data[i, :], axis=0), numcans, axis=0),
                   np.asarray(cans[i], dtype="int32")]
        #print predinp, "%d/%d" % (i, data.shape[0]), numcans
        if numcans > 0:
            predinpscores = predictor(*predinp)     # (numcans,)
            ranking = sorted(zip(cans[i], list(predinpscores)),
                             key=lambda (x, y): y, reverse=True)
            tosave[i] = (gold[i], ranking)
            for metric in self.metrics:
                metric.accumulate([gold[i]], ranking)
        else:
            nocans += 1
        if i % 100 == 0:
            tt.live("evaluated: %.2f%%" % (i * 100. / data.shape[0]))
    tt.tock("evaluated")
    if savep is not None:
        tt.tick("saving")
        pickle.dump(tosave, open(savep, "w"))
        tt.tock("saved")
    print "no cans for %d questions" % nocans
    print "gold not among cans for %d questions" % nogoldcan
    return self.metrics
def test_ns_training(self):
    num = 2000
    self.expshape = (num, 50)
    Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
    self.glove = Glove(self.expshape[1], self.expshape[0])
    self.cemb = VectorEmbed(indim=self.expshape[0] + 1, dim=self.expshape[1])
    self.assertRaises(Exception, self.glove.block.predict, [num + 1])
    self.assertRaises(Exception, self.cemb.predict, [num + 1])
    m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
    mg = MatchScore(self.glove.block, self.glove.block)     # TODO factor out matchscore tests
    idxs = np.arange(num + 1)
    # glove against glove
    self.assertTrue(np.allclose(
        mg.predict([num, 100], [num, 100]),
        [np.linalg.norm(self.glove % num) ** 2,
         np.linalg.norm(self.glove % 100) ** 2]))

    class NegIdxGen():
        def __init__(self, num):
            self.n = num

        def __call__(self, l, r):
            return l, np.random.randint(0, self.n, r.shape)

    vdata = np.arange(num)
    negrate = 5

    def obj(p, n):
        return n - p

    m, err, verr, _, _ = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num + 1)).negrate(negrate)\
        .adagrad(lr=0.1).objective(obj)\
        .validate_on([vdata, vdata]).extvalid(geteval(m.predict, num, negrate)).validinter(30)\
        .train(numbats=50, epochs=29, returnerrors=True)
        #.writeresultstofile("testingresultswriter.tsv")
    tdata = np.arange(num)
    tt = ticktock("eval")
    tt.tick()
    mrr, recat1, recat10 = geteval(m.predict, num, 1)(tdata)
    tt.tock("evaluated test data")
    print "%.4f MRR,\t%.4f MR@10,\t%.4f MR@1" % (mrr, recat10, recat1)
    self.assertGreater(mrr, 0.85)
    self.assertGreater(recat10, 0.9)
    print verr
    self.assertTrue(np.allclose(np.asarray([mrr, recat1, recat10]),
                                np.asarray(verr[-1][1:])))
def searchwordmat(self, wordmat, wd, top=5):
    cans = []
    rwd = {v: k for k, v in wd.items()}
    tt = ticktock("wordmatsearcher")
    tt.tick("started searching")
    for i in range(wordmat.shape[0]):
        sentence = wordids2string(wordmat[i], rwd=rwd)
        #sentence.replace(" '", "")
        res = self.searchsentence(sentence, top=top)
        cans.append([r["fb_id"] for r in res])
        tt.progress(i, wordmat.shape[0], live=True)
    tt.tock("done searching")
    return cans
def randgen(entcans, relsperent):
    tt = ticktock("randgen")
    tt.tick("generating")
    mat = np.zeros((len(entcans), 2), dtype="int32")
    for i in range(mat.shape[0]):
        cans = entcans[i]
        # only those appearing as subject
        mat[i, 0] = random.sample(cans, 1)[0] if len(cans) > 0 else -1
        # only outgoing relations of predicted subject
        cans = relsperent[mat[i, 0]][0] if mat[i, 0] >= 0 else []
        mat[i, 1] = random.sample(cans, 1)[0] if len(cans) > 0 else -1
    tt.tock("generated")
    return mat
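# Toy demonstration of randgen (hypothetical candidate sets; assumes numpy and random
# are imported as elsewhere). Entity candidates come per question; relation candidates
# are looked up for whichever subject was sampled:
#
#   entcans = [[3, 7], [], [5]]
#   relsperent = {3: ([10, 11], []), 7: ([12], []), 5: ([], [])}
#   baseline = randgen(entcans, relsperent)
#   # row 0: subject 3 or 7 with one of its outgoing relations,
#   # row 1: (-1, -1) since there were no candidates,
#   # row 2: (5, -1) since entity 5 has no outgoing relations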
def loaddata(worddic, fbentdicp, fblexpath, wordoffset, numwords): tt = ticktock("fblexdataloader") ; tt.tick() ed, vocnuments = getentdict(fbentdicp, offset=0) tt.tock("loaded %d entdic" % len(ed)).tick() indata = FBSeqFeedsMaker(fblexpath, ed, worddic, numwords=numwords) datanuments = np.max(indata.goldfeed)+1 tt.tick() indata.trainfeed[0:9000] tt.tock("transformed") #embed() traindata = indata.trainfeed golddata = indata.goldfeed + 1 # no entity = id 0 return traindata, golddata, vocnuments, len(worddic)+1, datanuments+1, ed
def readdata(p="../../../../data/simplequestions/clean/datamat.word.fb2m.pkl",
             relsperentp="../../../../data/simplequestions/allrelsperent.dmp",
             wordchar=False):
    tt = ticktock("dataloader")
    tt.tick("loading datamat")
    x = pickle.load(open(p))
    tt.tock("datamat loaded")
    worddic = x["worddic"]
    entdic = x["entdic"]
    numents = x["numents"]
    entmat = x["entmat"]
    traindata, traingold = x["train"]
    validdata, validgold = x["valid"]
    testdata, testgold = x["test"]
    testsubjs = testgold[:, 0]
    testsubjsrels = {k: ([], []) for k in set(list(testsubjs))}

    if wordchar:
        traindata = wordmat2wordchartensor(traindata, worddic=worddic)
        validdata = wordmat2wordchartensor(validdata, worddic=worddic)
        testdata = wordmat2wordchartensor(testdata, worddic=worddic)

    tt.tick("loading test cans")
    for line in open(relsperentp):
        subj, relsout, relsin = line[:-1].split("\t")
        if subj in entdic and entdic[subj] in testsubjsrels:
            testsubjsrels[entdic[subj]] = (
                [entdic[x] for x in relsout.split(" ")] if relsout != "" else [],
                [entdic[x] for x in relsin.split(" ")] if relsin != "" else []
            )
    tt.tock("test cans loaded")

    # select and offset mats
    traingold = traingold[:, 1] - numents
    validgold = validgold[:, 1] - numents
    testgold = testgold[:, 1] - numents
    entmat = entmat[numents:, :]

    # select and offset entdic
    entdic = {k: v - numents for k, v in entdic.items() if v >= numents}

    # make testrelcans with new idx space
    testrelcans = [([y - numents for y in testsubjsrels[x][0]],
                    [y - numents for y in testsubjsrels[x][1]])
                   for x in testsubjs]

    return (traindata, traingold), (validdata, validgold), (testdata, testgold),\
        worddic, entdic, entmat, testrelcans
def run(epochs=100,
        lr=0.5,
        wreg=0.0001,
        numbats=100,
        fblexpath="../../data/freebase/labelsrevlex.map.sample",
        glovepath="../../data/glove/glove.6B.50d.txt",
        fbentdicp="../../data/freebase/entdic.all.map",
        numwords=10,
        numchars=30,
        wordembdim=50,
        wordencdim=100,
        innerdim=300,
        wordoffset=1,
        validinter=3,
        gradnorm=1.0,
        validsplit=100,
        ):
    tt = ticktock("fblextransrun")
    traindata, golddata, vocnuments, vocnumwords, datanuments = \
        loadlexdata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars)
    tt.tock("made data").tick()

    # define model
    m = FBBasicCompositeEncoder(
        wordembdim=wordembdim,
        wordencdim=wordencdim,
        innerdim=innerdim,
        outdim=datanuments,
        numchars=128,   # ASCII
        numwords=vocnumwords,
    )
    #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim)

    # train model      TODO
    tt.tick("training")
    m.train([traindata], golddata).adagrad(lr=lr).grad_total_norm(gradnorm).cross_entropy()\
        .autovalidate(splits=validsplit, random=True).validinter(validinter).accuracy()\
        .train(numbats, epochs)
    #embed()
    tt.tock("trained").tick("predicting")
    print m.predict(traindata).shape
    tt.tock("predicted sample")
def __init__(self, questionencoder=None, entityencoder=None, relationencoder=None,
             enttrans=None, reltrans=None, debug=False, subjinfo=None):
    self.qenc = questionencoder
    self.eenc = entityencoder
    self.renc = relationencoder
    #self.mode = mode
    self.enttrans = enttrans
    self.reltrans = reltrans
    self.debug = debug
    self.subjinfo = subjinfo
    self.qencodings = None
    self.tt = ticktock("predictor")
def loaddata(worddic, fbentdicp, fblexpath, wordoffset, numwords): tt = ticktock("fblexdataloader") tt.tick() ed, vocnuments = getentdict(fbentdicp, offset=0) tt.tock("loaded %d entdic" % len(ed)).tick() indata = FBSeqFeedsMaker(fblexpath, ed, worddic, numwords=numwords) datanuments = np.max(indata.goldfeed) + 1 tt.tick() indata.trainfeed[0:9000] tt.tock("transformed") #embed() traindata = indata.trainfeed golddata = indata.goldfeed + 1 # no entity = id 0 return traindata, golddata, vocnuments, len( worddic) + 1, datanuments + 1, ed
def loaddata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars): tt = ticktock("fblexdataloader") ; tt.tick() gd, vocnumwords = getglovedict(glovepath, offset=wordoffset) tt.tock("loaded %d worddic" % len(gd)).tick() ed, vocnuments = getentdict(fbentdicp, offset=0) tt.tock("loaded %d entdic" % len(ed)).tick() indata = FreebaseSeqFeedMakerEntidxs(fblexpath, gd, ed, numwords=numwords, numchars=numchars, unkwordid=wordoffset - 1) datanuments = np.max(indata.goldfeed)+1 tt.tick() indata.trainfeed[0:9000] tt.tock("transformed") #embed() traindata = indata.trainfeed golddata = indata.goldfeed + 1 # no entity = id 0 return traindata, golddata, vocnuments, vocnumwords, datanuments+1, ed, gd
def loadlexdata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars): tt = ticktock("fblexdataloader") ; tt.tick() gd, vocnumwords = getglovedict(glovepath, offset=wordoffset) tt.tock("loaded %d worddic" % len(gd)).tick() ed, vocnuments = getentdict(fbentdicp, offset=0) tt.tock("loaded %d entdic" % len(ed)).tick() indata = FreebaseEntFeedsMaker(fblexpath, gd, ed, numwords=numwords, numchars=numchars, unkwordid=wordoffset - 1) datanuments = max(indata.goldfeed)+1 tt.tick() print "max entity id+1: %d" % datanuments indata.trainfeed[0:9000] tt.tock("transformed") #embed() traindata = indata.trainfeed golddata = indata.goldfeed return traindata, golddata, vocnuments, vocnumwords, datanuments
def eval(self, data, gold, transform=None, savep=None):
    # data: wordidx^(batsize, seqlen), gold: entidx^(batsize)
    # generate candidates
    if os.path.isfile("testcans.pkl"):
        cans = self.loadcans("testcans.pkl")
    else:
        cans = gencans(data, host=self.host, index=self.index,
                       rwd=self.rwd, ed=self.ed)    # list of lists of entidx
        pickle.dump(cans, open("testcans.pkl", "w"))
    assert len(cans) == data.shape[0] == gold.shape[0]
    # embed()
    predictor = self.scorer.predict.transform(transform)
    tt = ticktock("evaluator")
    tt.tick("evaluating...")
    nocans = 0
    nogoldcan = 0
    tosave = {}
    for i in range(data.shape[0]):
        numcans = len(cans[i])
        if gold[i] not in cans[i]:
            nogoldcan += 1
        predinp = [np.repeat(np.expand_dims(data[i, :], axis=0), numcans, axis=0),
                   np.asarray(cans[i], dtype="int32")]
        #print predinp, "%d/%d" % (i, data.shape[0]), numcans
        if numcans > 0:
            predinpscores = predictor(*predinp)     # (numcans,)
            ranking = sorted(zip(cans[i], list(predinpscores)),
                             key=lambda (x, y): y, reverse=True)
            tosave[i] = (gold[i], ranking)
            for metric in self.metrics:
                metric.accumulate([gold[i]], ranking)
        else:
            nocans += 1
        if i % 100 == 0:
            tt.live("evaluated: %.2f%%" % (i * 100. / data.shape[0]))
    tt.tock("evaluated")
    if savep is not None:
        tt.tick("saving")
        pickle.dump(tosave, open(savep, "w"))
        tt.tock("saved")
    print "no cans for %d questions" % nocans
    print "gold not among cans for %d questions" % nogoldcan
    return self.metrics
def load(self, entdic): self.trainingdata = [] self.golddata = [] tt = ticktock(self.__class__.__name__) tt.tick("loading kgraph") with open(self.path) as f: c = 0 for line in f: ns = line[:-1].split("\t") if len(ns) is not 2: print line, c continue sf, fb = ns self.trainingdata.append(self._process_sf(sf, self.numwords, self.numchars)) entids = self._process_ent(fb, entdic) self.golddata.append(entids) if c % 1e6 == 0: tt.tock("%.0fM" % (c/1e6)).tick() c += 1 self.golddata = np.asarray(self.golddata, dtype="int32") self.trainingdata = np.array(self.trainingdata)
def load(self, entdic): self.trainingdata = [] self.golddata = [] tt = ticktock(self.__class__.__name__) tt.tick("loading kgraph") with open(self.path) as f: c = 0 for line in f: ns = line[:-1].split("\t") if len(ns) is not 2: print line, c continue sf, fb = ns self.trainingdata.append(self._process_sf(sf, self.numwords)) entids = self._process_ent(fb, entdic) self.golddata.append(entids) if c % 1e6 == 0: tt.tock("%.0fM" % (c / 1e6)).tick() c += 1 self.golddata = np.asarray(self.golddata, dtype="int32") self.trainingdata = np.array(self.trainingdata)
def run(epochs=10,
        numbats=700,
        lr=0.1,
        embdim=200,
        encdim=300,
        layers=1,
        clean=False,
        rarefreq=4,
        glove=0,
        type="rnn",     # rnn or cnn
        p="../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        ):
    # load data for classification
    tt = ticktock("script")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        entdic, entmat, worddic, numents = readdata(p, clean=clean, rarefreq=rarefreq)
    tt.tock("loaded data")
    # model
    tt.tick("building model")
    m = SimpleSeq2Idx(indim=len(worddic) + 1, inpembdim=embdim,
                      numclasses=len(entdic), innerdim=encdim,
                      maskid=0, layers=layers)
    tt.tock("built model")
    tt.tick("training")
    m.train([traindata], traingold).adadelta(lr=lr).cross_entropy().grad_total_norm(1.)\
        .validate_on([validdata], validgold).cross_entropy().accuracy().takebest()\
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")
    tt.tick("testing")
    preds = m.predict(testdata)
    preds = np.argmax(preds, axis=1)
    acc = preds == testgold
    acc = np.sum(acc) * 1.0 / testdata.shape[0]
    print("Accuracy: {}".format(acc))
    tt.tock("tested")
def loaddata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars): tt = ticktock("fblexdataloader") tt.tick() gd, vocnumwords = getglovedict(glovepath, offset=wordoffset) tt.tock("loaded %d worddic" % len(gd)).tick() ed, vocnuments = getentdict(fbentdicp, offset=0) tt.tock("loaded %d entdic" % len(ed)).tick() indata = FreebaseSeqFeedMakerEntidxs(fblexpath, gd, ed, numwords=numwords, numchars=numchars, unkwordid=wordoffset - 1) datanuments = np.max(indata.goldfeed) + 1 tt.tick() indata.trainfeed[0:9000] tt.tock("transformed") #embed() traindata = indata.trainfeed golddata = indata.goldfeed + 1 # no entity = id 0 return traindata, golddata, vocnuments, vocnumwords, datanuments + 1, ed, gd
def run(epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        embdim=100,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        mem=False,
        membidir=False,
        memlayers=1,
        layers=1,
        testfirst=False,
        rankingloss=False,
        rlmargin=1.,
        charlevel=False,
        pool=False,
        resultsave=False,
        resultsavep="subjdetns.res.pkl",
        ):
    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat \
        = readdata(datap, charlevel)
    print entmat.shape
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape
    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [innerdim / 2] * layers
    else:
        encinnerdim = [innerdim] * layers

    # question representation:
    #   encodes question sequence to vector
    #   let's try to embed chars too <--
    embdim = None if charlevel else embdim
    qenc = SimpleSeq2Vec(indim=numwords,
                         inpembdim=embdim,
                         innerdim=encinnerdim,
                         maskid=-1,
                         bidir=bidir,
                         pool=pool)

    # entity representation:
    if mem:
        # encodes label to vector
        if membidir:
            innerdim = [innerdim / 2] * memlayers
        else:
            innerdim = [innerdim] * memlayers
        memembdim = embdim
        #embed chars too <--
        meminpemb = None if charlevel else qenc.inpemb      # share embeddings
        #memembdim = None if charlevel else memembdim
        meminpemb = qenc.inpemb     # also chars are embedded and embeddings are always shared
        lenc = SimpleSeq2Vec(indim=numwords,
                             inpembdim=memembdim,
                             inpemb=meminpemb,
                             innerdim=innerdim,
                             maskid=-1,
                             bidir=membidir)
    else:
        # embeds entity id to vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)     # batched dot

    # trainer config preparation
    class PreProcf(object):
        def __init__(self, entmat):
            self.em = Val(entmat)   # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):    # gold: idx^(batsize, )
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):    # gold: idx^(batsize,)
            return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32")

    if testfirst:
        eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                            metrics=[ClassAccuracy(), RecallAt(5)])
        evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat))
        for e in evalres:
            print e
        tt.msg("tested dummy")
        sys.exit()
    #embed()
    # trainer config and training
    obj = lambda p, n: n - p
    if rankingloss:
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.infty)
    nscorer = scorer.nstrain([traindata, traingold]).transform(PreProcf(entmat))\
        .negsamplegen(NegIdxGen(numents)).negrate(negrate).objective(obj)\
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)

    # evaluation
    eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                        metrics=[ClassAccuracy(), RecallAt(1), RecallAt(2),
                                 RecallAt(5), RecallAt(10)])
    evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat),
                        savep=None if not resultsave else resultsavep)
    for evalre in evalres:
        print evalre
def run(epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        embdim=50,
        encdim=50,
        wreg=0.00005,
        marginloss=False,
        margin=1.0,
        cosine=False,
        bidir=False,
        ):
    tt = ticktock("script")
    # get glove words
    g = Glove(encdim)
    words = g.D.keys()
    maxwordlen = 0
    for word in words:
        maxwordlen = max(maxwordlen, len(word))
    chars = set("".join(words))
    chars.add(" ")
    print "{} words, maxlen {}, {} characters in words".format(len(words), maxwordlen, len(chars))
    # get char word matrix
    chardic = dict(zip(chars, range(len(chars))))
    pickle.dump(chardic, open("glove2c2w.chardic.pkl", "w"))
    charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
    charwordmat[0, 0] = chardic[" "]
    for i in range(0, len(words)):
        word = words[i]
        charwordmat[i + 1, :len(word)] = [chardic[x] for x in word]
    print charwordmat[0]
    # encode characters
    cwenc = SimpleSeq2Vec(indim=len(chars),
                          inpembdim=embdim,
                          innerdim=encdim / 2 if bidir else encdim,
                          maskid=-1,
                          bidir=bidir)
    dist = CosineDistance() if cosine else EuclideanDistance()  # DotDistance()
    print "using " + str(dist)
    scorer = MatchScore(cwenc, g.block, scorer=dist)

    """
    scorer.train([charwordmat, np.arange(len(words)+1)],
                 np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\
        .linear_objective().adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)

    #embed()
    """

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32")

    if marginloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p

    nscorer = (
        scorer.nstrain([charwordmat, np.arange(len(words) + 1)])
        .negsamplegen(NegIdxGen(len(words)))
        .negrate(negrate)
        .objective(obj)
        .adagrad(lr=lr)
        .l2(wreg)
        .train(numbats=numbats, epochs=epochs)
    )

    cwenc.save("glove2c2w.block")
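# Small numeric check of the two objectives used above (plain numpy, no model needed).
# With marginloss, a negative that already scores `margin` below the positive
# contributes zero loss; without it, the loss is simply n - p:
#
#   p = np.array([2.0, 2.0])                # positive-pair scores
#   n = np.array([1.5, -0.5])               # negative-pair scores
#   print (n - p + 1.0).clip(0, np.infty)   # margin objective -> [0.5, 0.]
#   print n - p                             # plain objective  -> [-0.5, -2.5]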
def run( epochs=100, lr=0.01, wreg=0.0001, numbats=10, fbdatapath="../../data/mfqa/mfqa.tsv.sample.small", fblexpath="../../data/mfqa/mfqa.labels.idx.map", glovepath="../../data/glove/glove.6B.50d.txt", fbentdicp="../../data/mfqa/mfqa.dic.map", numwords=20, numchars=30, wordembdim=50, wordencdim=100, entembdim=100, innerdim=200, attdim=200, wordoffset=1, validinter=1, gradnorm=1.0, validsplit=1, vocnumwordsres=50e3, model="mem", ): tt = ticktock("fblextransrun") traindata, golddata, vocnuments, vocnumwords, datanuments, entdic, worddic = \ loaddata(glovepath, fbentdicp, fbdatapath, wordoffset, numwords, numchars) outdata = shiftdata(golddata) tt.tock("made data").tick() entids, lexdata = load_lex_data(fblexpath, datanuments, worddic) if "mem" in model: print lexdata.shape print datanuments #embed() if "att" in model: print "model with attention AND memory" m = FBSeqCompEncMemDecAtt( wordembdim=wordembdim, wordencdim=wordencdim, entembdim=entembdim, innerdim=innerdim, outdim=datanuments, numchars=128, # ASCII numwords=vocnumwords, memdata=[entids, lexdata], attdim=attdim, memaddr=GeneralDotMemAddr, ) else: m = FBSeqCompositeEncMemDec( wordembdim=wordembdim, wordencdim=wordencdim, entembdim=entembdim, innerdim=innerdim, outdim=datanuments, numchars=128, # ASCII numwords=vocnumwords, memdata=[entids, lexdata], attdim=attdim, memaddr=LinearGateMemAddr, ) elif model == "lex": # for testing purposes print lexdata.shape print datanuments #vocnumwords = 4000 #exit() #embed() m = FBMemMatch( wordembdim=wordembdim, wordencdim=wordencdim, entembdim=entembdim, innerdim=innerdim, outdim=datanuments, numchars=128, numwords=vocnumwords, memdata=[entids, lexdata], attdim=attdim, ) elif model == "nomem": m = FBSeqCompositeEncDec( # compiles, errors go down wordembdim=wordembdim, wordencdim=wordencdim, entembdim=entembdim, innerdim=innerdim, outdim=datanuments, numchars=128, numwords=vocnumwords) else: m = None print "no such model" reventdic = {} for k, v in entdic.items(): reventdic[v] = k #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim) tt.tock("model defined") if model == "lex": # for testing purposes tt.tick("predicting") print lexdata[1:5].shape, entids[1:5].shape #print lexdata[1:5] print entids[1:5] pred = m.predict(lexdata[1:5]) print pred.shape print np.argmax(pred, axis=1) - 1 print np.vectorize(lambda x: reventdic[x] if x in reventdic else None)( np.argmax(pred, axis=1) - 1) tt.tock("predicted sample") tt.tick("training") m.train([lexdata[1:151]], entids[1:151]).adagrad(lr=lr).cross_entropy().grad_total_norm(0.5)\ .split_validate(5, random=True).validinter(validinter).accuracy()\ .train(numbats, epochs) else: #embed() tt.tick("predicting") print traindata[:5].shape, outdata[:5].shape pred = m.predict(traindata[:5], outdata[:5]) print np.argmax(pred, axis=2) - 1 print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1) tt.tock("predicted sample") tt.tick("training") m.train([traindata, outdata], golddata).adagrad(lr=lr).grad_total_norm(gradnorm).seq_cross_entropy()\ .split_validate(splits=5, random=False).validinter(validinter).seq_accuracy().seq_cross_entropy()\ .train(numbats, epochs) #embed() tt.tock("trained").tick("predicting") pred = m.predict(traindata[:50], outdata[:50]) print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1) tt.tock("predicted sample")
def run(epochs=100,
        epochsp=10,
        lr=0.03,
        wreg=0.001,
        numbats=10,
        fbdatapath="../../data/mfqa/mfqa.tsv.sample.small",
        fblexpath="../../data/mfqa/mfqa.labels.idx.map",
        glovepath="../../data/glove/glove.6B.50d.txt",
        fbentdicp="../../data/mfqa/mfqa.dic.map",
        numwords=20,
        numchars=30,
        wordembdim=50,
        wordencdim=100,
        entembdim=100,
        innerdim=400,
        attdim=200,
        wordoffset=1,
        validinter=1,
        gradnorm=1.0,
        validsplit=5,
        vocnumwordsres=50e3,
        model="nomem",
        ):
    tt = ticktock("fblextransrun")
    traindata, golddata, vocnuments, vocnumwords, datanuments, entdic, worddic = \
        loaddata(glovepath, fbentdicp, fbdatapath, wordoffset, numwords, numchars)
    tt.tock("made data").tick()
    entids, lexdata = load_lex_data(fblexpath, datanuments, worddic)

    # manual split      TODO: do split in feeder
    splitpoint = int(traindata.shape[0] * (1. - 1. / validsplit))
    print splitpoint
    validdata = traindata[splitpoint:]
    validgold = golddata[splitpoint:]
    traindata = traindata[:splitpoint]
    golddata = golddata[:splitpoint]

    if "att" in model:
        m = FBSeqCompEncDecAtt(
            wordembdim=wordembdim,
            wordencdim=wordencdim,
            entembdim=entembdim,
            innerdim=innerdim,
            outdim=datanuments,
            numchars=128,
            attdim=attdim,
            numwords=vocnumwords
        )
    else:
        m = FBSeqCompositeEncDec(   # compiles, errors go down
            wordembdim=wordembdim,
            wordencdim=wordencdim,
            entembdim=entembdim,
            innerdim=innerdim,
            outdim=datanuments,
            numchars=128,
            numwords=vocnumwords
        )

    reventdic = {}
    for k, v in entdic.items():
        reventdic[v] = k

    #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim)
    tt.tock("model defined")

    if "lex" in model:
        tt.tick("predicting lexicon")
        print lexdata[1:5].shape, entids[1:5].shape, golddata[:5].shape
        #print lexdata[1:5]
        #print entids[:5]; exit()
        pred = m.predict(lexdata[1:5], np.zeros((entids[1:5].shape[0], 1), dtype="int32"))
        print pred.shape
        print np.argmax(pred, axis=2) - 1
        print np.vectorize(lambda x: reventdic[x] if x in reventdic else None)(np.argmax(pred, axis=2) - 1)
        tt.tock("predicted sample")
        tt.tick("training")
        lextrain = lexdata
        print lextrain.shape
        lexgold = entids.reshape((entids.shape[0], 1))
        print lexgold.shape
        lexgoldshifted = shiftdata(lexgold)
        m.train([lextrain, lexgoldshifted], lexgold).adagrad(lr=lr).seq_cross_entropy().grad_total_norm(gradnorm)\
            .autovalidate(validsplit, random=True).validinter(validinter).seq_accuracy().seq_cross_entropy()\
            .train(numbats, epochsp)
        tt.tick("predicting")
        print lexdata[1:5].shape, entids[1:5].shape, golddata[:5].shape
        #print lexdata[1:5]
        #print entids[:5]; exit()
        pred = m.predict(lexdata[1:5], np.zeros((entids[1:5].shape[0], 1), dtype="int32"))
        print pred.shape
        print np.argmax(pred, axis=2) - 1
        print np.vectorize(lambda x: reventdic[x] if x in reventdic else None)(np.argmax(pred, axis=2) - 1)
        tt.tock("predicted sample")
        m.fixO(lr=0.01)

    # embed()
    outdata = shiftdata(golddata)
    tt.tick("predicting")
    print traindata[:5].shape, outdata[:5].shape
    #print golddata[:5]; exit()
    pred = m.predict(traindata[:5], outdata[:5])
    print np.argmax(pred, axis=2) - 1
    print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1)
    tt.tock("predicted sample")
    tt.tick("training")
    m.train([traindata, outdata], golddata).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm).seq_cross_entropy()\
        .validate_on([validdata, shiftdata(validgold)], validgold).validinter(validinter).seq_accuracy().seq_cross_entropy()\
        .train(numbats, epochs)
    # embed()
    tt.tock("trained").tick("predicting")
    pred = m.predict(traindata[:50], outdata[:50])
    print np.argmax(pred, axis=2) - 1
    #print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1)
    tt.tock("predicted sample")
def run(epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        embdim=100,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        mem=False,
        membidir=False,
        memlayers=1,
        layers=1,
        testfirst=False,
        rankingloss=False,
        rlmargin=1.,
        charlevel=False,
        pool=False,
        resultsave=False,
        resultsavep="subjdetns.res.pkl",
        ):
    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        worddic, entdic, entmat \
        = readdata(datap, charlevel)
    print entmat.shape
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape
    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [innerdim / 2] * layers
    else:
        encinnerdim = [innerdim] * layers

    # question representation:
    #   encodes question sequence to vector
    #   let's try to embed chars too <--
    embdim = None if charlevel else embdim
    qenc = SimpleSeq2Vec(indim=numwords,
                         inpembdim=embdim,
                         innerdim=encinnerdim,
                         maskid=-1,
                         bidir=bidir,
                         pool=pool)

    # entity representation:
    if mem:
        # encodes label to vector
        if membidir:
            innerdim = [innerdim / 2] * memlayers
        else:
            innerdim = [innerdim] * memlayers
        memembdim = embdim
        #embed chars too <--
        meminpemb = None if charlevel else qenc.inpemb      # share embeddings
        #memembdim = None if charlevel else memembdim
        meminpemb = qenc.inpemb     # also chars are embedded and embeddings are always shared
        lenc = SimpleSeq2Vec(indim=numwords,
                             inpembdim=memembdim,
                             inpemb=meminpemb,
                             innerdim=innerdim,
                             maskid=-1,
                             bidir=membidir)
    else:
        # embeds entity id to vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)     # batched dot

    # trainer config preparation
    class PreProcf(object):
        def __init__(self, entmat):
            self.em = Val(entmat)   # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):    # gold: idx^(batsize, )
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):    # gold: idx^(batsize,)
            return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32")

    if testfirst:
        eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                            metrics=[ClassAccuracy(), RecallAt(5)])
        evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat))
        for e in evalres:
            print e
        tt.msg("tested dummy")
        sys.exit()
    #embed()
    # trainer config and training
    obj = lambda p, n: n - p
    if rankingloss:
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.infty)
    nscorer = scorer.nstrain([traindata, traingold]).transform(PreProcf(entmat))\
        .negsamplegen(NegIdxGen(numents)).negrate(negrate).objective(obj)\
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)

    # evaluation
    eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic,
                        metrics=[ClassAccuracy(), RecallAt(1), RecallAt(2),
                                 RecallAt(5), RecallAt(10)])
    evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat),
                        savep=None if not resultsave else resultsavep)
    for evalre in evalres:
        print evalre
def run( epochs=100, lr=0.03, wreg=0.0001, numbats=10, fbdatapath="../../data/mfqa/mfqa.tsv.sample.small", fblexpath="../../data/mfqa/mfqa.labels.idx.map", fbentdicp="../../data/mfqa/mfqa.dic.map", numwords=20, wordembdim=50, entembdim=101, innerdim=100, attdim=100, wordoffset=1, validinter=1, gradnorm=1.0, validsplit=5, model="lex", ): tt = ticktock("fblextransrun") worddic = makeworddict(fblexpath, fbdatapath) traindata, golddata, vocnuments, vocnumwords, datanuments, entdic = \ loaddata(worddic, fbentdicp, fbdatapath, wordoffset, numwords) tt.tock("made data").tick() entids, lexdata = load_lex_data(fblexpath, datanuments, worddic) # manual split # TODO: do split in feeder splitpoint = int(traindata.shape[0]*(1. - 1./validsplit)) print splitpoint validdata = traindata[splitpoint:] validgold = golddata[splitpoint:] traindata = traindata[:splitpoint] golddata = golddata[:splitpoint] print traindata.shape, golddata.shape print validdata.shape, validgold.shape if "lex" in model: # append lexdata traindata = np.concatenate([traindata, lexdata], axis=0) print traindata.shape entids = entids.reshape((entids.shape[0], 1)) golddata = np.concatenate([golddata, np.concatenate([entids, np.zeros_like(entids, dtype="int32")], axis=1)], axis=0) print golddata.shape #exit() m = FBSeqSimpEncDecAtt( wordembdim=wordembdim, entembdim=entembdim, innerdim=innerdim, attdim=attdim, outdim=datanuments, numwords=vocnumwords, ) tt.tock("model defined") reventdic = {} for k, v in entdic.items(): reventdic[v] = k # embed() outdata = shiftdata(golddata) tt.tick("predicting") print traindata[:5].shape, outdata[:5].shape #print golddata[:5] ; exit() pred = m.predict(traindata[:5], outdata[:5]) print np.argmax(pred, axis=2) - 1 print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1) tt.tock("predicted sample") tt.tick("training") m.train([traindata, outdata], golddata).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm).seq_cross_entropy() \ .validate_on([validdata, shiftdata(validgold)], validgold).validinter(validinter).seq_accuracy().seq_cross_entropy() \ .train(numbats, epochs) # embed() tt.tock("trained").tick("predicting") pred = m.predict(validdata, shiftdata(validgold)) print np.argmax(pred, axis=2) - 1 #print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1) tt.tock("predicted sample")
def __init__(self, model, canenc, scorer, agg, beamsize=1, *buildargs, **kw):
    super(SeqEncDecRankSearch, self).__init__(model, beamsize, *buildargs, **kw)
    self.scorer = scorer
    self.canenc = canenc
    self.agg = agg
    self.tt = ticktock("RankSearch")
def run( epochs=50, mode="char", # "char" or "word" or "charword" numbats=100, lr=0.1, wreg=0.000001, bidir=False, layers=1, encdim=200, decdim=400, embdim=100, negrate=1, margin=1., hingeloss=False, debug=False, preeval=False, sumhingeloss=False, checkdata=False, # starts interactive shell for data inspection printpreds=False, subjpred=False, predpred=False, specemb=-1, balancednegidx=False, usetypes=False, ): if debug: # debug settings sumhingeloss = True numbats = 10 lr = 0.02 epochs = 10 printpreds = True whatpred = "all" if whatpred == "pred": predpred = True elif whatpred == "subj": subjpred = True #preeval = True specemb = 100 margin = 1. balancednegidx = True #usetypes=True # load the right file tt = ticktock("script") specids = specemb > 0 tt.tick() (traindata, traingold), (validdata, validgold), (testdata, testgold), \ worddic, entdic, entmat, relstarts, canids\ = readdata(mode, testcans="testcans.pkl", debug=debug, specids=specids, usetypes=usetypes) entmat = entmat.astype("int32") #embed() if subjpred is True and predpred is False: traingold = traingold[:, [0]] validgold = validgold[:, [0]] testgold = testgold[:, [0]] if predpred is True and subjpred is False: traingold = traingold[:, [1]] validgold = validgold[:, [1]] testgold = testgold[:, [1]] if checkdata: rwd = {v: k for k, v in worddic.items()} red = {v: k for k, v in entdic.items()} def p(xids): return (" " if mode == "word" else "").join([rwd[xid] if xid > -1 else "" for xid in xids]) embed() reventdic = {v: k for k, v in entdic.items()} revworddic = {v: k for k, v in worddic.items()} print traindata.shape, traingold.shape, testdata.shape, testgold.shape tt.tock("data loaded") # *data: matrix of word ids (-1 filler), example per row # *gold: vector of true entity ids # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold # *dic: from word/ent-fbid to integer id, as used in data numwords = max(worddic.values()) + 1 numents = max(entdic.values()) + 1 print "%d words, %d entities" % (numwords, numents) if bidir: encinnerdim = [encdim / 2] * layers else: encinnerdim = [encdim] * layers memembdim = embdim memlayers = layers membidir = bidir if membidir: decinnerdim = [decdim/2]*memlayers else: decinnerdim = [decdim]*memlayers entenc = SimpleSeq2Vec(indim=numwords, inpembdim=memembdim, innerdim=decinnerdim, maskid=-1, bidir=membidir) if specids: # include vectorembedder numentembs = len(np.unique(entmat[:, 0])) entenc = EntEmbEnc(entenc, numentembs, specemb) # adjust params for enc/dec construction #encinnerdim[-1] += specemb #innerdim[-1] += specemb encdec = SimpleSeqEncDecAtt(inpvocsize=numwords, inpembdim=embdim, encdim=encinnerdim, bidir=bidir, outembdim=entenc, decdim=decinnerdim, vecout=True, statetrans="matdot") scorerargs = ([encdec, SeqUnroll(entenc)], {"argproc": lambda x, y, z: ((x, y), (z,)), "scorer": GenDotDistance(decinnerdim[-1], entenc.outdim)}) if sumhingeloss: scorerargs[1]["aggregator"] = lambda x: x # no aggregation of scores scorer = SeqMatchScore(*scorerargs[0], **scorerargs[1]) #scorer.save("scorer.test.save") # TODO: below this line, check and test class PreProc(object): def __init__(self, entmat): self.f = PreProcE(entmat) def __call__(self, encdata, decsg, decgold): # gold: idx^(batsize, seqlen) return (encdata, self.f(decsg), self.f(decgold)), {} class PreProcE(object): def __init__(self, entmat): self.em = Val(entmat) def __call__(self, x): return self.em[x] transf = PreProc(entmat) class NegIdxGen(object): def __init__(self, rng, midsplit=None): self.min = 0 
self.max = rng self.midsplit = midsplit def __call__(self, datas, sgold, gold): # the whole target sequence is corrupted, corruption targets the whole set of entities and relations together if self.midsplit is None or not balancednegidx: return datas, sgold, np.random.randint(self.min, self.max, gold.shape).astype("int32") else: entrand = np.random.randint(self.min, self.midsplit, gold.shape) relrand = np.random.randint(self.midsplit, self.max, gold.shape) mask = np.random.randint(0, 2, gold.shape) ret = entrand * mask + relrand * (1 - mask) return datas, sgold, ret.astype("int32") # !!! MASKS ON OUTPUT SHOULD BE IMPLEMENTED FOR VARIABLE LENGTH OUTPUT SEQS obj = lambda p, n: n - p if hingeloss: obj = lambda p, n: (n - p + margin).clip(0, np.infty) if sumhingeloss: # obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1) traingoldshifted = shiftdata(traingold) validgoldshifted = shiftdata(validgold) #embed() # eval if preeval: tt.tick("pre-evaluating") s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg) eval = FullRankEval() pred, scores = s.decode(testdata, 0, testgold.shape[1], candata=entmat, canids=canids, transform=transf.f, debug=printpreds) evalres = eval.eval(pred, testgold, debug=debug) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("pre-evaluated") negidxgenargs = ([numents], {"midsplit": relstarts}) if debug: pass #negidxgenargs = ([numents], {}) tt.tick("training") nscorer = scorer.nstrain([traindata, traingoldshifted, traingold]).transform(transf) \ .negsamplegen(NegIdxGen(*negidxgenargs[0], **negidxgenargs[1])).negrate(negrate).objective(obj) \ .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \ .validate_on([validdata, validgoldshifted, validgold]) \ .train(numbats=numbats, epochs=epochs) tt.tock("trained") #scorer.save("scorer.test.save") # eval tt.tick("evaluating") s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg) eval = FullRankEval() pred, scores = s.decode(testdata, 0, testgold.shape[1], candata=entmat, canids=canids, transform=transf.f, debug=printpreds) if printpreds: print pred debugarg = "subj" if subjpred else "pred" if predpred else False evalres = eval.eval(pred, testgold, debug=debugarg) for k, evalre in evalres.items(): print("{}:\t{}".format(k, evalre)) tt.tock("evaluated") # save basename = os.path.splitext(os.path.basename(__file__))[0] dirname = basename + ".results" if not os.path.exists(dirname): os.makedirs(dirname) savenamegen = lambda i: "{}/{}.res".format(dirname, i) savename = None for i in xrange(100): savename = savenamegen(i) if not os.path.exists(savename): break savename = None if savename is None: raise Exception("exceeded number of saved results") with open(savename, "w") as f: f.write("{}\n".format(" ".join(sys.argv))) for k, evalre in evalres.items(): f.write("{}:\t{}\n".format(k, evalre))
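# Sketch of the balanced negative sampling used by NegIdxGen above (toy numbers, plain
# numpy): half the corrupted positions draw from the entity id range [0, midsplit),
# the other half from the relation id range [midsplit, max), chosen by a random 0/1 mask:
#
#   midsplit, mx = 5, 10
#   gold = np.zeros((4, 2), dtype="int32")
#   entrand = np.random.randint(0, midsplit, gold.shape)
#   relrand = np.random.randint(midsplit, mx, gold.shape)
#   mask = np.random.randint(0, 2, gold.shape)
#   negids = entrand * mask + relrand * (1 - mask)      # mix of entity and relation ids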
def run( epochs=100, lr=0.03, wreg=0.0001, numbats=10, fbdatapath="../../data/mfqa/mfqa.tsv.sample", fblexpath="../../data/mfqa/mfqa.labels.idx.map", glovepath="../../data/glove/glove.6B.50d.txt", fbentdicp="../../data/mfqa/mfqa.dic.map", numwords=20, numchars=30, wordembdim=50, wordencdim=50, entembdim=101, innerdim=100, attdim=100, wordoffset=1, validinter=1, gradnorm=1.0, validsplit=5, vocnumwordsres=50e3, model="nomem", ): tt = ticktock("fblextransrun") traindata, golddata, vocnuments, vocnumwords, datanuments, entdic, worddic = \ loaddata(glovepath, fbentdicp, fbdatapath, wordoffset, numwords, numchars) tt.tock("made data").tick() entids, lexdata = load_lex_data(fblexpath, datanuments, worddic) # manual split # TODO: do split in feeder splitpoint = int(traindata.shape[0] * (1. - 1. / validsplit)) print splitpoint validdata = traindata[splitpoint:] validgold = golddata[splitpoint:] traindata = traindata[:splitpoint] golddata = golddata[:splitpoint] print traindata.shape, golddata.shape print validdata.shape, validgold.shape if "lex" in model: # append lexdata traindata = np.concatenate([traindata, lexdata], axis=0) print traindata.shape entids = entids.reshape((entids.shape[0], 1)) golddata = np.concatenate([ golddata, np.concatenate( [entids, np.zeros_like(entids, dtype="int32")], axis=1) ], axis=0) print golddata.shape #exit() if "att" in model: m = FBSeqCompEncDecAtt( wordembdim=wordembdim, wordencdim=wordencdim, entembdim=entembdim, innerdim=innerdim, outdim=datanuments, numchars=128, numwords=vocnumwords, attdim=attdim, ) else: m = FBSeqCompositeEncDec( wordembdim=wordembdim, wordencdim=wordencdim, entembdim=entembdim, innerdim=innerdim, outdim=datanuments, numchars=128, numwords=vocnumwords, ) reventdic = {} for k, v in entdic.items(): reventdic[v] = k prelex = "lex" in model #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim) tt.tock("model defined") # embed() outdata = shiftdata(golddata) tt.tick("predicting") print traindata[:5].shape, outdata[:5].shape #print golddata[:5] ; exit() pred = m.predict(traindata[:5], outdata[:5]) print np.argmax(pred, axis=2) - 1 print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1) tt.tock("predicted sample") tt.tick("training") m.train([traindata, outdata], golddata).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm).seq_cross_entropy() \ .validate_on([validdata, shiftdata(validgold)], validgold).validinter(validinter).seq_accuracy().seq_cross_entropy() \ .train(numbats, epochs) # embed() tt.tock("trained").tick("predicting") pred = m.predict(validdata, shiftdata(validgold)) print np.argmax(pred, axis=2) - 1 #print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1) tt.tock("predicted sample")
def run( epochs=100, lr=0.01, wreg=0.0001, numbats=10, fbdatapath="../../data/mfqa/mfqa.tsv.sample.small", fblexpath="../../data/mfqa/mfqa.labels.idx.map", glovepath="../../data/glove/glove.6B.50d.txt", fbentdicp="../../data/mfqa/mfqa.dic.map", numwords=20, numchars=30, wordembdim=50, wordencdim=100, entembdim=100, innerdim=200, attdim=200, wordoffset=1, validinter=1, gradnorm=1.0, validsplit=1, vocnumwordsres=50e3, model="mem", ): tt = ticktock("fblextransrun") traindata, golddata, vocnuments, vocnumwords, datanuments, entdic, worddic = \ loaddata(glovepath, fbentdicp, fbdatapath, wordoffset, numwords, numchars) outdata = shiftdata(golddata) tt.tock("made data").tick() entids, lexdata = load_lex_data(fblexpath, datanuments, worddic) if "mem" in model: print lexdata.shape print datanuments #embed() if "att" in model: print "model with attention AND memory" m = FBSeqCompEncMemDecAtt( wordembdim=wordembdim, wordencdim=wordencdim, entembdim=entembdim, innerdim=innerdim, outdim=datanuments, numchars=128, # ASCII numwords=vocnumwords, memdata=[entids, lexdata], attdim=attdim, memaddr=GeneralDotMemAddr, ) else: m = FBSeqCompositeEncMemDec( wordembdim=wordembdim, wordencdim=wordencdim, entembdim=entembdim, innerdim=innerdim, outdim=datanuments, numchars=128, # ASCII numwords=vocnumwords, memdata=[entids, lexdata], attdim=attdim, memaddr=LinearGateMemAddr, ) elif model=="lex": # for testing purposes print lexdata.shape print datanuments #vocnumwords = 4000 #exit() #embed() m = FBMemMatch( wordembdim=wordembdim, wordencdim=wordencdim, entembdim=entembdim, innerdim=innerdim, outdim=datanuments, numchars=128, numwords=vocnumwords, memdata=[entids, lexdata], attdim=attdim, ) elif model=="nomem": m = FBSeqCompositeEncDec( # compiles, errors go down wordembdim=wordembdim, wordencdim=wordencdim, entembdim=entembdim, innerdim=innerdim, outdim=datanuments, numchars=128, numwords=vocnumwords ) else: m = None print "no such model" reventdic = {} for k, v in entdic.items(): reventdic[v] = k #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim) tt.tock("model defined") if model == "lex": # for testing purposes tt.tick("predicting") print lexdata[1:5].shape, entids[1:5].shape #print lexdata[1:5] print entids[1:5] pred = m.predict(lexdata[1:5]) print pred.shape print np.argmax(pred, axis=1)-1 print np.vectorize(lambda x: reventdic[x] if x in reventdic else None)(np.argmax(pred, axis=1)-1) tt.tock("predicted sample") tt.tick("training") m.train([lexdata[1:151]], entids[1:151]).adagrad(lr=lr).cross_entropy().grad_total_norm(0.5)\ .split_validate(5, random=True).validinter(validinter).accuracy()\ .train(numbats, epochs) else: #embed() tt.tick("predicting") print traindata[:5].shape, outdata[:5].shape pred = m.predict(traindata[:5], outdata[:5]) print np.argmax(pred, axis=2)-1 print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2)-1) tt.tock("predicted sample") tt.tick("training") m.train([traindata, outdata], golddata).adagrad(lr=lr).grad_total_norm(gradnorm).seq_cross_entropy()\ .split_validate(splits=5, random=False).validinter(validinter).seq_accuracy().seq_cross_entropy()\ .train(numbats, epochs) #embed() tt.tock("trained").tick("predicting") pred = m.predict(traindata[:50], outdata[:50]) print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2)-1) tt.tock("predicted sample")