def run(p="../../../data/atis/atis.pkl", wordembdim=100, innerdim=200, lr=0.05, numbats=100, epochs=20, validinter=1, wreg=0.0003, depth=1): p = os.path.join(os.path.dirname(__file__), p) train, test, dics = pickle.load(open(p)) word2idx = dics["words2idx"] table2idx = dics["tables2idx"] label2idx = dics["labels2idx"] label2idxrev = {v: k for k, v in label2idx.items()} train = zip(*train) test = zip(*test) print "%d training examples, %d test examples" % (len(train), len(test)) #tup2text(train[0], word2idx, table2idx, label2idx) maxlen = 0 for tup in train + test: maxlen = max(len(tup[0]), maxlen) numwords = max(word2idx.values()) + 2 numlabels = max(label2idx.values()) + 2 # get training data traindata = getdatamatrix(train, maxlen, 0).astype("int32") traingold = getdatamatrix(train, maxlen, 2).astype("int32") trainmask = (traindata > 0).astype("float32") # test data testdata = getdatamatrix(test, maxlen, 0).astype("int32") testgold = getdatamatrix(test, maxlen, 2).astype("int32") testmask = (testdata > 0).astype("float32") res = atiseval(testgold-1, testgold-1, label2idxrev); print res#; exit() # define model innerdim = [innerdim] * depth m = SimpleSeqTransducer(indim=numwords, embdim=wordembdim, innerdim=innerdim, outdim=numlabels) '''m = StupidAtis(inpembdim = wordembdim, indim = numwords, outdim = numlabels) m = StupidAtisNative(inpembdim=wordembdim, indim=numwords, outdim=numlabels)''' #m = StupidAtisScanMod(inpembdim=wordembdim, indim=numwords, outdim=numlabels) #m = StupidAtisScanModNative(inpembdim=wordembdim, indim=numwords, outdim=numlabels) # training '''m.train([traindata, trainmask], traingold).adagrad(lr=lr).grad_total_norm(5.0).seq_cross_entropy().l2(wreg)\ .split_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(validinter)\ .train(numbats, epochs)''' m.train([traindata], traingold).adagrad(lr=lr).grad_total_norm(5.0).seq_cross_entropy().l2(wreg)\ .split_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(validinter)\ .train(numbats, epochs) # predict after training testpredprobs = m.predict(testdata, testmask) testpred = np.argmax(testpredprobs, axis=2)-1 #testpred = testpred * testmask #print np.vectorize(lambda x: label2idxrev[x] if x > -1 else " ")(testpred) evalres = atiseval(testpred, testgold-1, label2idxrev); print evalres
def test_output_shape(self):
    # settings
    batsize = 10
    seqlen = 5
    invocsize = 50
    inembdim = 50
    innerdim = 11
    outvocsize = 17
    # data
    traindata = np.random.randint(0, invocsize, (batsize, seqlen))
    traingold = np.random.randint(0, outvocsize, (batsize, seqlen))
    # model
    m = SimpleSeqTransducer(indim=invocsize, embdim=inembdim,
                            innerdim=innerdim, outdim=outvocsize)
    pred = m.predict(traindata)
    self.assertEqual(pred.shape, (batsize, seqlen, outvocsize))
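# The check above uses a single scalar innerdim; run() builds its encoder with
# innerdim as a list ([innerdim] * depth), so SimpleSeqTransducer apparently also
# accepts a list of layer sizes. A sketch of the same shape check for a stacked
# encoder under that assumption (same TestCase and imports as the method above):
def test_output_shape_stacked(self):
    batsize, seqlen = 10, 5
    invocsize, inembdim, outvocsize = 50, 50, 17
    innerdim = [11, 13]     # two stacked recurrent layers
    data = np.random.randint(0, invocsize, (batsize, seqlen))
    m = SimpleSeqTransducer(indim=invocsize, embdim=inembdim,
                            innerdim=innerdim, outdim=outvocsize)
    pred = m.predict(data)
    # stacking changes the hidden sizes, not the output shape
    self.assertEqual(pred.shape, (batsize, seqlen, outvocsize))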
def run(p="../../../data/atis/atis.pkl", wordembdim=100, innerdim=200, lr=0.05, numbats=100, epochs=20, validinter=1, wreg=0.0003, depth=1): p = os.path.join(os.path.dirname(__file__), p) train, test, dics = pickle.load(open(p)) word2idx = dics["words2idx"] table2idx = dics["tables2idx"] label2idx = dics["labels2idx"] label2idxrev = {v: k for k, v in label2idx.items()} train = zip(*train) test = zip(*test) print "%d training examples, %d test examples" % (len(train), len(test)) #tup2text(train[0], word2idx, table2idx, label2idx) maxlen = 0 for tup in train + test: maxlen = max(len(tup[0]), maxlen) numwords = max(word2idx.values()) + 2 numlabels = max(label2idx.values()) + 2 # get training data traindata = getdatamatrix(train, maxlen, 0).astype("int32") traingold = getdatamatrix(train, maxlen, 2).astype("int32") trainmask = (traindata > 0).astype("float32") # test data testdata = getdatamatrix(test, maxlen, 0).astype("int32") testgold = getdatamatrix(test, maxlen, 2).astype("int32") testmask = (testdata > 0).astype("float32") res = atiseval(testgold - 1, testgold - 1, label2idxrev) print res #; exit() # define model innerdim = [innerdim] * depth m = SimpleSeqTransducer(indim=numwords, embdim=wordembdim, innerdim=innerdim, outdim=numlabels) '''m = StupidAtis(inpembdim = wordembdim, indim = numwords, outdim = numlabels) m = StupidAtisNative(inpembdim=wordembdim, indim=numwords, outdim=numlabels)''' #m = StupidAtisScanMod(inpembdim=wordembdim, indim=numwords, outdim=numlabels) #m = StupidAtisScanModNative(inpembdim=wordembdim, indim=numwords, outdim=numlabels) # training '''m.train([traindata, trainmask], traingold).adagrad(lr=lr).grad_total_norm(5.0).seq_cross_entropy().l2(wreg)\ .split_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(validinter)\ .train(numbats, epochs)''' m.train([traindata], traingold).adagrad(lr=lr).grad_total_norm(5.0).seq_cross_entropy().l2(wreg)\ .split_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(validinter)\ .train(numbats, epochs) # predict after training testpredprobs = m.predict(testdata, testmask) testpred = np.argmax(testpredprobs, axis=2) - 1 #testpred = testpred * testmask #print np.vectorize(lambda x: label2idxrev[x] if x > -1 else " ")(testpred) evalres = atiseval(testpred, testgold - 1, label2idxrev) print evalres
def run(epochs=50,
        numbats=25,
        lr=0.1,
        layers=1,
        embdim=100,
        encdim=200,
        bidir=False,
        wordlevel=False,    # False: "char" mode, True: "word" mode
        maxlen=75,
        maxwordlen=15,
        ):
    mode = "word" if wordlevel else "char"
    (traindata, traingold), (testdata, testgold), dic = \
        readdata("../../../data/hatespeech/train.csv",
                 "../../../data/hatespeech/test.csv",
                 mode=mode, maxlen=maxlen)
    revdic = {v: k for k, v in dic.items()}

    def pp(s):      # pretty-print a sequence of indices back to symbols
        print "".join([revdic[x] if x in revdic else "<???>" for x in s])

    embed()     # drop into an IPython shell to inspect the loaded data

    # data stats
    print "class distribution in train: {}% positive".format(100. * np.sum(traingold) / np.sum(np.ones_like(traingold)))
    print "class distribution in test: {}% positive".format(100. * np.sum(testgold) / np.sum(np.ones_like(testgold)))

    # class-conditional sequence model: one embedding for tokens, one for the class label
    wordemb = VectorEmbed(indim=len(dic), dim=embdim)
    clasemb = VectorEmbed(indim=2, dim=embdim)
    encdim = [encdim] * layers
    enc = SimpleSeqTransducer(inpemb=Eye(embdim * 2), innerdim=encdim, outdim=len(dic))
    m = GenClass(wordemb, clasemb, enc)

    # shift traindata right by one position (column of start symbols) to get the model inputs
    straindata = np.ones((traindata.shape[0], 1), dtype="int32")
    straindata = np.concatenate([straindata, traindata[:, :-1]], axis=1)

    m = m.train([straindata, traingold], traindata)\
        .adadelta(lr=lr).grad_total_norm(1.0).seq_cross_entropy()\
        .split_validate(6, random=True).seq_cross_entropy().seq_accuracy()\
        .train(numbats=numbats, epochs=epochs)

    #enc.save("hatemodel.{}.Emb{}D.Enc{}D.{}L.model".format(mode, embdim, encdim, layers))

    # prediction: shift the test data the same way, then score each sequence under both class conditions
    stestdata = np.ones((testdata.shape[0], 1), dtype="int32")
    stestdata = np.concatenate([stestdata, testdata[:, :-1]], axis=1)
    negpreds = m.predict(stestdata, np.zeros_like(testgold))    # (batsize, seqlen, vocsize)
    pospreds = m.predict(stestdata, np.ones_like(testgold))
    # gather the probability of the observed token at every position
    negprobs = negpreds[np.arange(negpreds.shape[0])[:, None],
                        np.arange(negpreds.shape[1])[None, :],
                        testdata]
    posprobs = pospreds[np.arange(pospreds.shape[0])[:, None],
                        np.arange(pospreds.shape[1])[None, :],
                        testdata]
    # per-sequence negative log-likelihood under each class condition
    negprobs = np.sum(-np.log(negprobs), axis=1)
    posprobs = np.sum(-np.log(posprobs), axis=1)
    pred = negprobs < posprobs
    embed()     # inspect the predictions interactively
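# The decision rule above scores every test sequence under both class conditions and
# picks the class whose model assigns the lower summed negative log-likelihood. A
# small self-contained numpy illustration of the advanced-indexing gather and the
# comparison, with toy random distributions standing in for m.predict() output:
import numpy as np

def _nll_classification_demo():
    batsize, seqlen, vocsize = 4, 6, 10
    rng = np.random.RandomState(0)
    # fake per-token distributions, shaped like m.predict(...): (batsize, seqlen, vocsize)
    negpreds = rng.dirichlet(np.ones(vocsize), size=(batsize, seqlen))
    pospreds = rng.dirichlet(np.ones(vocsize), size=(batsize, seqlen))
    testdata = rng.randint(0, vocsize, (batsize, seqlen))   # observed token ids
    # pick out the probability of the observed token at every (example, position)
    rows = np.arange(batsize)[:, None]   # broadcasts over positions
    cols = np.arange(seqlen)[None, :]    # broadcasts over examples
    negprobs = negpreds[rows, cols, testdata]   # (batsize, seqlen)
    posprobs = pospreds[rows, cols, testdata]
    # per-sequence negative log-likelihood under each class condition
    negnll = np.sum(-np.log(negprobs), axis=1)
    posnll = np.sum(-np.log(posprobs), axis=1)
    # True where the class-0-conditioned model explains the sequence better
    return negnll < posnll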