#!/usr/bin/env python
"""Plot a 2-D projection of word vectors."""
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import Isomap, TSNE

from analogy import Analogy
from vstore import VStore

a = Analogy(VStore("vectors.lmdb", "big-glove"))

# Read words interactively until the user enters an empty line.
buf = ""
linebuf = raw_input("Please enter some words to plot, or empty for a canned list: ")
while linebuf:
    buf += linebuf + " "
    linebuf = raw_input("... ")
labels = buf.split() or ("doctor nurse politician senator lawyer barrister "
                         "defend accuse heal treat cure elect vote").split()

# Keep only words the model knows, filtering labels as well so that the
# label list stays aligned with the vector list when we zip them below.
labels = [x for x in labels if a.w(x) is not None]
vs = [a.w(x) for x in labels]

# Project the high-dimensional vectors down to two dimensions with t-SNE.
flatplot = TSNE(n_components=2)
ps = flatplot.fit_transform(vs)

plt.title("Reduced vector space model")
plt.xlabel("First t-SNE dimension")
plt.ylabel("Second t-SNE dimension")
plt.scatter(ps[:, 0], ps[:, 1])
for (x, y), label in zip(ps, labels):
    print "plotting %f, %f, %s" % (x, y, label)
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
# Without an explicit show(), a non-interactive run exits before drawing.
plt.show()
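# The PCA, KernelPCA, and Isomap imports above suggest the reducer is meant
# to be swappable. A minimal sketch, assuming scikit-learn's default
# constructors: all of these expose the same fit_transform interface, so a
# one-line change picks a different projection, e.g.
#
#     flatplot = PCA(n_components=2)      # linear projection
#     flatplot = Isomap(n_components=2)   # geodesic manifold embedding
#     ps = flatplot.fit_transform(vs)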
import argparse

import numpy

from vstore import VStore

parser = argparse.ArgumentParser(description="Load GloVe vectors into LMDB")
parser.add_argument("--name", action="store", type=str, default="glove",
                    help="name of the database into which to load the vectors")
parser.add_argument("--dbfile", action="store", type=str, default="vectors.lmdb",
                    help="shared database filename")
parser.add_argument("--merge", action="store_true",
                    help="merge the new dataset, rather than replacing")
parser.add_argument("source", type=file, help="uncompressed GloVe dataset")
args = parser.parse_args()

# Invert control in order to use one transaction
table = VStore(args.dbfile, args.name)
# Only wipe the existing table when we are replacing, not merging.
if not args.merge:
    table.drop()

def loader():
    for loaded, line in enumerate(args.source):
        line = line.split()
        name = line.pop(0)
        # Tokenization errors can cause a word to be too long for lmdb
        if len(name) > 100:
            continue
        if loaded % 10000 == 0:
            print "Loaded {} rows".format(loaded)
        yield name, numpy.array(line, dtype=numpy.float32)

table.load(loader())
print "Finished loading"
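# A typical invocation, matching the "big-glove" table name the plot script
# opens (the script and dataset filenames here are illustrative, not fixed
# by the code):
#
#     python load_vectors.py --name big-glove --dbfile vectors.lmdb glove.42B.300d.txt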
unidict.save(SOURCE + '.dict')
print(unidict)
#unidict = corpora.Dictionary.load("word8.dict")

### Preprocessing
unicorpus = digestion.LineCorpus(SOURCE, unidict)

### Creating the index
tfidf = models.TfidfModel(unicorpus)
corpus_tfidf = tfidf[unicorpus]

# Drop into an interactive shell to inspect the corpus before training.
import code
code.interact(local=vars())

print "using LSI"
unilsivstore = VStore(SOURCE + "vectors.lmdb", "lsi")
# initialize an LSI transformation
unilsi = models.LsiModel(corpus_tfidf, chunksize=1000000, id2word=unidict,
                         num_topics=300)
unilsi.save(SOURCE + '.unilsimodel')
unilsi.print_topics(20)
unilsivstore.drop()
# Store a dense 300-dimensional vector for every word: project a one-hot
# bag-of-words for each dictionary id through the trained model.
unilsivstore.load(
    (unidict[idnum].encode(), matutils.sparse2full(unilsi[[(idnum, 1)]], 300))
    for idnum in unidict)

print "using LDA"
unildavstore = VStore(SOURCE + "vectors.lmdb", "mini-plain-lda")
unilda = models.LdaMulticore(unicorpus, id2word=unidict, num_topics=300,
                             chunksize=25000, passes=10, iterations=50,
                             workers=40, batch=True)
unilda.save(SOURCE + '.unildamodel')
unilda.print_topics(20)
unildavstore.drop()
unildavstore.load(
    (unidict[idnum].encode(), matutils.sparse2full(unilda[[(idnum, 1)]], 300))
    for idnum in unidict)

'''
print "using W2V"
uniw2vvstore = VStore(SOURCE + "vectors.lmdb", "w2v-word8-lines")
uniw2v = models.word2vec.Word2Vec(
    [line.split() for line in open('word8-lines.short')],
    size=300, window=5, min_count=5, workers=8)
uniw2v.save('word8-lines.short.uniw2vmodel')
uniw2vvstore.drop()
uniw2vvstore.load(
    (unidict[idnum].encode(), matutils.sparse2full(uniw2v[[(idnum, 1)]], 300))
    for idnum in unidict)

#uniindex = similarities.MatrixSimilarity(unilsi[unicorpus], num_features=300)
#uniindex.save('word8-lines.short.matsim')

## Get a query
import sys
sys.exit()
'''

query = raw_input("Search: ")
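# Each stored value is a dense vector built with gensim's
# matutils.sparse2full, which expands a sparse (id, weight) list into a
# fixed-length numpy array. A tiny illustration (the weights are arbitrary):
#
#     >>> from gensim import matutils
#     >>> matutils.sparse2full([(0, 0.5), (3, 1.0)], 5)
#     array([ 0.5,  0. ,  0. ,  1. ,  0. ], dtype=float32)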
    def sim(self, left, right):
        # Cosine similarity; None if either word is missing from the model.
        if left is not None and right is not None:
            return (np.sum(left * right) /
                    (np.sqrt(np.sum(left**2)) * np.sqrt(np.sum(right**2))))
        return None

if __name__ == '__main__':
    np.set_printoptions(threshold=20)
    parser = ArgumentParser(description="Perform simple algebra on words")
    parser.add_argument("--dbfile", default="vectors.lmdb",
                        help="use this database file to get vectors")
    parser.add_argument("model", action="store",
                        help="compare using this model database (e.g. glove)")
    args = parser.parse_args()
    model = VStore(args.dbfile, args.model)
    analogy = Analogy(model)
    w = analogy.w
    pcaw = analogy.pcaw
    context = analogy.context
    sim = analogy.sim
    print "What follows is a python prompt."
    print "w('elucidate') --> vector for `elucidate`"
    print "w('mogrify') + w('frobnicate') --> vector sum"
    print "  same for -, *, /, **, etc as usual for numpy"
    print "sim(w('republican'), w('democrat')) -> society in a 32-bit float"
    print "  (actually a simple cosine similarity)"
    code.interact(local=vars())
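# A minimal sketch of the kind of session the prompt invites (the word
# choices and the similarity values are illustrative, not actual output):
#
#     >>> v = w('king') - w('man') + w('woman')
#     >>> sim(v, w('queen'))
#     0.78
#     >>> sim(v, w('cabbage'))
#     0.12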