# Command-line loader: streams a GloVe text dataset into a named VStore table.
import argparse

import numpy

from vstore import VStore

parser = argparse.ArgumentParser(description="Load GloVe vectors into LMDB")
parser.add_argument(
    "--name",
    action="store",
    type=str,
    default="glove",
    help="name of the database into which to load the vectors",
)
parser.add_argument(
    "--dbfile",
    action="store",
    type=str,
    default="vectors.lmdb",
    help="shared database filename",
)
parser.add_argument(
    "--merge",
    action="store_true",
    help="merge the new dataset, rather than replacing",
)
# argparse.FileType opens the path for reading on both Python 2 and 3; the
# bare `file` builtin exists only on Python 2.
parser.add_argument(
    "source",
    type=argparse.FileType("r"),
    help="uncompressed GloVe dataset",
)
args = parser.parse_args()

# Invert control in order to use one transaction
table = VStore(args.dbfile, args.name)
if not args.merge:
    # Without --merge the new dataset replaces whatever the table held.
    table.drop()


def loader():
    """Yield ``(name, vector)`` pairs parsed from ``args.source``.

    Each line is split on whitespace: the first field is the row name and
    the remaining fields become a ``numpy.float32`` array.  Blank lines and
    rows whose name is longer than 100 characters are skipped.  A progress
    message is printed whenever the line index is a multiple of 10000.
    """
    for loaded, line in enumerate(args.source):
        fields = line.split()
        if not fields:
            # A blank line has no name to pop; skip it instead of crashing.
            continue
        name = fields.pop(0)
        # Tokenization errors can cause a word to be too long for lmdb
        if len(name) > 100:
            continue
        if loaded % 10000 == 0:
            print("Loaded {} rows".format(loaded))
        yield name, numpy.array(fields, dtype=numpy.float32)


table.load(loader())
print("Finished loading. Creating similarity index.")
# Train an LDA model over the corpus, save it, print 20 topics, then replace
# the contents of `unildavstore` with one 300-wide vector per dictionary id.
# NOTE(review): `models` / `matutils` look like gensim modules imported
# outside this fragment -- confirm.
unilda = models.LdaMulticore(
    unicorpus,
    id2word=unidict,
    num_topics=300,
    chunksize=25000,
    passes=10,
    iterations=50,
    workers=40,
    batch=True,
)  # Lda
unilda.save(SOURCE+'.unilda_model')
unilda.print_topics(20)
unildavstore.drop()
# Each id is fed to the model as the one-term document [(idnum, 1)] and the
# sparse result is expanded to a dense vector of length 300.
unildavstore.load(
    (unidict[idnum].encode(), matutils.sparse2full(unilda[[(idnum, 1)]], 300))
    for idnum in unidict)

# Disabled experiment: the block below is a bare string literal, so none of it
# (including the sys.exit() call) is executed.
'''
print "using W2V"
uniw2vvstore = VStore(SOURCE+"vectors.lmdb", "w2v-word8-lines")
uniw2v = models.word2vec.Word2Vec([line.split() for line in open('word8-lines.short')], size=300, window=5, min_count=5, workers=8)
uniw2v.save('word8-lines.short.uniw2vmodel')
uniw2vvstore.drop()
uniw2vvstore.load(
    (rnidict[idnum].encode(), matutils.sparse2full(uniw2v[[(idnum, 1)]], 300))
    for idnum in unidict)

#uniindex = similarities.MatrixSimilarity(unilsi[unicorpus], num_features=300)
#uniindex.save('word8-lines.short.matsim')

## Get a query
import sys
sys.exit()
'''

# Interactive search: an empty line ends the session.
# NOTE(review): `uniindex` is only assigned inside the disabled block above,
# and `unilsi` is not defined in this fragment -- confirm both exist before
# this loop runs.
query = raw_input("Search: ")
while query:
    vec = unidict.doc2bow(query.lower().split())
    sims = uniindex[unilsi[vec]]
    # Show the 20 highest-scoring (index, similarity) pairs.
    print(sorted(enumerate(sims), key=lambda x: -x[1])[:20])
    # Ask again; without this the loop would repeat the same query forever.
    query = raw_input("Search: ")
# Command-line loader: streams a GloVe text dataset into a named VStore table.
# argparse is used below, so it is imported here to keep the script
# self-contained.
import argparse

import numpy

from vstore import VStore

parser = argparse.ArgumentParser(description="Load GloVe vectors into LMDB")
parser.add_argument("--name", action="store", type=str, default="glove",
                    help="name of the database into which to load the vectors")
parser.add_argument("--dbfile", action="store", type=str,
                    default="vectors.lmdb", help="shared database filename")
parser.add_argument("--merge", action="store_true",
                    help="merge the new dataset, rather than replacing")
# argparse.FileType opens the path for reading on both Python 2 and 3; the
# bare `file` builtin exists only on Python 2.
parser.add_argument("source", type=argparse.FileType("r"),
                    help="uncompressed GloVe dataset")
args = parser.parse_args()

# Invert control in order to use one transaction
table = VStore(args.dbfile, args.name)
if not args.merge:
    # Without --merge the new dataset replaces whatever the table held.
    table.drop()


def loader():
    """Yield ``(name, vector)`` pairs parsed from ``args.source``.

    Each line is split on whitespace: the first field is the row name and
    the remaining fields become a ``numpy.float32`` array.  Blank lines and
    rows whose name is longer than 100 characters are skipped.  A progress
    message is printed whenever the line index is a multiple of 10000.
    """
    for loaded, line in enumerate(args.source):
        fields = line.split()
        if not fields:
            # A blank line has no name to pop; skip it instead of crashing.
            continue
        name = fields.pop(0)
        ## Tokenization errors can cause a word to be too long for lmdb
        if len(name) > 100:
            continue
        if loaded % 10000 == 0:
            print("Loaded {} rows".format(loaded))
        yield name, numpy.array(fields, dtype=numpy.float32)


table.load(loader())
print("Finished loading")
unicorpus = digestion.LineCorpus(SOURCE, unidict) ### Creating the index tfidf = models.TfidfModel(unicorpus) corpus_tfidf = tfidf[unicorpus] import code code.interact(local=vars()) print "using LSI" unilsivstore = VStore(SOURCE+"vectors.lmdb", "lsi") unilsi = models.LsiModel(corpus_tfidf, chunksize=1000000, id2word=unidict, num_topics=300) # initialize an LSI transformation unilsi.save(SOURCE + '.unilsimodel') unilsi.print_topics(20) unilsivstore.drop() unilsivstore.load( (unidict[idnum].encode(), matutils.sparse2full(unilsi[[(idnum, 1)]], 300)) for idnum in unidict) print "using LDA" unildavstore = VStore(SOURCE+"vectors.lmdb", "mini-plain-lda") unilda = models.LdaMulticore(unicorpus, id2word=unidict, num_topics=300, chunksize=25000, passes=10, iterations=50, workers=40, batch=True) # Lda unilda.save(SOURCE+'.unildamodel') unilda.print_topics(20) unildavstore.drop() unildavstore.load( (unidict[idnum].encode(), matutils.sparse2full(unilda[[(idnum, 1)]], 300)) for idnum in unidict) '''print "using W2V" uniw2vvstore = VStore(SOURCE+"vectors.lmdb", "w2v")