示例#1
0
import argparse
import numpy
from vstore import VStore

# Command-line interface: target database name/file, merge-vs-replace
# behaviour, and the GloVe source file to ingest.
parser = argparse.ArgumentParser(description="Load GloVe vectors into LMDB")
parser.add_argument(
    "--name", action="store", type=str, default="glove", help="name of the database into which to load the vectors"
)
parser.add_argument("--dbfile", action="store", type=str, default="vectors.lmdb", help="shared database filename")
parser.add_argument("--merge", action="store_true", help="merge the new dataset, rather than replacing")
# NOTE(review): ``file`` as an argparse type is Python 2 only; on
# Python 3 this would be argparse.FileType("r").
parser.add_argument("source", type=file, help="uncompressed GloVe dataset")
args = parser.parse_args()

# Invert control in order to use one transaction
table = VStore(args.dbfile, args.name)
# Bug fix: --merge previously had no effect -- the table was dropped
# unconditionally. Only clear existing data when not merging.
if not args.merge:
    table.drop()


def loader(source=None):
    """Yield ``(word, vector)`` pairs parsed from GloVe text lines.

    Parameters
    ----------
    source : iterable of str, optional
        Lines of an uncompressed GloVe file ("word v1 v2 ...").
        Defaults to ``args.source``, the file handle parsed from the
        command line, preserving the original call signature.

    Yields
    ------
    tuple of (str, numpy.ndarray)
        The token and its float32 embedding vector.
    """
    if source is None:
        source = args.source
    for loaded, row in enumerate(source):
        fields = row.split()
        name = fields.pop(0)
        # Tokenization errors can produce over-long tokens that LMDB
        # rejects as keys; skip them (mirrors the guard in the other
        # loader variant in this file).
        if len(name) > 100:
            continue
        # Progress report every 10k input lines; single-argument
        # print() behaves identically on Python 2 and 3.
        if loaded % 10000 == 0:
            print("Loaded {} rows".format(loaded))

        yield name, numpy.array(fields, dtype=numpy.float32)


# Stream every (word, vector) pair through one transaction, then report.
table.load(loader())
print("Finished loading. Creating similarity index.")
示例#2
0
    # Train a 300-topic multicore LDA model over the corpus.
    # NOTE(review): chunksize/passes/iterations/workers look hand-tuned
    # for a specific machine -- confirm before reusing elsewhere.
    unilda = models.LdaMulticore(unicorpus, id2word=unidict, num_topics=300, chunksize=25000, passes=10, iterations=50, workers=40, batch=True) # Lda
    unilda.save(SOURCE+'.unilda_model')
    unilda.print_topics(20)
    # Replace any existing vectors, then store one dense 300-dim topic
    # vector per dictionary token, keyed by the encoded token text.
    unildavstore.drop()
    unildavstore.load(
        (unidict[idnum].encode(), matutils.sparse2full(unilda[[(idnum, 1)]], 300))
        for idnum in unidict)
    '''

    print "using W2V"
    uniw2vvstore = VStore(SOURCE+"vectors.lmdb", "w2v-word8-lines")
    uniw2v = models.word2vec.Word2Vec([line.split() for line in open('word8-lines.short')], size=300, window=5, min_count=5, workers=8)
    uniw2v.save('word8-lines.short.uniw2vmodel')
    uniw2vvstore.drop()
    uniw2vvstore.load(
        (rnidict[idnum].encode(), matutils.sparse2full(uniw2v[[(idnum, 1)]], 300))
         for idnum in unidict)

    #uniindex = similarities.MatrixSimilarity(unilsi[unicorpus], num_features=300)
    #uniindex.save('word8-lines.short.matsim')

## Get a query
import sys
sys.exit()
'''
# Interactive search loop: an empty query exits.
query = raw_input("Search: ")
while query:
    # Bag-of-words the query, project it through LSI, and rank all
    # documents by similarity, printing the top 20 (index, score) pairs.
    vec = unidict.doc2bow(query.lower().split())

    sims = uniindex[unilsi[vec]]
    print(sorted(list(enumerate(sims)), key=lambda x: -x[1])[:20])
    # Bug fix: the original never re-read ``query`` inside the loop, so
    # any non-empty first query looped forever on the same results.
    query = raw_input("Search: ")
示例#3
0
import numpy
from vstore import VStore

# ``argparse`` is not imported in this snippet's own header; importing
# here is harmless if the module is already loaded.
import argparse

# Command-line interface: target database name/file, merge-vs-replace
# behaviour, and the GloVe source file to ingest.
parser = argparse.ArgumentParser(description="Load GloVe vectors into LMDB")
parser.add_argument("--name", action="store", type=str, default="glove",
	help="name of the database into which to load the vectors")
parser.add_argument("--dbfile", action="store", type=str, default="vectors.lmdb",
	help="shared database filename")
parser.add_argument("--merge", action="store_true",
	help="merge the new dataset, rather than replacing")
# NOTE(review): ``file`` as an argparse type is Python 2 only; on
# Python 3 this would be argparse.FileType("r").
parser.add_argument("source", type=file,
	help="uncompressed GloVe dataset")
args = parser.parse_args()

# Invert control in order to use one transaction
table = VStore(args.dbfile, args.name)
# Bug fix: --merge previously had no effect -- the table was dropped
# unconditionally. Only clear existing data when not merging.
if not args.merge:
	table.drop()
def loader(source=None):
	"""Yield ``(word, vector)`` pairs parsed from GloVe text lines.

	Parameters
	----------
	source : iterable of str, optional
		Lines of an uncompressed GloVe file ("word v1 v2 ...").
		Defaults to ``args.source``, the file handle parsed from the
		command line, preserving the original call signature.

	Yields
	------
	tuple of (str, numpy.ndarray)
		The token and its float32 embedding vector.
	"""
	if source is None:
		source = args.source
	for loaded, row in enumerate(source):
		fields = row.split()
		name = fields.pop(0)
		## Tokenization errors can cause a word to be too long for lmdb
		if len(name) > 100:
			continue
		# Progress report every 10k input lines; single-argument
		# print() behaves identically on Python 2 and 3.
		if loaded % 10000 == 0:
			print("Loaded {} rows".format(loaded))

		yield name, numpy.array(fields, dtype=numpy.float32)
# Stream every (word, vector) pair through one transaction, then report.
table.load(loader())
print("Finished loading")
示例#4
0
    # Wrap the source file as a streaming corpus over the dictionary.
    unicorpus = digestion.LineCorpus(SOURCE, unidict)

    ### Creating the index
    # Weight the corpus with TF-IDF before the LSI projection.
    tfidf = models.TfidfModel(unicorpus)
    corpus_tfidf = tfidf[unicorpus]
    # NOTE(review): interactive debugging left in -- this drops into a
    # REPL and blocks the pipeline; remove before unattended runs.
    import code
    code.interact(local=vars())

    print "using LSI"
    # Project the TF-IDF corpus into a 300-dimensional LSI space and
    # persist both the model and one dense vector per dictionary token,
    # keyed by the encoded token text.
    unilsivstore = VStore(SOURCE+"vectors.lmdb", "lsi")
    unilsi = models.LsiModel(corpus_tfidf, chunksize=1000000, id2word=unidict, num_topics=300) # initialize an LSI transformation
    unilsi.save(SOURCE + '.unilsimodel')
    unilsi.print_topics(20)
    unilsivstore.drop()
    unilsivstore.load(
        (unidict[idnum].encode(), matutils.sparse2full(unilsi[[(idnum, 1)]], 300))
        for idnum in unidict)

    print "using LDA"
    # Train a 300-topic multicore LDA model over the corpus and store
    # one dense 300-dim topic vector per dictionary token.
    # NOTE(review): chunksize/passes/iterations/workers look hand-tuned
    # for a specific machine -- confirm before reusing elsewhere.
    unildavstore = VStore(SOURCE+"vectors.lmdb", "mini-plain-lda")
    unilda = models.LdaMulticore(unicorpus, id2word=unidict, num_topics=300, chunksize=25000, passes=10, iterations=50, workers=40, batch=True) # Lda
    # NOTE(review): another snippet in this file saves to
    # '.unilda_model' (with an underscore); confirm which filename the
    # downstream loaders expect.
    unilda.save(SOURCE+'.unildamodel')
    unilda.print_topics(20)
    unildavstore.drop()
    unildavstore.load(
        (unidict[idnum].encode(), matutils.sparse2full(unilda[[(idnum, 1)]], 300))
        for idnum in unidict)


    '''print "using W2V"
    uniw2vvstore = VStore(SOURCE+"vectors.lmdb", "w2v")