예제 #1
0
#!/usr/bin/env python
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import Isomap, TSNE
from analogy import Analogy
from vstore import VStore

a = Analogy(VStore("vectors.lmdb", "big-glove"))

# Read whitespace-separated words from the prompt until an empty line is
# entered; words from every line are pooled into one list.
words = []
linebuf = raw_input("Please enter some words to plot, or empty for a canned list: ")
while linebuf:
    words.extend(linebuf.split())
    linebuf = raw_input("... ")

# Fall back to a canned list when the user entered nothing.
if not words:
    words = "doctor nurse politician senator lawyer barrister defend accuse heal treat cure elect vote".split()

# Look every word up exactly once and drop the ones without a vector.
# Labels and vectors are filtered together so that the i-th label always
# belongs to the i-th projected point; filtering only the vectors would
# shift every label after the first unknown word onto the wrong point.
labels = []
vs = []
for word in words:
    vec = a.w(word)
    if vec is not None:
        labels.append(word)
        vs.append(vec)

# Without any vectors there is nothing to project; stop with a clear message
# instead of failing inside the projection.
if not vs:
    raise SystemExit("None of the given words has a vector; nothing to plot.")

# Project the vectors down to two dimensions for plotting.
flatplot = TSNE(2)
ps = flatplot.fit_transform(vs)

# NOTE(review): the axis labels mention principal components although the
# projection above is t-SNE; left unchanged because they are user-facing text.
plt.title("Reduced vector space model")
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.scatter(ps[:, 0], ps[:, 1])
for (x, y), label in zip(ps, labels):
    print("plotting %f, %f, %s" % (x, y, label))
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
예제 #2
0
import sys
import argparse
import numpy
from vstore import VStore

# Command-line interface: which named database inside which LMDB file to fill,
# whether to keep what is already stored there, and the GloVe text file to read.
parser = argparse.ArgumentParser(description="Load GloVe vectors into LMDB")
parser.add_argument(
    "--name", action="store", type=str, default="glove", help="name of the database into which to load the vectors"
)
parser.add_argument("--dbfile", action="store", type=str, default="vectors.lmdb", help="shared database filename")
parser.add_argument("--merge", action="store_true", help="merge the new dataset, rather than replacing")
# argparse.FileType opens the given path for reading at parse time and reports
# an unreadable path as a normal usage error.
parser.add_argument("source", type=argparse.FileType("r"), help="uncompressed GloVe dataset")
args = parser.parse_args()

# Invert control in order to use one transaction
table = VStore(args.dbfile, args.name)
# Only discard the existing contents when replacing. With --merge the stored
# vectors are kept and the new rows are loaded on top of them.
if not args.merge:
    table.drop()


def loader():
    """Yield ``(word, vector)`` pairs parsed from ``args.source``.

    Each line is split on whitespace: the first field is the word and the
    remaining fields are converted to a ``numpy.float32`` array. Blank lines
    are skipped. A progress message is printed whenever the line index is a
    multiple of 10000.
    """
    for loaded, line in enumerate(args.source):
        fields = line.split()
        # A blank (or whitespace-only) line has no word field; skip it rather
        # than abort the whole load with an IndexError.
        if not fields:
            continue
        if loaded % 10000 == 0:
            print("Loaded {} rows".format(loaded))

        yield fields[0], numpy.array(fields[1:], dtype=numpy.float32)


# Hand the lazily parsed rows to the store, then report completion.
rows = loader()
table.load(rows)
print("Finished loading. Creating similarity index.")
예제 #3
0
import numpy
from vstore import VStore

# Command-line interface: which named database inside which LMDB file to fill,
# whether to keep what is already stored there, and the GloVe text file to read.
parser = argparse.ArgumentParser(description="Load GloVe vectors into LMDB")
parser.add_argument("--name", action="store", type=str, default="glove",
                    help="name of the database into which to load the vectors")
parser.add_argument("--dbfile", action="store", type=str, default="vectors.lmdb",
                    help="shared database filename")
parser.add_argument("--merge", action="store_true",
                    help="merge the new dataset, rather than replacing")
# argparse.FileType opens the given path for reading at parse time and reports
# an unreadable path as a normal usage error.
parser.add_argument("source", type=argparse.FileType("r"),
                    help="uncompressed GloVe dataset")
args = parser.parse_args()

# Invert control in order to use one transaction
table = VStore(args.dbfile, args.name)
# Only discard the existing contents when replacing. With --merge the stored
# vectors are kept and the new rows are loaded on top of them.
if not args.merge:
    table.drop()
def loader():
    """Yield ``(word, vector)`` pairs parsed from ``args.source``.

    Each line is split on whitespace: the first field is the word and the
    remaining fields are converted to a ``numpy.float32`` array. Blank lines
    and words longer than 100 characters are skipped. A progress message is
    printed whenever the index of a kept line is a multiple of 10000.
    """
    for loaded, line in enumerate(args.source):
        fields = line.split()
        # A blank (or whitespace-only) line has no word field; skip it rather
        # than abort the whole load with an IndexError.
        if not fields:
            continue
        name = fields[0]
        ## Tokenization errors can cause a word to be too long for lmdb
        if len(name) > 100:
            continue
        if loaded % 10000 == 0:
            print("Loaded {} rows".format(loaded))

        yield name, numpy.array(fields[1:], dtype=numpy.float32)
# Hand the lazily parsed rows to the store, then report completion.
rows = loader()
table.load(rows)
print("Finished loading")
예제 #4
0
        for idnum in unidict)
    
    
    print "using LDA"
    unildavstore = VStore(SOURCE+"vectors.lmdb", "mini-plain-lda")
    unilda = models.LdaMulticore(unicorpus, id2word=unidict, num_topics=300, chunksize=25000, passes=10, iterations=50, workers=40, batch=True) # Lda
    unilda.save(SOURCE+'.unilda_model')
    unilda.print_topics(20)
    unildavstore.drop()
    unildavstore.load(
        (unidict[idnum].encode(), matutils.sparse2full(unilda[[(idnum, 1)]], 300))
        for idnum in unidict)
    '''

    print("using W2V")
    uniw2vvstore = VStore(SOURCE+"vectors.lmdb", "w2v-word8-lines")
    # Read the tokenized corpus inside a `with` block so the file handle is
    # closed before training starts instead of being left open.
    with open('word8-lines.short') as corpus_file:
        sentences = [line.split() for line in corpus_file]
    uniw2v = models.word2vec.Word2Vec(sentences, size=300, window=5, min_count=5, workers=8)
    uniw2v.save('word8-lines.short.uniw2vmodel')
    uniw2vvstore.drop()
    # The dictionary is `unidict`, as in the sibling LSI/LDA sections; the
    # previous spelling `rnidict` was an undefined name.
    # NOTE(review): indexing the word2vec model with a bag-of-words pair
    # mirrors the LSI/LDA sections, but a word2vec model is normally indexed
    # by word -- confirm this lookup against the gensim version in use.
    uniw2vvstore.load(
        (unidict[idnum].encode(), matutils.sparse2full(uniw2v[[(idnum, 1)]], 300))
        for idnum in unidict)

    #uniindex = similarities.MatrixSimilarity(unilsi[unicorpus], num_features=300)
    #uniindex.save('word8-lines.short.matsim')

## Get a query
# NOTE(review): the script exits here unconditionally, so nothing below this
# point runs. The interactive query code that follows appears to be disabled
# on purpose (it sits inside a triple-quoted string) -- confirm before
# re-enabling it.
import sys
sys.exit()
'''
query = raw_input("Search: ")
예제 #5
0
    # Persist the dictionary next to the corpus (path: SOURCE + '.dict') and
    # print it.
    unidict.save(SOURCE + '.dict')
    print(unidict)

    # Alternative kept for reference: load a previously saved dictionary
    # instead of the one saved above.
    #unidict = corpora.Dictionary.load("word8.dict")

    ### Preprocessing
    # Build the corpus object from SOURCE and the dictionary.
    # NOTE(review): presumably LineCorpus treats each line of SOURCE as one
    # document -- confirm in the digestion module.
    unicorpus = digestion.LineCorpus(SOURCE, unidict)

    ### Creating the index
    # Fit a TF-IDF model on the corpus, then apply it to the same corpus.
    tfidf = models.TfidfModel(unicorpus)
    corpus_tfidf = tfidf[unicorpus]
    # Debugging aid: open an interactive console that sees every name defined
    # so far. The statements below only run once that console is exited.
    import code
    code.interact(local=vars())

    print("using LSI")
    unilsivstore = VStore(SOURCE + "vectors.lmdb", "lsi")
    # initialize an LSI transformation
    unilsi = models.LsiModel(
        corpus_tfidf,
        chunksize=1000000,
        id2word=unidict,
        num_topics=300,
    )
    unilsi.save(SOURCE + '.unilsimodel')
    unilsi.print_topics(20)
    # Replace whatever the "lsi" store held with one 300-dimensional dense
    # vector per dictionary entry: each word is projected as a one-word
    # document with count 1.
    unilsivstore.drop()
    unilsivstore.load(
        (
            unidict[word_id].encode(),
            matutils.sparse2full(unilsi[[(word_id, 1)]], 300),
        )
        for word_id in unidict
    )

    print("using LDA")
    unildavstore = VStore(SOURCE + "vectors.lmdb", "mini-plain-lda")
    # Lda
    unilda = models.LdaMulticore(
        unicorpus,
        id2word=unidict,
        num_topics=300,
        chunksize=25000,
        passes=10,
        iterations=50,
        workers=40,
        batch=True,
    )
    unilda.save(SOURCE + '.unildamodel')
    unilda.print_topics(20)
    # Empty the store before it is refilled by the load call that follows.
    unildavstore.drop()
    unildavstore.load(
예제 #6
0
            return (np.sum(left * right) /
                    (np.sqrt(np.sum(left**2)) * np.sqrt(np.sum(right**2))))
        return None


if __name__ == '__main__':
    # Summarize arrays of more than 20 elements when they are printed, so
    # vectors echoed at the prompt stay readable.
    np.set_printoptions(threshold=20)

    parser = ArgumentParser(description="Perform simple algebra on words")
    parser.add_argument(
        "--dbfile",
        default="vectors.lmdb",
        help="use this database file to get vectors",
    )
    parser.add_argument(
        "model",
        action="store",
        help="compare using this model database (e.g. glove)",
    )
    args = parser.parse_args()
    model = VStore(args.dbfile, args.model)

    analogy = Analogy(model)
    # Short aliases made available inside the interactive session below.
    w, pcaw, context, sim = analogy.w, analogy.pcaw, analogy.context, analogy.sim

    # Usage notes shown before the prompt starts.
    print("\n".join([
        "What follows is a python prompt.",
        "w('elicidate') --> vector for `elicidate`",
        "w('mogrify') + w('frobnicate') --> vector sum",
        "    same for -, *, /, **, etc as usual for numpy",
        "sim(w('republican'), w('democrat')) -> society in a 32bit float",
        "    (actually a simple cosine similarity)",
    ]))
    # Drop into a console that sees every name defined in this module.
    code.interact(local=vars())