def nearest_word(vector, vector_lib, index2word, n=5, skip=0,
                 chunk_size=100000, use_ne=False, use_shortdot=True,
                 thresh=0.0):
    """Return the ``n`` rows of ``vector_lib`` that score highest against ``vector``.

    Exactly one of three scoring back ends is used:

    * ``use_ne``: a row-wise sum of ``vector_lib * vector`` evaluated through
      ``ne.evaluate``.
    * ``use_shortdot`` (the default): ``shortdot.shortdot`` fills a float32
      score buffer with one entry per row of ``vector_lib``.
    * otherwise: ``vector_lib`` is scored block by block with ``similarity``;
      the best ``n`` rows of every block are kept as candidates and the
      candidates are then ranked against each other.

    Args:
        vector: Query vector.
        vector_lib: 2-D array of library vectors, one per row.
        index2word: Maps a row index of ``vector_lib`` to its word.
        n: Maximum number of results to return.
        skip: Accepted for backward compatibility; currently unused.
        chunk_size: Rows per block in the chunked fallback path.
        use_ne: Select the ``ne.evaluate`` back end.
        use_shortdot: Select the ``shortdot`` back end (only consulted when
            ``use_ne`` is false).
        thresh: Forwarded unchanged to ``shortdot.shortdot``.

    Returns:
        A tuple ``(words, vectors, sim)`` of three parallel lists ordered from
        best to worst match: the words, the matching rows of ``vector_lib``
        and their scores.
    """
    if use_ne:
        # `ne` is presumably numexpr, which resolves `vector_lib` and `vector`
        # by name from the calling frame -- keep these local names unchanged.
        d = ne.evaluate('sum(vector_lib * vector, axis=1)')
        idx = np.argsort(d)[::-1][:n]
        sim = [d[i] for i in idx]
    elif use_shortdot:
        d = np.zeros(vector_lib.shape[0], dtype='f4')
        # NOTE(review): the meaning of the literal 100 is defined by shortdot
        # and is not visible from this function.
        shortdot.shortdot(vector_lib, vector, d, 100, thresh)
        idx = np.argsort(d)[::-1][:n]
        sim = [d[i] for i in idx]
    else:
        # Track the *global* row index and score of every candidate.  The
        # previous code indexed vector_lib with positions in the candidate
        # list and read scores from the last block only, so `vectors` and
        # `sim` did not correspond to the returned words.
        cand_rows = []
        cand_sims = []
        offset = 0
        for vl in chunks(vector_lib, chunk_size):
            d = similarity(vl, vector)
            for local in np.argsort(d)[::-1][:n]:
                cand_rows.append(local + offset)
                cand_sims.append(d[local])
            # Assumes chunks() yields consecutive blocks of chunk_size rows
            # (only the last may be shorter) -- TODO confirm against chunks().
            offset += chunk_size
        order = np.argsort(cand_sims)[::-1][:n]
        idx = [cand_rows[i] for i in order]
        sim = [cand_sims[i] for i in order]

    words = [index2word[i] for i in idx]
    vectors = [vector_lib[i] for i in idx]
    return words, vectors, sim
def nearest_word(vector, vector_lib, index2word, n=5, skip=0,
                 chunk_size=100000, use_ne=False, use_shortdot=True,
                 thresh=0.0):
    """Return the ``n`` rows of ``vector_lib`` that score highest against ``vector``.

    Exactly one of three scoring back ends is used:

    * ``use_ne``: a row-wise sum of ``vector_lib * vector`` evaluated through
      ``ne.evaluate``.
    * ``use_shortdot`` (the default): ``shortdot.shortdot`` fills a float32
      score buffer with one entry per row of ``vector_lib``.
    * otherwise: ``vector_lib`` is scored block by block with ``similarity``;
      the best ``n`` rows of every block are kept as candidates and the
      candidates are then ranked against each other.

    Args:
        vector: Query vector.
        vector_lib: 2-D array of library vectors, one per row.
        index2word: Maps a row index of ``vector_lib`` to its word.
        n: Maximum number of results to return.
        skip: Accepted for backward compatibility; currently unused.
        chunk_size: Rows per block in the chunked fallback path.
        use_ne: Select the ``ne.evaluate`` back end.
        use_shortdot: Select the ``shortdot`` back end (only consulted when
            ``use_ne`` is false).
        thresh: Forwarded unchanged to ``shortdot.shortdot``.

    Returns:
        A tuple ``(words, vectors, sim)`` of three parallel lists ordered from
        best to worst match: the words, the matching rows of ``vector_lib``
        and their scores.
    """
    if use_ne:
        # `ne` is presumably numexpr, which resolves `vector_lib` and `vector`
        # by name from the calling frame -- keep these local names unchanged.
        d = ne.evaluate('sum(vector_lib * vector, axis=1)')
        idx = np.argsort(d)[::-1][:n]
        sim = [d[i] for i in idx]
    elif use_shortdot:
        d = np.zeros(vector_lib.shape[0], dtype='f4')
        # NOTE(review): the meaning of the literal 100 is defined by shortdot
        # and is not visible from this function.
        shortdot.shortdot(vector_lib, vector, d, 100, thresh)
        idx = np.argsort(d)[::-1][:n]
        sim = [d[i] for i in idx]
    else:
        # Track the *global* row index and score of every candidate.  The
        # previous code indexed vector_lib with positions in the candidate
        # list and read scores from the last block only, so `vectors` and
        # `sim` did not correspond to the returned words.
        cand_rows = []
        cand_sims = []
        offset = 0
        for vl in chunks(vector_lib, chunk_size):
            d = similarity(vl, vector)
            for local in np.argsort(d)[::-1][:n]:
                cand_rows.append(local + offset)
                cand_sims.append(d[local])
            # Assumes chunks() yields consecutive blocks of chunk_size rows
            # (only the last may be shorter) -- TODO confirm against chunks().
            offset += chunk_size
        order = np.argsort(cand_sims)[::-1][:n]
        idx = [cand_rows[i] for i in order]
        sim = [cand_sims[i] for i in order]

    words = [index2word[i] for i in idx]
    vectors = [vector_lib[i] for i in idx]
    return words, vectors, sim
# Benchmark fragment (Python 2 print statements) comparing shortdot.shortdot
# with np.dot.
#
# NOTE(review): the `if` that owns the leading `thresh = -1.0` /
# `print "created matrix"` statements and the `else:` below is not visible
# here, and it is unclear from this line which of the following statements
# belong to the `else:` branch -- confirm against the full file.
#
# Visible behaviour:
#   * else branch: load the matrix A from a hard-coded .npy path and cast it
#     to float32; B is the row of A at index A.shape[0]/2; C is created as
#     float64 zeros and later recast to float32; thresh is set to 0.0.
#   * rows/dims are A's two dimensions; n = 20 is the repetition count.
#   * First timed loop: allocate a fresh float32 C and call
#     shortdot.shortdot(A, B, C, 50, thresh) n times.  `skipped` holds the
#     return value of the last call; frac = skipped / (rows * dims) is
#     printed as a percentage under the "cython" label.
#   * cy is the mean wall time per iteration in microseconds (seconds * 1e6).
#   * Second timed loop: allocate x (not otherwise used here -- presumably to
#     mirror the allocation cost of the first loop) and compute
#     D = np.dot(A, B) n times; py is the mean time in microseconds.
#   * cy and py are not printed within this fragment.
#   * The five largest values of C and D and their indices are printed.
#   * comp holds the positions at which the descending argsort of D and of C
#     differ; the first ten are printed.
thresh = -1.0 print "created matrix" else: A = np.load("/nobackupp5/cmoody3/data/ids/trained/vectors.fullwiki.1000.s50.num.npy") A = A.astype('f4') B = A[A.shape[0]/2] C = np.zeros(A.shape[0]).astype('f8') thresh = 0.0 rows = A.shape[0] dims = A.shape[1] C = C.astype('f4') n = 20 start = time.time() for i in range(n): C = np.zeros(rows).astype('f4') skipped = shortdot.shortdot(A, B, C, 50, thresh) stop = time.time() frac = skipped * 1.0 / (rows * dims) print "finished cython, skipped %i, %1.1f%%" % (skipped, frac * 100.0) cy = (stop - start) * 1.0 / n * 1e6 print 'cython top 5:', np.sort(C)[-5:], np.argsort(C)[-5:] start = time.time() for i in range(n): x = np.zeros(rows).astype('f4') D = np.dot(A, B) stop = time.time() py = (stop - start) * 1.0 / n * 1e6 print 'numpy top 5:', np.sort(D)[-5:], np.argsort(D)[-5:] comp = np.where(np.argsort(D)[::-1] != np.argsort(C)[::-1])[0] print "first inequal indices", comp[:10]
# Benchmark fragment (Python 2 print statements) comparing shortdot.shortdot
# with np.dot.
#
# NOTE(review): the `if` that this `else:` belongs to is not visible here,
# and it is unclear from this line which of the following statements belong
# to the `else:` branch -- confirm against the full file.
#
# Visible behaviour:
#   * else branch: load the matrix A from a hard-coded .npy path and cast it
#     to float32; B is the row of A at index A.shape[0] / 2; C is created as
#     float64 zeros and later recast to float32; thresh is set to 0.0.
#   * rows/dims are A's two dimensions; n = 20 is the repetition count.
#   * First timed loop: allocate a fresh float32 C and call
#     shortdot.shortdot(A, B, C, 50, thresh) n times.  `skipped` holds the
#     return value of the last call; frac = skipped / (rows * dims) is
#     printed as a percentage under the "cython" label.
#   * cy is the mean wall time per iteration in microseconds (seconds * 1e6).
#   * Second timed loop: allocate x (not otherwise used here -- presumably to
#     mirror the allocation cost of the first loop) and compute
#     D = np.dot(A, B) n times; py is the mean time in microseconds.
#   * cy and py are not printed within this fragment.
#   * The five largest values of C and D and their indices are printed.
#   * comp holds the positions at which the descending argsort of D and of C
#     differ; the first ten are printed.
else: A = np.load( "/nobackupp5/cmoody3/data/ids/trained/vectors.fullwiki.1000.s50.num.npy" ) A = A.astype('f4') B = A[A.shape[0] / 2] C = np.zeros(A.shape[0]).astype('f8') thresh = 0.0 rows = A.shape[0] dims = A.shape[1] C = C.astype('f4') n = 20 start = time.time() for i in range(n): C = np.zeros(rows).astype('f4') skipped = shortdot.shortdot(A, B, C, 50, thresh) stop = time.time() frac = skipped * 1.0 / (rows * dims) print "finished cython, skipped %i, %1.1f%%" % (skipped, frac * 100.0) cy = (stop - start) * 1.0 / n * 1e6 print 'cython top 5:', np.sort(C)[-5:], np.argsort(C)[-5:] start = time.time() for i in range(n): x = np.zeros(rows).astype('f4') D = np.dot(A, B) stop = time.time() py = (stop - start) * 1.0 / n * 1e6 print 'numpy top 5:', np.sort(D)[-5:], np.argsort(D)[-5:] comp = np.where(np.argsort(D)[::-1] != np.argsort(C)[::-1])[0] print "first inequal indices", comp[:10]