type='int', default=5, help='number of near neighbours to emit') opts, arguments = optparser.parse_args() print >> sys.stderr, "options", opts token_idx = TokenIdx() token_idx.read_from_file(opts.vocab) # checking that tokens are in vocab for token in opts.tokens.split(" "): if not token_idx.id_exists_for(token): print >> sys.stderr, "token [%s] not in vocab?" % token exit(1) E = np.load(opts.matrix_file) #lshf = LSHForest() #lshf.fit(E) #distances, indices = lshf.kneighbors(E[[token_idx.id_for("monday_NNS")]], n_neighbors=10) #for d, i in zip(distances[0], indices[0]): # print d, token_idx.token_for(i) ball_tree = BallTree(E, leaf_size=30) for token in opts.tokens.split(" "): print distances, indices = ball_tree.query(E[[token_idx.id_for(token)]], k=min(opts.k, E.shape[0])) for d, nn in zip(distances[0], indices[0]): print d, token_idx.token_for(nn)
# Command-line tool: for each requested token, print its nearest neighbours
# (distance, token) among the rows of a saved matrix, using a BallTree.
optparser.add_option('--vocab', None, dest='vocab', type='string',
                     default='vocab.tsv', help='vocab for token idx')
optparser.add_option('--matrix-file', None, dest='matrix_file', type='string',
                     help='np matrix file to load; eg ckpt.X.E')
optparser.add_option('--tokens', None, dest='tokens', type='string',
                     help='space separated list of tokens to emit NNs for')
optparser.add_option('--k', None, dest='k', type='int', default=5,
                     help='number of near neighbours to emit')
opts, arguments = optparser.parse_args()
sys.stderr.write("options %s\n" % (opts,))

# --matrix-file and --tokens have no defaults, so they are None when omitted.
# Reject that here instead of failing later with an AttributeError on None or
# an error from inside np.load. optparser.error() prints usage and exits with
# status 2.
if opts.matrix_file is None:
    optparser.error("--matrix-file is required")
# split() with no argument tolerates repeated / leading / trailing whitespace,
# which split(" ") would turn into bogus empty tokens.
tokens = (opts.tokens or "").split()
if not tokens:
    optparser.error("--tokens must name at least one token")
if opts.k < 1:
    optparser.error("--k must be a positive integer")

token_idx = TokenIdx()
token_idx.read_from_file(opts.vocab)

# Check that every requested token is in the vocab before doing the
# (potentially slow) matrix load; bail out on the first unknown token.
for token in tokens:
    if not token_idx.id_exists_for(token):
        sys.stderr.write("token [%s] not in vocab?\n" % token)
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under `python -S`).
        sys.exit(1)

E = np.load(opts.matrix_file)

# Disabled alternative: approximate neighbours via LSHForest.
#lshf = LSHForest()
#lshf.fit(E)
#distances, indices = lshf.kneighbors(E[[token_idx.id_for("monday_NNS")]], n_neighbors=10)
#for d, i in zip(distances[0], indices[0]):
#    print d, token_idx.token_for(i)

ball_tree = BallTree(E, leaf_size=30)
# Never ask for more neighbours than there are rows in E; loop-invariant, so
# computed once.
num_neighbours = min(opts.k, E.shape[0])
for token in tokens:
    # Blank line separates the output for consecutive tokens.
    print("")
    # E[[row]] keeps the query 2-D (one query point).
    # NOTE(review): the query row is itself in the tree, so the nearest hit is
    # presumably the token itself - confirm that is intended.
    distances, indices = ball_tree.query(E[[token_idx.id_for(token)]],
                                         k=num_neighbours)
    for d, nn in zip(distances[0], indices[0]):
        print("%s %s" % (d, token_idx.token_for(nn)))
# Emit a TSV describing, for every checkpoint file after the first, how far
# each row of the loaded matrix is (Euclidean distance) from the same row in
# the previous file (x_l_dist) and in the first file (x_f_dist).

# Fail with a clear message instead of an IndexError on files[0] below.
if not files:
    raise Exception("no checkpoint files to compare")

# sanity check files are consistent with prefix and vars
prefix, _time, var = prefix_time_var_of_ckpt(files[0])
for f in files:
    next_prefix, _time, next_var = prefix_time_var_of_ckpt(f)
    if prefix != next_prefix or next_var != var:
        raise Exception("glob includes files that dont match in prefix or var")

# TSV header row.
print("\t".join("ckpt_time idx token x_l_dist x_f_dist".split()))

first = None  # always compare to first
last = None  # also compare to last
for f in files:
    X = np.load(f)
    if ids:
        # Restrict the comparison to the requested rows only.
        X = X[ids]
    # The first file only seeds the baselines; rows are emitted from the
    # second file onwards.
    if last is not None:
        _prefix, ckpt_time, _var = prefix_time_var_of_ckpt(f)
        # Per-row Euclidean distance to the previous and to the first matrix.
        row_wise_distance_X_L = np.sqrt(np.sum((X - last) ** 2, axis=1))
        row_wise_distance_X_F = np.sqrt(np.sum((X - first) ** 2, axis=1))
        for i, (x_l_dist, x_f_dist) in enumerate(zip(row_wise_distance_X_L,
                                                     row_wise_distance_X_F)):
            # When rows were subselected, map the position back to the
            # original id so the emitted idx / token are correct.
            t_id = ids[i] if ids else i
            print("%s\t%d\t%s\t%f\t%f" % (ckpt_time, t_id,
                                          token_idx.token_for(t_id),
                                          x_l_dist, x_f_dist))
    if first is None:
        first = X
    last = X