def main():
    """Interactively print the nearest neighbours of words typed by the user.

    Loads a vector file (dense by default, sparse with ``--sparse``),
    optionally normalizes it, then repeatedly prompts for a word.  For each
    in-vocabulary word it prints the ``--top_num`` other vocabulary words
    with the highest similarity score, best first.  Typing ``EXIT`` leaves
    the loop; a word that is not in the vocabulary prints
    "Out of vocabulary" and prompts again.
    """
    import heapq  # local import: only this entry point needs it

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--vector_file", type=str, required=True,
                        help="Path to the vector file.")
    parser.add_argument('--sparse', action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize', action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--top_num", type=int, default=10,
                        help="The number of neighbours returned.")
    args = parser.parse_args()

    if args.sparse:
        matrix, vocab, _ = load_sparse(args.vector_file)
    else:
        matrix, vocab, _ = load_dense(args.vector_file)
    if args.normalize:
        matrix = normalize(matrix, args.sparse)
    top_num = args.top_num

    while True:
        target = input("Enter a word (EXIT to break): ")
        if target == "EXIT":
            break
        if target not in vocab["i2w"]:
            print("Out of vocabulary")
            continue

        # A one-word vocabulary holding only the target, so the similarity
        # matrix is read from row 0 below.
        target_vocab = {"i2w": [target], "w2i": {target: 0}}
        sim_matrix = prepare_similarities(matrix, target_vocab, vocab,
                                          args.sparse)

        # Every vocabulary word except the target, paired with its score.
        candidates = (
            (word, sim_matrix[0, index])
            for index, word in enumerate(vocab["i2w"])
            if word != target
        )
        # nlargest is stable for ties (earlier words win), matching the
        # strict ">" used by the previous manual insertion.  Unlike that
        # insertion it never drops a low-scoring candidate while fewer than
        # top_num neighbours have been collected.
        neighbours = heapq.nlargest(top_num, candidates,
                                    key=lambda pair: pair[1])

        print("{0: <20} {1: <20}".format("word", "similarity"))
        for w, sim in neighbours:
            print("{0: <20} {1: <20}".format(w, sim))
def main():
    """Factorize a sparse PPMI matrix with ``sparsesvd`` and save the results.

    Reads the input/output vocabularies and the PPMI matrix named on the
    command line, optionally normalizes the matrix, and runs ``sparsesvd``
    with ``--size`` as its second argument.  The three returned factors are
    written next to ``--svd_file`` as ``.ut.npy``, ``.s.npy`` and
    ``.vt.npy``; the transposed ``ut`` and ``vt`` are additionally written
    with ``save_dense`` as ``.input`` and ``.output`` together with the
    input and output vocabularies respectively.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Required string options, registered in the order shown by --help.
    required_paths = (
        ("--ppmi_file", "Path to the counts (matrix) file."),
        ("--svd_file", "Path to the SVD file."),
        ("--input_vocab_file", "Path to the input vocabulary file."),
        ("--output_vocab_file", "Path to the output vocabulary file."),
    )
    for flag, description in required_paths:
        cli.add_argument(flag, type=str, required=True, help=description)
    cli.add_argument("--size", type=int, default=100, help="Vector size.")
    cli.add_argument("--normalize", action="store_true",
                     help="If set, we factorize normalized PPMI matrix")
    opts = cli.parse_args()

    print("Ppmi2svd")

    input_vocab, _ = load_vocabulary(opts.input_vocab_file)
    output_vocab, _ = load_vocabulary(opts.output_vocab_file)
    ppmi, _, _ = load_sparse(opts.ppmi_file)
    if opts.normalize:
        ppmi = normalize(ppmi, sparse=True)

    csc_ppmi = ppmi.tocsc()
    ut, s, vt = sparsesvd(csc_ppmi, opts.size)

    # Raw factors first, then the vocabulary-labelled embeddings.
    prefix = opts.svd_file
    for suffix, factor in ((".ut.npy", ut), (".s.npy", s), (".vt.npy", vt)):
        np.save(prefix + suffix, factor)
    save_dense(prefix + ".input", ut.T, input_vocab)
    save_dense(prefix + ".output", vt.T, output_vocab)

    print("Ppmi2svd finished")
def main():
    """Evaluate word vectors on a word-similarity test set.

    Loads the test set and the vectors, optionally combines input and
    output vectors according to ``--ensemble`` (dense vectors only; the
    option has no effect together with ``--sparse``), optionally
    normalizes, and prints:

    * ``seen/total``: how many test pairs received a similarity score
      (``similarity`` returned something other than ``None``), and
    * the Spearman correlation between computed and expected scores,
      or ``nan`` when no pair received a score.

    Exits with an argparse usage error when ``--ensemble`` needs the output
    vectors but ``--output_vector_file`` was not given.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_vector_file", type=str, required=True,
                        help="Path to the input vector file.")
    parser.add_argument("--output_vector_file", type=str,
                        help="Path to the output vector file.")
    parser.add_argument("--test_file", type=str, required=True,
                        help="Path to the similarity task.")
    parser.add_argument('--sparse', action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize', action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--ensemble", type=str, default="input",
                        choices=["input", "output", "add", "concat"],
                        help="Strategies for using input/output vectors. "
                             "One can use only input, only output, the "
                             "addition of input and output, or their "
                             "concatenation. Options are "
                             "[input|output|add|concat].")
    args = parser.parse_args()

    # Fail fast with a usage message instead of passing None to load_dense.
    uses_output = not args.sparse and args.ensemble != "input"
    if uses_output and args.output_vector_file is None:
        parser.error("--output_vector_file is required when --ensemble is "
                     "'{}'".format(args.ensemble))

    testset = load_similarity(args.test_file)
    if args.sparse:
        matrix, vocab, _ = load_sparse(args.input_vector_file)
    else:
        matrix, vocab, _ = load_dense(args.input_vector_file)

    if not args.sparse:
        if args.ensemble in ("add", "concat"):
            output_matrix, output_vocab, _ = load_dense(
                args.output_vector_file)
            output_matrix = align_matrix(matrix, output_matrix,
                                         vocab, output_vocab)
            if args.ensemble == "add":
                matrix = matrix + output_matrix
            else:
                matrix = np.concatenate([matrix, output_matrix], axis=1)
        elif args.ensemble == "output":
            matrix, vocab, _ = load_dense(args.output_vector_file)
        # args.ensemble == "input": keep the input vectors unchanged.

    if args.normalize:
        matrix = normalize(matrix, args.sparse)

    results = []
    for (w1, w2), sim_expected in testset:
        sim_actual = similarity(matrix, vocab["w2i"], w1, w2, args.sparse)
        if sim_actual is not None:
            results.append((sim_actual, sim_expected))

    print("seen/total: {}/{}".format(len(results), len(testset)))
    if results:
        actual, expected = zip(*results)
        correlation = spearmanr(actual, expected)[0]
    else:
        # zip(*[]) cannot be unpacked into two sequences; with no scored
        # pair the correlation is undefined.
        correlation = float("nan")
    print("{}: {:.3f}".format(args.test_file, correlation))
def main():
    """Evaluate word vectors on a word-analogy test set.

    Loads the analogy questions and the vectors, optionally combines input
    and output vectors according to ``--ensemble`` (dense vectors only; the
    option has no effect together with ``--sparse``), optionally
    normalizes, and then calls ``guess`` for every question whose first
    three words are all in the vocabulary.  Prints:

    * ``seen/total``: how many questions were attempted, and
    * the accuracy of the two guesses returned by ``guess`` (additive and
      multiplicative, going by the variable names), or ``nan`` for both
      when no question could be attempted.

    Exits with an argparse usage error when ``--ensemble`` needs the output
    vectors but ``--output_vector_file`` was not given.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_vector_file", type=str, required=True,
                        help="")
    parser.add_argument("--output_vector_file", type=str, help="")
    parser.add_argument("--test_file", type=str, required=True, help="")
    parser.add_argument('--sparse', action='store_true',
                        help="Load sparse representation.")
    parser.add_argument('--normalize', action='store_true',
                        help="If set, vector is normalized.")
    parser.add_argument("--ensemble", type=str, default="input",
                        choices=["input", "output", "add", "concat"],
                        help="Strategies for using input/output vectors. "
                             "One can use only input, only output, the "
                             "addition of input and output, or their "
                             "concatenation. Options are "
                             "[input|output|add|concat].")
    args = parser.parse_args()

    # Fail fast with a usage message instead of passing None to load_dense.
    uses_output = not args.sparse and args.ensemble != "input"
    if uses_output and args.output_vector_file is None:
        parser.error("--output_vector_file is required when --ensemble is "
                     "'{}'".format(args.ensemble))

    testset = load_analogy(args.test_file)
    ana_vocab = {}
    ana_vocab["i2w"], ana_vocab["w2i"] = get_ana_vocab(testset)

    if args.sparse:
        matrix, vocab, _ = load_sparse(args.input_vector_file)
    else:
        matrix, vocab, _ = load_dense(args.input_vector_file)

    if not args.sparse:
        if args.ensemble in ("add", "concat"):
            output_matrix, output_vocab, _ = load_dense(
                args.output_vector_file)
            output_matrix = align_matrix(matrix, output_matrix,
                                         vocab, output_vocab)
            if args.ensemble == "add":
                matrix = matrix + output_matrix
            else:
                matrix = np.concatenate([matrix, output_matrix], axis=1)
        elif args.ensemble == "output":
            matrix, vocab, _ = load_dense(args.output_vector_file)
        # args.ensemble == "input": keep the input vectors unchanged.

    if args.normalize:
        matrix = normalize(matrix, args.sparse)

    matrix, vocab["i2w"], vocab["w2i"] = retain_words(
        matrix, vocab["i2w"], vocab["w2i"])
    sim_matrix = prepare_similarities(matrix, ana_vocab, vocab,
                                      sparse=args.sparse)

    # Build the membership set once: testing against the i2w list directly
    # costs a linear scan for each of the three words of every question.
    known_words = set(vocab["i2w"])

    seen, correct_add, correct_mul = 0, 0, 0
    for a, a_, b, b_ in testset:
        if (a not in known_words or a_ not in known_words
                or b not in known_words):
            continue
        seen += 1
        guess_add, guess_mul = guess(sim_matrix, ana_vocab, vocab, a, a_, b)
        if guess_add == b_:
            correct_add += 1
        if guess_mul == b_:
            correct_mul += 1

    if seen:
        accuracy_add = float(correct_add) / seen
        accuracy_mul = float(correct_mul) / seen
    else:
        # No question was covered by the vocabulary: accuracy is undefined,
        # and dividing by seen would raise ZeroDivisionError.
        accuracy_add = accuracy_mul = float("nan")

    print("seen/total: {}/{}".format(seen, len(testset)))
    print("{}: {:.3f} {:.3f}".format(args.test_file, accuracy_add,
                                     accuracy_mul))