def main() -> None:  # pylint: disable=missing-function-docstring
    """Entry point for ffp-similar: print k nearest neighbours per query word.

    Reads one query word per line from the optional input file (stdin when
    unspecified) and writes "word similarity" pairs to stdout. Queries whose
    neighbours cannot be computed are reported on stderr.
    """
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(prog="ffp-similar",
                                     description="Similarity queries.")
    parser.add_argument("embeddings",
                        type=str,
                        help="Input embeddings",
                        metavar="EMBEDDINGS")
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument("-k",
                        type=int,
                        help="Number of neighbours. Default: 10",
                        default=10,
                        metavar="K")
    parser.add_argument(
        "input",
        help=
        "Optional input file with one word per line. If unspecified reads from stdin",
        nargs='?',
        default=0,  # file descriptor 0 == stdin
    )
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.embeddings, args.lossy, args.mmap)
    # Honour --lossy for the query input too (same handling as ffp-select);
    # errors='strict' is open()'s default behaviour, so this is a no-op
    # when --lossy is not given.
    with open(args.input,
              errors='replace' if args.lossy else 'strict') as queries:
        for query in queries:
            query = query.strip()
            if not query:
                # Skip blank lines (e.g. trailing newline).
                continue
            res = embeds.word_similarity(query, k=args.k)
            if res is None:
                print(f"Could not compute neighbours for: {query}",
                      file=sys.stderr)
            else:
                print("\n".join(f"{ws.word} {ws.similarity}" for ws in res))
def main() -> None:  # pylint: disable=missing-function-docstring
    """Entry point for ffp-convert: convert embeddings between formats."""
    supported = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(prog="ffp-convert",
                                     description="Convert embeddings.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "from", supported, "word2vec")
    add_format_args(parser, "t", "to", supported, "finalfusion")
    add_common_args(parser)
    args = parser.parse_args()
    # 'from' is a Python keyword, so the parsed attribute must be read
    # via getattr rather than args.from.
    source_format = Format(getattr(args, 'from'))
    embeddings = source_format.load(args.input, args.lossy, args.mmap)
    Format(args.to).write(args.output, embeddings)
def main() -> None:  # pylint: disable=missing-function-docstring
    """Entry point for ffp-bucket-to-explicit.

    Loads bucket-subword embeddings and writes them back out with an
    explicit subword lookup.
    """
    parser = argparse.ArgumentParser(
        prog="ffp-bucket-to-explicit",
        description="Convert bucket embeddings to explicit lookups.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "from", ["finalfusion", "fasttext"],
                    "finalfusion")
    add_common_args(parser)
    args = parser.parse_args()
    # 'from' clashes with the keyword, hence getattr.
    input_format = Format(getattr(args, 'from'))
    embeddings = input_format.load(args.input, args.lossy, args.mmap)
    embeddings.bucket_to_explicit().write(args.output)
def main() -> None:  # pylint: disable=missing-function-docstring
    """Entry point for ffp-analogy: answer a : b :: c : ? analogy queries.

    Reads three whitespace-separated words per line from the optional input
    file (stdin when unspecified) and prints "word similarity" pairs for
    each query. Query parts listed via -i/--include are permitted as answers.
    """
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(prog="ffp-analogy",
                                     description="Analogy queries.")
    parser.add_argument("embeddings",
                        help="Input embeddings",
                        type=str,
                        metavar="EMBEDDINGS")
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "-i",
        "--include",
        choices=["a", "b", "c"],
        nargs="+",
        default=[],
        help=
        "Specify query parts that should be allowed as answers. Valid choices: ['a', 'b', 'c']"
    )
    parser.add_argument("-k",
                        type=int,
                        default=10,
                        help="Number of neighbours. Default: 10",
                        metavar="K")
    parser.add_argument(
        "input",
        help=
        "Optional input file with 3 words per line. If unspecified reads from stdin",
        nargs='?',
        default=0)  # file descriptor 0 == stdin
    add_common_args(parser)
    args = parser.parse_args()
    # choices= already restricts values to a/b/c; only the count needs checking.
    if len(args.include) > 3:
        print("-i/--include can take up to 3 unique values: a, b and c.",
              file=sys.stderr)
        sys.exit(1)
    embeds = Format(args.format).load(args.embeddings, args.lossy, args.mmap)
    # Honour --lossy for the query input too, matching ffp-select.
    with open(args.input,
              errors='replace' if args.lossy else 'strict') as queries:
        for query in queries:
            tokens = query.split()
            if not tokens:
                # Skip blank lines (e.g. trailing newline) instead of
                # crashing on tuple unpacking.
                continue
            if len(tokens) != 3:
                print(f"Expected 3 words per query, got: {query.strip()}",
                      file=sys.stderr)
                sys.exit(1)
            query_a, query_b, query_c = tokens
            skips = get_skips(query_a, query_b, query_c, args.include)
            res = embeds.analogy(query_a,
                                 query_b,
                                 query_c,
                                 k=args.k,
                                 skip=skips)
            if res is None:
                print(
                    f"Could not compute for: {query_a} : {query_b}, {query_c} : ? ",
                    file=sys.stderr)
            else:
                print("\n".join(f"{ws.word} {ws.similarity}" for ws in res))
def main() -> None:  # pylint: disable=missing-function-docstring
    """Entry point for ffp-select: build embeddings restricted to a word list.

    Reads one word per line (stdin when no file is given), looks each word
    up in the source embeddings, and writes a new embedding file containing
    only those words. Unrepresentable words abort with exit status 1 unless
    --ignore_unk is given.
    """
    supported = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(
        prog="ffp-select",
        description="Build embeddings from list of words.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "format", supported, "finalfusion")
    parser.add_argument(
        "words",
        nargs='?',
        default=0,  # file descriptor 0 == stdin
        metavar="WORDS",
        help=
        "List of words to include in the embeddings. One word per line. Spaces permitted."
        "Reads from stdin if unspecified.")
    parser.add_argument("--ignore_unk",
                        "-i",
                        action="store_true",
                        default=False,
                        help="Skip unrepresentable words.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help=
        "Print which tokens are skipped because they can't be represented to stderr."
    )
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.input, args.lossy, args.mmap)
    error_mode = 'replace' if args.lossy else 'strict'
    with open(args.words, errors=error_mode) as word_file:
        unique_words = {line.strip() for line in word_file}
    # One output row per unique word; rows for skipped words stay zero.
    dims = embeds.storage.shape[1]
    matrix = np.zeros((len(unique_words), dims), dtype=np.float32)
    vocab = SimpleVocab(list(unique_words))
    for row, word in enumerate(vocab):
        try:
            matrix[row] = embeds[word]
        except KeyError:
            if args.verbose or not args.ignore_unk:
                print(f"Cannot represent '{word}'.", file=sys.stderr)
            if not args.ignore_unk:
                sys.exit(1)
    metadata = Metadata({"source_embeddings": args.input})
    if embeds.metadata is not None:
        metadata["source_metadata"] = embeds.metadata
    Embeddings(storage=NdArray(matrix), vocab=vocab,
               metadata=metadata).write(args.output)