class DocumentFrequenciesTests(unittest.TestCase):
    def setUp(self):
        self.model = DocumentFrequencies().load(source=paths.DOCFREQ)

    def test_docs(self):
        docs = self.model.docs
        self.assertIsInstance(docs, int)
        self.assertEqual(docs, 1000)

    def test_get(self):
        self.assertEqual(self.model["aaaaaaa"], 341)
        with self.assertRaises(KeyError):
            print(self.model["xaaaaaa"])
        self.assertEqual(self.model.get("aaaaaaa", 0), 341)
        self.assertEqual(self.model.get("xaaaaaa", 100500), 100500)

    def test_tokens(self):
        self.assertEqual(list(self.model._df), self.model.tokens())

    def test_len(self):
        # the remaining 18 are not unique - the model was generated badly
        self.assertEqual(len(self.model), 982)

    def test_iter(self):
        aaa = False
        for tok, freq in self.model:
            if "aaaaaaa" in tok:
                aaa = True
                int(freq)
                break
        self.assertTrue(aaa)

    def test_prune(self):
        pruned = self.model.prune(4)
        for tok, freq in pruned:
            self.assertGreaterEqual(freq, 4)
        self.assertEqual(len(pruned), 346)

    def test_prune_self(self):
        pruned = self.model.prune(1)
        self.assertIs(self.model, pruned)

    def test_greatest(self):
        pruned = self.model.greatest(100)
        freqs = sorted(self.model._df.values(), reverse=True)
        border = freqs[100]
        for v in pruned._df.values():
            self.assertGreaterEqual(v, border)
        df1 = pruned._df
        df2 = self.model.greatest(100)._df
        self.assertEqual(df1, df2)

    def test_write(self):
        buffer = BytesIO()
        self.model.save(buffer)
        buffer.seek(0)
        new_model = DocumentFrequencies().load(buffer)
        self.assertEqual(self.model._df, new_model._df)
        self.assertEqual(self.model.docs, new_model.docs)
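# A minimal round-trip sketch of the DocumentFrequencies API exercised by the
# tests above, assuming construct()/save()/load() behave as the snippets in
# this section show; the tokens and counts here are invented for illustration.
from io import BytesIO

from sourced.ml.models import DocumentFrequencies

model = DocumentFrequencies().construct(3, {"foo": 2, "bar": 1})
buffer = BytesIO()
model.save(buffer)  # also accepts a file path, e.g. "df.asdf"
buffer.seek(0)
restored = DocumentFrequencies().load(buffer)
assert restored.docs == 3
assert restored["foo"] == 2
assert restored.get("missing", 0) == 0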
def projector_entry(args):
    MAX_TOKENS = 10000  # hardcoded in Tensorflow Projector
    log = logging.getLogger("id2vec_projector")
    id2vec = Id2Vec(log_level=args.log_level).load(source=args.input)
    if args.docfreq:
        from sourced.ml.models import DocumentFrequencies
        df = DocumentFrequencies(log_level=args.log_level).load(source=args.docfreq)
    else:
        df = None
    if len(id2vec) < MAX_TOKENS:
        tokens = numpy.arange(len(id2vec), dtype=int)
        if df is not None:
            freqs = [df.get(id2vec.tokens[i], 0) for i in tokens]
        else:
            freqs = None
    else:
        if df is not None:
            log.info("Filtering tokens through docfreq")
            items = []
            for token, idx in id2vec.items():
                try:
                    items.append((df[token], idx))
                except KeyError:
                    continue
            log.info("Sorting")
            items.sort(reverse=True)
            tokens = [i[1] for i in items[:MAX_TOKENS]]
            freqs = [i[0] for i in items[:MAX_TOKENS]]
        else:
            log.warning("You have not specified --df => picking random %d tokens",
                        MAX_TOKENS)
            numpy.random.seed(777)
            tokens = numpy.random.choice(
                numpy.arange(len(id2vec), dtype=int), MAX_TOKENS, replace=False)
            freqs = None
    log.info("Gathering the embeddings")
    embeddings = numpy.vstack([id2vec.embeddings[i] for i in tokens])
    tokens = [id2vec.tokens[i] for i in tokens]
    labels = ["subtoken"]
    if freqs is not None:
        labels.append("docfreq")
        tokens = list(zip(tokens, (str(i) for i in freqs)))
    import sourced.ml.utils.projector as projector
    projector.present_embeddings(args.output, not args.no_browser, labels,
                                 tokens, embeddings)
    if not args.no_browser:
        projector.wait()
def test_finalize(self):
    self.merge_df.convert_model(self.model1)
    self.merge_df.convert_model(self.model2)
    with tempfile.TemporaryDirectory(prefix="merge-df-") as tmpdir:
        dest = os.path.join(tmpdir, "df.asdf")
        self.merge_df.finalize(0, dest)
        df = DocumentFrequencies().load(dest)
    self.assertEqual(df.docs, 6)
    self.assertEqual(df._df, self.merge_result)
def setUp(self):
    self.model1 = DocumentFrequencies().construct(3, {
        "one": 1, "two": 2, "three": 3})
    self.model2 = DocumentFrequencies().construct(3, {
        "four": 4, "three": 3, "five": 5})
    self.merge_df = MergeDocFreq(min_docfreq=1, vocabulary_size=100)
    self.merge_result = {
        "one": 1, "two": 2, "three": 6, "four": 4, "five": 5}
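# The fixture above pins down the merge semantics checked by test_finalize:
# document counts add up (3 + 3 = 6) and frequencies of tokens shared between
# the merged models are summed ("three": 3 + 3 = 6), while unique tokens pass
# through unchanged. A plain-dict sketch of that reduction; merge_dfs is a
# hypothetical helper for illustration, not part of MergeDocFreq:
from collections import Counter


def merge_dfs(*dfs):
    merged = Counter()
    for df in dfs:
        merged.update(df)  # Counter.update sums values for shared keys
    return dict(merged)


assert merge_dfs({"one": 1, "two": 2, "three": 3},
                 {"four": 4, "three": 3, "five": 5}) == \
    {"one": 1, "two": 2, "three": 6, "four": 4, "five": 5}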
def test_save_load(self):
    indexer = Indexer("to_index")
    res = indexer(self.data_rdd)
    with tempfile.NamedTemporaryFile(suffix="-index.asdf") as tmp:
        cached_index_path = tmp.name
        indexer.save_index(cached_index_path)
        docfreq = DocumentFrequencies().load(source=cached_index_path)
        document_index = {key: int(val) for (key, val) in docfreq}
        indexer = Indexer("to_index", column2id=document_index)
        self.assertEqual(res.collect(), indexer(self.data_rdd).collect())
def setUp(self):
    self.session = create_spark_for_test()
    df = DocumentFrequencies().construct(10, {str(i): i for i in range(1, 5)})
    self.docs = df.docs
    self.tfidf = TFIDF(df, df.docs, self.session.sparkContext)

    class Columns:
        """
        Stores column names for return value.
        """
        token = "t"
        document = "d"
        value = "v"

    self.tfidf.Columns = Columns
def setUp(self):
    self.sc = create_spark("test")
    df = DocumentFrequencies().construct(10, {str(i): i for i in range(1, 5)})
    self.tfidf = TFIDF(df=df)

    class Columns:
        """
        Stores column names for return value.
        """
        token = "t"
        document = "d"
        value = "v"

    self.tfidf.Columns = Columns
def repos2bow_template(args, cache_hook: Transformer = None,
                       save_hook: Transformer = None):
    log = logging.getLogger("repos2bow")
    extractors = create_extractors_from_args(args)
    session_name = "repos2bow-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    log.info("Loading the document index from %s ...", args.cached_index_path)
    docfreq = DocumentFrequencies().load(source=args.cached_index_path)
    document_index = {key: int(val) for (key, val) in docfreq}
    try:
        if args.quant is not None:
            create_or_apply_quant(args.quant, extractors, None)
        df_model = create_or_load_ordered_df(args, None, None)
    except ValueError:
        return 1
    ec = EngineConstants.Columns
    if args.mode == Moder.Options.repo:
        def keymap(r):
            return r[ec.RepositoryId]
    else:
        def keymap(r):
            return r[ec.RepositoryId] + UastRow2Document.REPO_PATH_SEP + \
                r[ec.Path] + UastRow2Document.PATH_BLOB_SEP + r[ec.BlobId]
    log.info("Caching UASTs to disk after partitioning by document ...")
    start_point = start_point.link(Moder(args.mode)) \
        .link(Repartitioner.maybe(args.num_iterations, keymap=keymap)) \
        .link(Cacher.maybe("DISK_ONLY"))
    for num_part in range(args.num_iterations):
        log.info("Running job %s of %s", num_part + 1, args.num_iterations)
        selected_part = start_point \
            .link(PartitionSelector(num_part)) \
            .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
            .link(Cacher.maybe(args.persist))
        if cache_hook is not None:
            selected_part.link(cache_hook()).execute()
        uast_extractor = selected_part \
            .link(UastRow2Document()) \
            .link(Cacher.maybe(args.persist))
        log.info("Collecting distinct documents ...")
        documents = uast_extractor \
            .link(FieldsSelector([Uast2BagFeatures.Columns.document])) \
            .link(Distinct()) \
            .link(Collector()) \
            .execute()
        selected_part.unpersist()
        documents = {row.document for row in documents}
        reduced_doc_index = {
            key: document_index[key] for key in document_index if key in documents}
        document_indexer = Indexer(Uast2BagFeatures.Columns.document, reduced_doc_index)
        log.info("Processing %s distinct documents", len(documents))
        bags = uast_extractor \
            .link(UastDeserializer()) \
            .link(Uast2BagFeatures(*extractors)) \
            .link(BagFeatures2TermFreq()) \
            .link(Cacher.maybe(args.persist))
        log.info("Extracting UASTs and collecting distinct tokens ...")
        tokens = bags \
            .link(FieldsSelector([Uast2BagFeatures.Columns.token])) \
            .link(Distinct()) \
            .link(Collector()) \
            .execute()
        uast_extractor.unpersist()
        tokens = {row.token for row in tokens}
        reduced_token_freq = {key: df_model[key] for key in df_model.df
                              if key in tokens}
        reduced_token_index = {key: df_model.order[key] for key in df_model.df
                               if key in tokens}
        log.info("Processing %s distinct tokens", len(reduced_token_freq))
        log.info("Indexing by document and token ...")
        bags_writer = bags \
            .link(TFIDF(reduced_token_freq, df_model.docs, root.session.sparkContext)) \
            .link(document_indexer) \
            .link(Indexer(Uast2BagFeatures.Columns.token, reduced_token_index))
        if save_hook is not None:
            bags_writer = bags_writer \
                .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
                .link(save_hook())
        bow = args.bow.split(".asdf")[0] + "_" + str(num_part + 1) + ".asdf"
        bags_writer \
            .link(Repartitioner.maybe(
                args.partitions,
                keymap=lambda x: x[Uast2BagFeatures.Columns.document])) \
            .link(BOWWriter(document_indexer, df_model, bow, args.batch)) \
            .execute()
        bags.unpersist()
    pipeline_graph(args, log, root)
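# A sketch of the two document-key shapes produced by keymap() above: in repo
# mode the key is the repository id alone, otherwise it is a composite
# repository/path/blob key. The row fields and separator values below are
# invented for illustration; the real constants live on UastRow2Document.
REPO_PATH_SEP = "//"  # assumed stand-in for UastRow2Document.REPO_PATH_SEP
PATH_BLOB_SEP = "@"   # assumed stand-in for UastRow2Document.PATH_BLOB_SEP

row = {"repository_id": "github.com/x/y", "path": "src/a.py", "blob_id": "f00"}
repo_key = row["repository_id"]
file_key = (row["repository_id"] + REPO_PATH_SEP +
            row["path"] + PATH_BLOB_SEP + row["blob_id"])
assert file_key == "github.com/x/y//src/a.py@f00"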
def save_index(self, path):
    self._log.info("Saving the index to %s", path)
    DocumentFrequencies() \
        .construct(len(self._value_to_index), self._value_to_index) \
        .save(output=path, series="docfreq")
def id2vec_preprocess(args):
    """
    Loads co-occurrence matrices for several repositories and generates the
    document frequencies and the Swivel protobuf dataset.

    :param args: :class:`argparse.Namespace` with "input", "vocabulary_size", \
                 "shard_size", "docfreq_in" and "output".
    :return: None
    """
    log = logging.getLogger("preproc")
    log.info("Loading docfreq model from %s", args.docfreq_in)
    df_model = DocumentFrequencies(log_level=args.log_level).load(
        source=args.docfreq_in)
    coocc_model = Cooccurrences().load(args.input)
    if numpy.any(coocc_model.matrix.data < 0):
        raise ValueError("Co-occurrence matrix %s contains negative elements. "
                         "Please check its correctness." % args.input)
    if numpy.any(numpy.isnan(coocc_model.matrix.data)):
        raise ValueError("Co-occurrence matrix %s contains nan elements. "
                         "Please check its correctness." % args.input)
    try:
        df_meta = coocc_model.get_dep(DocumentFrequencies.NAME)
        if df_model.meta != df_meta:
            raise ValueError(
                "Document frequency model you provided does not match the dependency "
                "inside the Cooccurrences model:\nargs.docfreq.meta:\n%s\n"
                "coocc_model.get_dep(\"docfreq\")\n%s\n" % (df_model.meta, df_meta))
    except KeyError:
        pass  # there is no docfreq dependency
    vs = args.vocabulary_size
    if len(df_model) < vs:
        vs = len(df_model)
    sz = args.shard_size
    if vs < sz:
        raise ValueError(
            "vocabulary_size=%s is less than shard_size=%s. You should specify a "
            "smaller shard_size (e.g. shard_size=%s)." % (vs, sz, vs))
    vs -= vs % sz
    log.info("Effective vocabulary size: %d", vs)
    df_model = df_model.greatest(vs)
    log.info("Sorting the vocabulary...")
    chosen_words = sorted(df_model.tokens())
    word_indices = {w: i for i, w in enumerate(chosen_words)}
    os.makedirs(args.output, exist_ok=True)
    with open(os.path.join(args.output, "row_vocab.txt"), "w") as out:
        out.write('\n'.join(chosen_words))
    log.info("Saved row_vocab.txt")
    shutil.copyfile(os.path.join(args.output, "row_vocab.txt"),
                    os.path.join(args.output, "col_vocab.txt"))
    log.info("Saved col_vocab.txt")
    del chosen_words
    ccmatrix = extract_coocc_matrix((vs, vs), word_indices, coocc_model)
    log.info("Planning the sharding...")
    bool_sums = ccmatrix.indptr[1:] - ccmatrix.indptr[:-1]
    reorder = numpy.argsort(-bool_sums)
    with open(os.path.join(args.output, "row_sums.txt"), "w") as out:
        out.write('\n'.join(map(str, bool_sums.tolist())))
    log.info("Saved row_sums.txt")
    shutil.copyfile(os.path.join(args.output, "row_sums.txt"),
                    os.path.join(args.output, "col_sums.txt"))
    log.info("Saved col_sums.txt")
    log.info("Writing the shards...")
    nshards = vs // args.shard_size
    for row in progress_bar(range(nshards), log, expected_size=nshards):
        for col in range(nshards):
            indices_row = reorder[row::nshards]
            indices_col = reorder[col::nshards]
            shard = ccmatrix[indices_row][:, indices_col].tocoo()
            example = tf.train.Example(features=tf.train.Features(feature={
                "global_row": _int64s(indices_row),
                "global_col": _int64s(indices_col),
                "sparse_local_row": _int64s(shard.row),
                "sparse_local_col": _int64s(shard.col),
                "sparse_value": _floats(shard.data)}))
            with open(os.path.join(args.output,
                                   "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                out.write(example.SerializeToString())
    log.info("Success")
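# The shards written above form a strided partition of the frequency-ordered
# token indices: shard (row, col) takes every nshards-th entry of reorder,
# starting at offset row for its rows and at offset col for its columns.
# A toy check with invented values, vs = 8 and nshards = 2:
import numpy

reorder = numpy.array([3, 1, 7, 0, 5, 2, 6, 4])  # argsort of the row weights
nshards = 2
assert list(reorder[0::nshards]) == [3, 7, 5, 6]  # row indices of shards (0, *)
assert list(reorder[1::nshards]) == [1, 0, 2, 4]  # row indices of shards (1, *)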
class SimilarRepositories:
    GITHUB_URL_RE = re.compile(
        r"(https://|ssh://git@|git://)(github.com/[^/]+/[^/]+)(|\.git|/)")
    _log = logging.getLogger("SimilarRepositories")

    def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
                 wmd_cache_centroids=True, wmd_kwargs: Dict[str, Any] = None,
                 languages: Tuple[List, bool] = (None, False),
                 engine_kwargs: Dict[str, Any] = None):
        backend = create_backend()
        if id2vec is None:
            self._id2vec = Id2Vec().load(backend=backend)
        else:
            assert isinstance(id2vec, Id2Vec)
            self._id2vec = id2vec
        self._log.info("Loaded id2vec model: %s", self._id2vec)
        if df is None:
            self._df = DocumentFrequencies().load(backend=backend)
        elif df is False:
            # df=False is the sentinel to explicitly disable document frequencies
            self._df = None
            self._log.warning("Disabled document frequencies - you will "
                              "not be able to query custom repositories.")
        else:
            assert isinstance(df, DocumentFrequencies)
            self._df = df
        if self._df is not None:
            self._df = self._df.prune(prune_df_threshold)
        self._log.info("Loaded document frequencies: %s", self._df)
        if nbow is None:
            self._bow = BOW().load(backend=backend)
        else:
            assert isinstance(nbow, BOW)
            self._bow = nbow
        self._log.info("Loaded BOW model: %s", self._bow)
        assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
        if len(self._id2vec) != self._bow.matrix.shape[1]:
            raise ValueError(
                "Models do not match: id2vec has %s tokens while nbow has %s" %
                (len(self._id2vec), self._bow.matrix.shape[1]))
        self._log.info("Creating the WMD engine...")
        self._wmd = WMD(self._id2vec.embeddings, self._bow, **(wmd_kwargs or {}))
        if wmd_cache_centroids:
            self._wmd.cache_centroids()
        self._languages = languages
        self._engine_kwargs = engine_kwargs

    def query(self, url_or_path_or_name: str, **kwargs) -> List[Tuple[str, float]]:
        try:
            repo_index = self._bow.documents.index(url_or_path_or_name)
        except ValueError:
            repo_index = -1
        if repo_index == -1:
            match = self.GITHUB_URL_RE.match(url_or_path_or_name)
            if match is not None:
                name = match.group(2)
                try:
                    repo_index = self._bow.documents.index(name)
                except ValueError:
                    pass
        if repo_index >= 0:
            neighbours = self._query_domestic(repo_index, **kwargs)
        else:
            neighbours = self._query_foreign(url_or_path_or_name, **kwargs)
        neighbours = [(self._bow[n[0]][0], n[1]) for n in neighbours]
        return neighbours

    def _query_domestic(self, repo_index, **kwargs):
        return self._wmd.nearest_neighbors(repo_index, **kwargs)

    def _query_foreign(self, url_or_path: str, **kwargs):
        df = self._df
        if df is None:
            raise ValueError("Cannot query custom repositories if the "
                             "document frequencies are disabled.")
        with tempfile.TemporaryDirectory(prefix="vecino-") as tempdir:
            target = os.path.join(tempdir, "repo")
            if os.path.isdir(url_or_path):
                url_or_path = os.path.abspath(url_or_path)
                os.symlink(url_or_path, target, target_is_directory=True)
                repo_format = "standard"
            else:
                self._log.info("Cloning %s to %s", url_or_path, target)
                porcelain.clone(url_or_path, target, bare=True,
                                outstream=sys.stderr)
                repo_format = "bare"
            bow = repo2bow(tempdir, repo_format, 1, df, *self._languages,
                           engine_kwargs=self._engine_kwargs)
        ibow = {}
        for key, val in bow.items():
            try:
                ibow[self._id2vec[key]] = val
            except KeyError:
                continue
        words, weights = zip(*sorted(ibow.items()))
        return self._wmd.nearest_neighbors((words, weights), **kwargs)
def test_greatest2(self):
    df = DocumentFrequencies().construct(100, {str(x): x for x in range(1000)})
    df_greatest_true = {str(x): x for x in range(500, 1000)}
    df_greatest = df.greatest(500)
    self.assertEqual(df_greatest._df, df_greatest_true)
    # ties at the boundary frequency resolve in favor of lexicographically
    # smaller tokens: "500" beats "500a", which beats "500b", and so on
    df._df["500a"] = 500
    df._df["500b"] = 500
    df._df["500c"] = 500
    df._df["500d"] = 500
    df._df["500e"] = 500
    df_greatest = df.greatest(500)
    self.assertEqual(df_greatest._df, df_greatest_true)
    df_greatest_true["500a"] = 500
    df_greatest = df.greatest(501)
    self.assertEqual(df_greatest._df, df_greatest_true)
    df_greatest_true["500b"] = 500
    df_greatest_true["500c"] = 500
    df_greatest_true["500d"] = 500
    df_greatest_true["500e"] = 500
    df_greatest = df.greatest(505)
    self.assertEqual(df_greatest._df, df_greatest_true)
    df_greatest_true["499"] = 499
    df_greatest = df.greatest(506)
    self.assertEqual(df_greatest._df, df_greatest_true)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Repository URL or path or name.")
    parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel,
                        help="Logging verbosity.")
    parser.add_argument("--id2vec", default=None, help="id2vec model URL or path.")
    parser.add_argument("--df", default=None,
                        help="Document frequencies URL or path.")
    parser.add_argument("--bow", default=None, help="BOW model URL or path.")
    parser.add_argument("--prune-df", default=20, type=int,
                        help="Minimum number of times an identifier must occur "
                             "in the dataset to be taken into account.")
    parser.add_argument("--vocabulary-min", default=50, type=int,
                        help="Minimum number of words in a bag.")
    parser.add_argument("--vocabulary-max", default=500, type=int,
                        help="Maximum number of words in a bag.")
    parser.add_argument("-n", "--nnn", default=10, type=int,
                        help="Number of nearest neighbours.")
    parser.add_argument("--early-stop", default=0.1, type=float,
                        help="Maximum fraction of the nBOW dataset to scan.")
    parser.add_argument("--max-time", default=300, type=int,
                        help="Maximum time to spend scanning in seconds.")
    parser.add_argument("--skipped-stop", default=0.95, type=float,
                        help="Minimum fraction of skipped samples to stop.")
    languages = ["Java", "Python", "Go", "JavaScript", "TypeScript", "Ruby",
                 "Bash", "Php"]
    parser.add_argument(
        "-l", "--languages", nargs="+", choices=languages, default=None,
        # The default value for --languages must stay None: otherwise, parquet
        # files without a "lang" column fail to process with any --languages
        # argument.
        help="The programming languages to analyse.")
    parser.add_argument("--blacklist-languages", action="store_true",
                        help="Exclude the languages in --languages from the "
                             "analysis instead of filtering by default.")
    parser.add_argument("-s", "--spark", default=SparkDefault.MASTER_ADDRESS,
                        help="Spark's master address.")
    parser.add_argument("--bblfsh", default=EngineDefault.BBLFSH,
                        help="Babelfish server's address.")
    parser.add_argument("--engine", default=EngineDefault.VERSION,
                        help="source{d} jgit-spark-connector version.")
    args = parser.parse_args()
    setup_logging(args.log_level)
    backend = create_backend()
    if args.id2vec is not None:
        args.id2vec = Id2Vec().load(source=args.id2vec, backend=backend)
    if args.df is not None:
        args.df = DocumentFrequencies().load(source=args.df, backend=backend)
    if args.bow is not None:
        args.bow = BOW().load(source=args.bow, backend=backend)
    sr = SimilarRepositories(
        id2vec=args.id2vec, df=args.df, nbow=args.bow,
        prune_df_threshold=args.prune_df,
        wmd_cache_centroids=False,  # useless for a single query
        wmd_kwargs={"vocabulary_min": args.vocabulary_min,
                    "vocabulary_max": args.vocabulary_max},
        languages=(args.languages, args.blacklist_languages),
        engine_kwargs={"spark": args.spark, "bblfsh": args.bblfsh,
                       "engine": args.engine},
    )
    neighbours = sr.query(
        args.input, k=args.nnn, early_stop=args.early_stop,
        max_time=args.max_time, skipped_stop=args.skipped_stop)
    for index, rate in neighbours:
        print("%48s\t%.2f" % (index, rate))