Example #1
 def __init__(self,
              topics=None,
              docfreq=None,
              bow=None,
              verbosity=logging.DEBUG,
              prune_df_threshold=1,
              gcs_bucket=None,
              initialize_environment=True,
              repo2bow_kwargs=None):
     if initialize_environment:
         initialize()
     self._log = logging.getLogger("topic_detector")
     self._log.setLevel(verbosity)
     if gcs_bucket:
         backend = create_backend(args="bucket=" + gcs_bucket)
     else:
         backend = create_backend()
     if topics is None:
         self._topics = Topics(log_level=verbosity).load(backend=backend)
     else:
         assert isinstance(topics, Topics)
         self._topics = topics
     self._log.info("Loaded topics model: %s", self._topics)
     if docfreq is None:
         self._docfreq = DocumentFrequencies(log_level=verbosity).load(
             source=self._topics.dep("docfreq")["uuid"],
             backend=backend)
     elif docfreq is False:
         # docfreq=False explicitly disables document frequencies.
         self._docfreq = None
         self._log.warning("Disabled document frequencies - you will "
                           "not be able to query custom repositories.")
     else:
         assert isinstance(docfreq, DocumentFrequencies)
         self._docfreq = docfreq
     if self._docfreq is not None:
         self._docfreq = self._docfreq.prune(prune_df_threshold)
     self._log.info("Loaded docfreq model: %s", self._docfreq)
     if bow is not None:
         assert isinstance(bow, BOWBase)
         self._bow = bow
         if self._topics.matrix.shape[1] != self._bow.matrix.shape[1]:
             raise ValueError(
                 "Models do not match: topics has %s tokens while bow has %s" %
                 (self._topics.matrix.shape[1], self._bow.matrix.shape[1]))
         self._log.info("Attached BOW model: %s", self._bow)
     else:
         self._bow = None
         self._log.warning("No BOW cache was loaded.")
     if self._docfreq is not None:
         self._repo2bow = Repo2BOW(
             {t: i for i, t in enumerate(self._topics.tokens)},
             self._docfreq, **(repo2bow_kwargs or {}))
     else:
         self._repo2bow = None
Example #2
 def __init__(self,
              id2vec=None,
              df=None,
              nbow=None,
              verbosity=logging.DEBUG,
              wmd_cache_centroids=True,
              wmd_kwargs=None,
              gcs_bucket=None,
              repo2nbow_kwargs=None,
              initialize_environment=True):
     if initialize_environment:
         initialize()
     self._log = logging.getLogger("similar_repos")
     self._log.setLevel(verbosity)
     if gcs_bucket:
         backend = create_backend(args="bucket=" + gcs_bucket)
     else:
         backend = create_backend()
     if id2vec is None:
         self._id2vec = Id2Vec(log_level=verbosity, backend=backend)
     else:
         assert isinstance(id2vec, Id2Vec)
         self._id2vec = id2vec
     self._log.info("Loaded id2vec model: %s", self._id2vec)
     if df is None:
         self._df = DocumentFrequencies(log_level=verbosity,
                                        backend=backend)
     elif df is False:
         # df=False explicitly disables document frequencies.
         self._df = None
         self._log.warning("Disabled document frequencies - you will "
                           "not be able to query custom repositories.")
     else:
         assert isinstance(df, DocumentFrequencies)
         self._df = df
     self._log.info("Loaded document frequencies: %s", self._df)
     if nbow is None:
         self._nbow = NBOW(log_level=verbosity, backend=backend)
     else:
         assert isinstance(nbow, NBOW)
         self._nbow = nbow
     self._log.info("Loaded nBOW model: %s", self._nbow)
     self._repo2nbow = Repo2nBOW(self._id2vec,
                                 self._df,
                                 log_level=verbosity,
                                 **(repo2nbow_kwargs or {}))
     self._log.info("Creating the WMD engine...")
     self._wmd = WMD(self._id2vec.embeddings,
                     self._nbow,
                     verbosity=verbosity,
                     **(wmd_kwargs or {}))
     if wmd_cache_centroids:
         self._wmd.cache_centroids()
Example #3
 def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
              verbosity=logging.DEBUG, wmd_cache_centroids=True, wmd_kwargs=None,
              gcs_bucket=None, repo2nbow_kwargs=None, initialize_environment=True):
     if initialize_environment:
         initialize()
     self._log = logging.getLogger("similar_repos")
     self._log.setLevel(verbosity)
     if gcs_bucket:
         backend = create_backend(args="bucket=" + gcs_bucket)
     else:
         backend = create_backend()
     if id2vec is None:
         self._id2vec = Id2Vec(log_level=verbosity).load(backend=backend)
     else:
         assert isinstance(id2vec, Id2Vec)
         self._id2vec = id2vec
     self._log.info("Loaded id2vec model: %s", self._id2vec)
     if df is None:
         self._df = DocumentFrequencies(log_level=verbosity).load(backend=backend)
     elif df is False:
         # df=False explicitly disables document frequencies.
         self._df = None
         self._log.warning("Disabled document frequencies - you will "
                           "not be able to query custom repositories.")
     else:
         assert isinstance(df, DocumentFrequencies)
         self._df = df
     if self._df is not None:
         self._df = self._df.prune(prune_df_threshold)
     self._log.info("Loaded document frequencies: %s", self._df)
     if nbow is None:
         self._nbow = NBOW(log_level=verbosity).load(backend=backend)
     else:
         assert isinstance(nbow, NBOW)
         self._nbow = nbow
     self._log.info("Loaded nBOW model: %s", self._nbow)
     self._repo2nbow = Repo2nBOW(
         self._id2vec, self._df, log_level=verbosity, **(repo2nbow_kwargs or {}))
     assert self._nbow.dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
     if len(self._id2vec) != self._nbow.matrix.shape[1]:
         raise ValueError("Models do not match: id2vec has %s tokens while nbow has %s" %
                          (len(self._id2vec), self._nbow.matrix.shape[1]))
     self._log.info("Creating the WMD engine...")
     self._wmd = WMD(self._id2vec.embeddings, self._nbow,
                     verbosity=verbosity, **(wmd_kwargs or {}))
     if wmd_cache_centroids:
         self._wmd.cache_centroids()
Example #4
    def test_create_backend_invalid_args(self):
        backup = back.config.BACKEND_ARGS
        back.config.BACKEND_ARGS = "lalala"
        with self.assertRaises(ValueError):
            back.create_backend("Bar")
        back.config.BACKEND_ARGS = backup
        backup = back.config.BACKEND_ARGS
        back.config.BACKEND_ARGS = ""

        class Bar(back.StorageBackend):
            NAME = "Bar"

        back.register_backend(Bar)
        git_index = ind.GitIndex(index_repo=self.default_url,
                                 cache=self.cached_path)
        try:
            self.assertIsInstance(back.create_backend("Bar", git_index), Bar)
        finally:
            back.config.BACKEND_ARGS = backup
Example #5
 def __init__(self, id2vec=None, docfreq=None, gcs_bucket=None, **kwargs):
     if gcs_bucket:
         backend = create_backend("gcs", "bucket=" + gcs_bucket)
     else:
         backend = None
     self._id2vec = kwargs["id2vec"] = Id2Vec().load(id2vec or None,
                                                     backend=backend)
     self._df = kwargs["docfreq"] = DocumentFrequencies().load(
         docfreq or None, backend=backend)
     super(Repo2nBOWTransformer, self).__init__(**kwargs)
Example #6
 def __init__(self, id2vec=None, docfreq=None, gcs_bucket=None, **kwargs):
     if gcs_bucket:
         backend = create_backend("gcs", "bucket=" + gcs_bucket)
     else:
         backend = None
     self._id2vec = kwargs["id2vec"] = Id2Vec().load(id2vec or None, backend=backend)
     self._df = kwargs["docfreq"] = DocumentFrequencies().load(docfreq or None, backend=backend)
     prune_df = kwargs.pop("prune_df", 1)
     if prune_df > 1:
         self._df = self._df.prune(prune_df)
     super().__init__(**kwargs)
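Both transformer constructors above resolve their models up front and then delegate to the parent class. A minimal construction sketch, assuming the Repo2nBOWTransformer from Examples #5 and #6, a placeholder bucket name, and that the parent accepts the remaining kwargs:

transformer = Repo2nBOWTransformer(
    id2vec=None,             # fall back to the default id2vec model
    docfreq=None,            # fall back to the default document frequencies
    gcs_bucket="my-models",  # placeholder bucket name
    prune_df=20)             # Example #6 pops this and prunes when it exceeds 1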
Example #7
    def test_create_backend_invalid_args(self):
        backup = backends.config.BACKEND_ARGS
        backends.config.BACKEND_ARGS = "lalala"

        try:
            with self.assertRaises(ValueError):
                backends.create_backend("Bar")
        finally:
            backends.config.BACKEND_ARGS = backup

        backup = backends.config.BACKEND_ARGS
        backends.config.BACKEND_ARGS = ""

        class Bar(backends.StorageBackend):
            NAME = "Bar"

        backends.register_backend(Bar)

        try:
            self.assertIsInstance(backends.create_backend("Bar"), Bar)
        finally:
            backends.config.BACKEND_ARGS = backup
Example #8
 def __init__(self,
              id2vec=None,
              df=None,
              nbow=None,
              prune_df_threshold=1,
              wmd_cache_centroids=True,
              wmd_kwargs: Dict[str, Any] = None,
              languages: Tuple[List, bool] = (None, False),
              engine_kwargs: Dict[str, Any] = None):
     backend = create_backend()
     if id2vec is None:
         self._id2vec = Id2Vec().load(backend=backend)
     else:
         assert isinstance(id2vec, Id2Vec)
         self._id2vec = id2vec
     self._log.info("Loaded id2vec model: %s", self._id2vec)
     if df is None:
         self._df = DocumentFrequencies().load(backend=backend)
     elif df is False:
         # df=False explicitly disables document frequencies.
         self._df = None
         self._log.warning("Disabled document frequencies - you will "
                           "not be able to query custom repositories.")
     else:
         assert isinstance(df, DocumentFrequencies)
         self._df = df
     if self._df is not None:
         self._df = self._df.prune(prune_df_threshold)
     self._log.info("Loaded document frequencies: %s", self._df)
     if nbow is None:
         self._bow = BOW().load(backend=backend)
     else:
         assert isinstance(nbow, BOW)
         self._bow = nbow
     self._log.info("Loaded BOW model: %s", self._bow)
     assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
     if len(self._id2vec) != self._bow.matrix.shape[1]:
         raise ValueError(
             "Models do not match: id2vec has %s tokens while nbow has %s" %
             (len(self._id2vec), self._bow.matrix.shape[1]))
     self._log.info("Creating the WMD engine...")
     self._wmd = WMD(self._id2vec.embeddings, self._bow,
                     **(wmd_kwargs or {}))
     if wmd_cache_centroids:
         self._wmd.cache_centroids()
     self._languages = languages
     self._engine_kwargs = engine_kwargs
Example #9
    def test_auto(self):
        class FakeModel(GenericModel):
            NAME = "docfreq"

        def route(url):
            if GCSBackend.INDEX_FILE in url:
                return '{"models": {"docfreq": {' \
                       '"f64bacd4-67fb-4c64-8382-399a8e7db52a": ' \
                       '{"url": "https://xxx"}, ' \
                       '"default": "f64bacd4-67fb-4c64-8382-399a8e7db52a"' \
                       '}}}'.encode()
            self.assertEqual("https://xxx", url)
            with open(get_path(self.DOCFREQ_PATH), "rb") as fin:
                return fin.read()

        modelforge.gcs_backend.requests = FakeRequests(route)
        model = FakeModel(backend=create_backend())
        self._validate_meta(model)
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Repository URL or path or name.")
    parser.add_argument("--log-level",
                        default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")
    parser.add_argument("--topics",
                        default=None,
                        help="Topic model URL or path.")
    parser.add_argument("--df",
                        default=None,
                        help="Document frequencies URL or path.")
    parser.add_argument("--bow", default=None, help="BOW model URL or path.")
    parser.add_argument("--bblfsh",
                        default=None,
                        help="babelfish server address.")
    parser.add_argument(
        "--timeout",
        type=int,
        default=None,
        help="Babelfish timeout - longer requests are dropped. Default is %s."
        % DEFAULT_BBLFSH_TIMEOUT)
    parser.add_argument("--gcs", default=None, help="GCS bucket to use.")
    parser.add_argument("--linguist",
                        default=None,
                        help="Path to src-d/enry or github/linguist.")
    parser.add_argument(
        "--prune-df",
        default=20,
        type=int,
        help="Minimum number of times an identifer must occur in different "
        "documents to be taken into account.")
    parser.add_argument("-n",
                        "--nnn",
                        default=10,
                        type=int,
                        help="Number of topics to print.")
    parser.add_argument("-f",
                        "--format",
                        default="human",
                        choices=["json", "human"],
                        help="Output format.")

    args = parser.parse_args()
    if args.linguist is None:
        args.linguist = "./enry"
    initialize(args.log_level, enry=args.linguist)
    if args.gcs:
        backend = create_backend(args="bucket=" + args.gcs)
    else:
        backend = create_backend()
    if args.topics is not None:
        args.topics = Topics(log_level=args.log_level).load(source=args.topics,
                                                            backend=backend)
    if args.df is not None:
        args.df = DocumentFrequencies(log_level=args.log_level).load(
            source=args.df, backend=backend)
    if args.bow is not None:
        args.bow = BOWBase(log_level=args.log_level).load(source=args.bow,
                                                          backend=backend)
    sr = TopicDetector(topics=args.topics,
                       docfreq=args.df,
                       bow=args.bow,
                       verbosity=args.log_level,
                       prune_df_threshold=args.prune_df,
                       gcs_bucket=args.gcs,
                       repo2bow_kwargs={
                           "linguist": args.linguist,
                           "bblfsh_endpoint": args.bblfsh,
                           "timeout": args.timeout
                       })
    topics = sr.query(args.input, size=args.nnn)
    if args.format == "json":
        json.dump({"repository": args.input, "topics": topics}, sys.stdout)
    elif args.format == "human":
        for t, r in topics:
            print("%64s" % t, "%.2f" % r, sep="\t")
Example #11
 def setUp(self):
     self.backend = create_backend()
Example #12
 def setUp(self):
     ind.git = fake_git
     ind.Repo = fake_git.FakeRepo
     fake_git.FakeRepo.reset(self.default_index)
     self.backend = create_backend(git_index=ind.GitIndex(
         remote=self.default_url, cache=self.cached_path))
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Repository URL or path or name.")
    parser.add_argument("--log-level", default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")
    parser.add_argument("--id2vec", default=None,
                        help="id2vec model URL or path.")
    parser.add_argument("--df", default=None,
                        help="Document frequencies URL or path.")
    parser.add_argument("--bow", default=None,
                        help="BOW model URL or path.")
    parser.add_argument("--prune-df", default=20, type=int,
                        help="Minimum number of times an identifier must occur in the dataset "
                             "to be taken into account.")
    parser.add_argument("--vocabulary-min", default=50, type=int,
                        help="Minimum number of words in a bag.")
    parser.add_argument("--vocabulary-max", default=500, type=int,
                        help="Maximum number of words in a bag.")
    parser.add_argument("-n", "--nnn", default=10, type=int,
                        help="Number of nearest neighbours.")
    parser.add_argument("--early-stop", default=0.1, type=float,
                        help="Maximum fraction of the nBOW dataset to scan.")
    parser.add_argument("--max-time", default=300, type=int,
                        help="Maximum time to spend scanning in seconds.")
    parser.add_argument("--skipped-stop", default=0.95, type=float,
                        help="Minimum fraction of skipped samples to stop.")
    languages = ["Java", "Python", "Go", "JavaScript", "TypeScript", "Ruby", "Bash", "Php"]
    parser.add_argument(
        "-l", "--languages", nargs="+", choices=languages,
        default=None,  # The default for --languages must stay None: otherwise,
        # Parquet files without a "lang" column would fail to process under any
        # --languages filter.
        help="The programming languages to analyse.")
    parser.add_argument("--blacklist-languages", action="store_true",
                        help="Exclude the languages in --languages from the analysis "
                             "instead of filtering by default.")
    parser.add_argument(
        "-s", "--spark", default=SparkDefault.MASTER_ADDRESS,
        help="Spark's master address.")
    parser.add_argument("--bblfsh", default=EngineDefault.BBLFSH,
                        help="Babelfish server's address.")
    parser.add_argument("--engine", default=EngineDefault.VERSION,
                        help="source{d} jgit-spark-connector version.")
    args = parser.parse_args()
    setup_logging(args.log_level)
    backend = create_backend()
    if args.id2vec is not None:
        args.id2vec = Id2Vec().load(source=args.id2vec, backend=backend)
    if args.df is not None:
        args.df = DocumentFrequencies().load(source=args.df, backend=backend)
    if args.bow is not None:
        args.bow = BOW().load(source=args.bow, backend=backend)
    sr = SimilarRepositories(
        id2vec=args.id2vec, df=args.df, nbow=args.bow,
        prune_df_threshold=args.prune_df,
        wmd_cache_centroids=False,  # useless for a single query
        wmd_kwargs={"vocabulary_min": args.vocabulary_min,
                    "vocabulary_max": args.vocabulary_max},
        languages=(args.languages, args.blacklist_languages),
        engine_kwargs={"spark": args.spark,
                       "bblfsh": args.bblfsh,
                       "engine": args.engine},
    )
    neighbours = sr.query(
        args.input, k=args.nnn, early_stop=args.early_stop,
        max_time=args.max_time, skipped_stop=args.skipped_stop)
    for index, rate in neighbours:
        print("%48s\t%.2f" % (index, rate))
Example #14
    def load(self,
             source: Union[str, BinaryIO, "Model"] = None,
             cache_dir: str = None,
             backend: StorageBackend = None,
             lazy=False) -> "Model":
        """
        Build a new Model instance.

        :param source: UUID, file system path, file object or a URL; None means auto.
        :param cache_dir: The directory where to store the downloaded model.
        :param backend: Remote storage backend to use if ``source`` is a UUID or a URL.
        :param lazy: Do not actually load numpy arrays into memory; mmap() them instead. \
                     The user is expected to call Model.close() when the tree is no longer needed.
        """
        if isinstance(source, Model):
            if not isinstance(source, type(self)):
                raise TypeError("Incompatible model instance: %s <> %s" %
                                (type(source), type(self)))
            self.__dict__ = source.__dict__
            return self

        if backend is not None and not isinstance(backend, StorageBackend):
            raise TypeError("backend must be an instance of "
                            "modelforge.storage_backend.StorageBackend")
        self._source = str(source)
        generic = self.NAME == self.GENERIC_NAME
        try:
            if source is None or (isinstance(source, str)
                                  and not os.path.isfile(source)):
                if cache_dir is None:
                    if not generic:
                        cache_dir = os.path.join(vendor_cache_dir(), self.NAME)
                    else:
                        cache_dir = tempfile.mkdtemp(prefix="modelforge-")
                try:
                    uuid.UUID(source)
                    is_uuid = True
                except (TypeError, ValueError):
                    is_uuid = False
                model_id = self.DEFAULT_NAME if not is_uuid else source
                file_name = model_id + self.DEFAULT_FILE_EXT
                file_name = os.path.join(cache_dir, file_name)
                if os.path.exists(file_name) and (not source or
                                                  not os.path.exists(source)):
                    source = file_name
                elif source is None or is_uuid:
                    if backend is None:
                        try:
                            backend = create_backend()
                        except ValueError as e:
                            raise ValueError(
                                "A backend must be set to load a UUID or the default model. The "
                                "attempt to create a backend with default parameters failed."
                            ) from e
                    index = backend.index.contents
                    config = index["models"]
                    if not generic:
                        if not is_uuid:
                            model_id = index["meta"][self.NAME][model_id]
                        source = config[self.NAME][model_id]
                    else:
                        if not is_uuid:
                            raise ValueError(
                                "File path, URL or UUID is needed.")
                        for models in config.values():
                            if source in models:
                                source = models[source]
                                break
                        else:
                            raise FileNotFoundError("Model %s not found." %
                                                    source)
                    source = source["url"]
                if re.match(r"\w+://", source):
                    download_http(source, file_name, self._log)
                    self._source = source
                    source = file_name
            if isinstance(source, str):
                size = os.stat(source).st_size
            else:
                self._source = "<file object>"
                pos = source.tell()
                size = source.seek(0, os.SEEK_END) - pos
                source.seek(pos, os.SEEK_SET)
            self._log.info("Reading %s (%s)...", source,
                           humanize.naturalsize(size))
            model = asdf.open(source, copy_arrays=not lazy, lazy_load=lazy)
            try:
                tree = model.tree
                self._meta = tree["meta"]
                self._initial_version = list(self.version)
                if not generic:
                    meta_name = self._meta["model"]
                    matched = self.NAME == meta_name
                    if not matched:
                        needed = {self.NAME}
                        for child in type(self).__subclasses__():
                            needed.add(child.NAME)
                            matched |= child.NAME == meta_name
                        if not matched:
                            raise ValueError(
                                "The supplied model is of the wrong type: needed "
                                "%s, got %s." % (needed, meta_name))
                self._load_tree(tree)
            finally:
                if not lazy:
                    model.close()
                else:
                    self._asdf = model
        finally:
            if generic and cache_dir is not None:
                shutil.rmtree(cache_dir)
        self._size = size
        return self
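The load() above resolves its source in stages: an existing Model instance is copied, a local file path is read directly, and a UUID or None goes through the backend's index, with create_backend() called as a fallback when no backend was supplied. A minimal usage sketch, assuming a concrete DocumentFrequencies model class; the file path is a placeholder and the UUID is borrowed from Example #9:

from modelforge.backends import create_backend

backend = create_backend()

# A local file path is read directly; no backend or index lookup happens.
df = DocumentFrequencies().load(source="/path/to/docfreq.asdf")

# A UUID is resolved through the backend's index, downloaded and cached.
df = DocumentFrequencies().load(
    source="f64bacd4-67fb-4c64-8382-399a8e7db52a", backend=backend)

# None selects the default model for this NAME via the index's "meta" mapping.
df = DocumentFrequencies().load(backend=backend)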
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Repository URL or path or name.")
    parser.add_argument("--log-level",
                        default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")
    parser.add_argument("--id2vec",
                        default=None,
                        help="id2vec model URL or path.")
    parser.add_argument("--df",
                        default=None,
                        help="Document frequencies URL or path.")
    parser.add_argument("--nbow", default=None, help="nBOW model URL or path.")
    parser.add_argument("--no-cache-centroids",
                        action="store_true",
                        help="Do not cache WMD centroids.")
    parser.add_argument("--bblfsh",
                        default=None,
                        help="babelfish server address.")
    parser.add_argument(
        "--timeout",
        type=int,
        default=Repo2Base.DEFAULT_BBLFSH_TIMEOUT,
        help="Babelfish timeout - longer requests are dropped.")
    parser.add_argument("--gcs", default=None, help="GCS bucket to use.")
    parser.add_argument("--linguist",
                        default=None,
                        help="Path to github/linguist or src-d/enry.")
    parser.add_argument("--vocabulary-min",
                        default=50,
                        type=int,
                        help="Minimum number of words in a bag.")
    parser.add_argument("--vocabulary-max",
                        default=500,
                        type=int,
                        help="Maximum number of words in a bag.")
    parser.add_argument("-n",
                        "--nnn",
                        default=10,
                        type=int,
                        help="Number of nearest neighbours.")
    parser.add_argument("--early-stop",
                        default=0.1,
                        type=float,
                        help="Maximum fraction of the nBOW dataset to scan.")
    parser.add_argument("--max-time",
                        default=300,
                        type=int,
                        help="Maximum time to spend scanning in seconds.")
    parser.add_argument("--skipped-stop",
                        default=0.95,
                        type=float,
                        help="Minimum fraction of skipped samples to stop.")
    args = parser.parse_args()
    if args.linguist is None:
        args.linguist = "./enry"
    initialize(args.log_level, enry=args.linguist)
    if args.gcs:
        backend = create_backend(args="bucket=" + args.gcs)
    else:
        backend = create_backend()
    if args.id2vec is not None:
        args.id2vec = Id2Vec(source=args.id2vec, backend=backend)
    if args.df is not None:
        args.df = DocumentFrequencies(source=args.df, backend=backend)
    if args.nbow is not None:
        args.nbow = NBOW(source=args.nbow, backend=backend)
    sr = SimilarRepositories(id2vec=args.id2vec,
                             df=args.df,
                             nbow=args.nbow,
                             verbosity=args.log_level,
                             wmd_cache_centroids=not args.no_cache_centroids,
                             gcs_bucket=args.gcs,
                             repo2nbow_kwargs={
                                 "linguist": args.linguist,
                                 "bblfsh_endpoint": args.bblfsh,
                                 "timeout": args.timeout
                             },
                             wmd_kwargs={
                                 "vocabulary_min": args.vocabulary_min,
                                 "vocabulary_max": args.vocabulary_max
                             })
    neighbours = sr.query(args.input,
                          k=args.nnn,
                          early_stop=args.early_stop,
                          max_time=args.max_time,
                          skipped_stop=args.skipped_stop)
    for index, rate in neighbours:
        print("%48s\t%.2f" % (index, rate))