Example #1
    def test_parquet(self):
        languages1 = ["Python", "Java"]
        languages2 = ["Java"]

        engine = create_engine("test", SIVA_DIR)
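        # Whitelist mode: only files written in languages1 (Python and Java) should remain.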
        res = Ignition(engine) \
            .link(HeadFiles()) \
            .link(LanguageExtractor()) \
            .link(LanguageSelector(languages1)) \
            .link(Collector()) \
            .execute()
        self.assertEqual({x.lang for x in res}, set(languages1))

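        # Whitelist with a single language.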
        res = Ignition(engine) \
            .link(HeadFiles()) \
            .link(LanguageExtractor()) \
            .link(LanguageSelector(languages2)) \
            .link(Collector()) \
            .execute()
        self.assertEqual({x.lang for x in res}, set(languages2))

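        # Blacklist mode: the listed languages must be absent from the result.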
        res = Ignition(engine) \
            .link(HeadFiles()) \
            .link(LanguageExtractor()) \
            .link(LanguageSelector(languages2, blacklist=True)) \
            .link(Collector()) \
            .execute()
        self.assertEqual(set(), {x.lang for x in res} & set(languages2))

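        # An empty whitelist filters everything out.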
        res = Ignition(engine) \
            .link(HeadFiles()) \
            .link(LanguageExtractor()) \
            .link(LanguageSelector([])) \
            .link(Collector()) \
            .execute()
        self.assertEqual(set(), {x.lang for x in res})

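        # Rows loaded from Parquet have no "lang" column, so applying the selector should fail.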
        parquet_loader = create_parquet_loader("test_parquet",
                                               repositories=PARQUET_DIR)
        df = parquet_loader.execute()
        with self.assertRaises(AttributeError):
            LanguageSelector(languages1)(df)

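        # With every row labelled "BestLang", selecting other languages yields nothing,
        # while selecting "BestLang" keeps all rows.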
        df_with_lang = df.withColumn("lang", lit("BestLang"))
        self.assertEqual(
            0, len(LanguageSelector(languages1)(df_with_lang).collect()))

        self.assertEqual(
            df_with_lang.collect(),
            LanguageSelector(["BestLang"])(df_with_lang).collect())
Example #2
def repo2bow(repository: str,
             repository_format: str,
             docfreq_threshold: int,
             docfreq: DocumentFrequencies,
             languages: List[str] = None,
             blacklist_languages: bool = False,
             engine_kwargs: Dict[str, Any] = None) -> Dict[str, float]:
    log = logging.getLogger("repo2bow")
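    # Prefix docfreq keys with "i." to match the namespaced identifier tokens produced by the pipeline.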
    token_index = {"i." + key: int(val) for (key, val) in docfreq}
    session_name = "repo2bow-%s" % uuid4()
    engine_args = {
        "repositories": repository,
        "repository_format": repository_format,
    }
    if engine_kwargs is not None:
        engine_args.update(engine_kwargs)
    engine = create_engine(session_name, **engine_args)
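    # Start from HEAD files of local repositories only (file:// URLs).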
    root = Ignition(engine) >> RepositoriesFilter(r"^file://.*") >> HeadFiles()
    if languages is not None:
        file_source = root >> \
                      LanguageExtractor() >> \
                      LanguageSelector(languages=languages, blacklist=blacklist_languages)
    else:
        file_source = root
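    # Identifier bags -> term frequencies -> TF-IDF weights, collected on the driver.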
    bag = (file_source >> UastExtractor() >> Moder("repo") >>
           UastDeserializer() >> UastRow2Document() >> Uast2BagFeatures(
               IdentifiersBagExtractor(docfreq_threshold)) >>
           BagFeatures2TermFreq() >> TFIDF(
               token_index, docfreq.docs,
               engine.session.sparkContext) >> Collector()).execute()
    log.info("extracted %d identifiers", len(bag))
    return {r.token[2:]: r.value for r in bag}
Example #3
def code2vec(args):
    log = logging.getLogger("code2vec")
    session_name = "code2vec-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

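    # Collect bag-of-paths features (code2vec-style path contexts) from the deserialized UASTs.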
    res = start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures([UastPathsBagExtractor(args.max_length, args.max_width)])) \
        .link(Collector()) \
        .execute()

    # TODO: Add rest of data pipeline: extract distinct paths and terminal nodes for embedding mapping
    # TODO: Add transformer to write bags and vocabs to a model
    # TODO: Add ML pipeline

    pipeline_graph(args, log, root)
Example #4
def repos2bow_template(args, cache_hook: Transformer = None,
                       save_hook: Transformer = None):
    log = logging.getLogger("repos2bow")
    extractors = create_extractors_from_args(args)
    session_name = "repos2bow-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    log.info("Loading the document index from %s ...", args.cached_index_path)
    docfreq = DocumentFrequencies().load(source=args.cached_index_path)
    document_index = {key: int(val) for (key, val) in docfreq}

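    # Quantization levels and the ordered document-frequency model are prerequisites; abort if unavailable.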
    try:
        if args.quant is not None:
            create_or_apply_quant(args.quant, extractors, None)
        df_model = create_or_load_ordered_df(args, None, None)
    except ValueError:
        return 1
    ec = EngineConstants.Columns

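    # The document key is the whole repository in "repo" mode, otherwise repository + path + blob.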
    if args.mode == Moder.Options.repo:
        def keymap(r):
            return r[ec.RepositoryId]
    else:
        def keymap(r):
            return r[ec.RepositoryId] + UastRow2Document.REPO_PATH_SEP + \
                r[ec.Path] + UastRow2Document.PATH_BLOB_SEP + r[ec.BlobId]

    log.info("Caching UASTs to disk after partitioning by document ...")
    start_point = start_point.link(Moder(args.mode)) \
        .link(Repartitioner.maybe(args.num_iterations, keymap=keymap)) \
        .link(Cacher.maybe("DISK_ONLY"))
    for num_part in range(args.num_iterations):
        log.info("Running job %s of %s", num_part + 1, args.num_iterations)
        selected_part = start_point \
            .link(PartitionSelector(num_part)) \
            .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
            .link(Cacher.maybe(args.persist))
        if cache_hook is not None:
            selected_part.link(cache_hook()).execute()
        uast_extractor = selected_part \
            .link(UastRow2Document()) \
            .link(Cacher.maybe(args.persist))
        log.info("Collecting distinct documents ...")
        documents = uast_extractor \
            .link(FieldsSelector([Uast2BagFeatures.Columns.document])) \
            .link(Distinct()) \
            .link(Collector()) \
            .execute()
        selected_part.unpersist()
        documents = {row.document for row in documents}
        reduced_doc_index = {
            key: document_index[key] for key in document_index if key in documents}
        document_indexer = Indexer(Uast2BagFeatures.Columns.document, reduced_doc_index)
        log.info("Processing %s distinct documents", len(documents))
        bags = uast_extractor \
            .link(UastDeserializer()) \
            .link(Uast2BagFeatures(*extractors)) \
            .link(BagFeatures2TermFreq()) \
            .link(Cacher.maybe(args.persist))
        log.info("Extracting UASTs and collecting distinct tokens ...")
        tokens = bags \
            .link(FieldsSelector([Uast2BagFeatures.Columns.token])) \
            .link(Distinct()) \
            .link(Collector()) \
            .execute()
        uast_extractor.unpersist()
        tokens = {row.token for row in tokens}
        reduced_token_freq = {key: df_model[key] for key in df_model.df if key in tokens}
        reduced_token_index = {key: df_model.order[key] for key in df_model.df if key in tokens}
        log.info("Processing %s distinct tokens", len(reduced_token_freq))
        log.info("Indexing by document and token ...")
        bags_writer = bags \
            .link(TFIDF(reduced_token_freq, df_model.docs, root.session.sparkContext)) \
            .link(document_indexer) \
            .link(Indexer(Uast2BagFeatures.Columns.token, reduced_token_index))
        if save_hook is not None:
            bags_writer = bags_writer \
                .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
                .link(save_hook())
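        # One BOW model is written per iteration: "<bow base name>_<iteration>.asdf".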
        bow = args.bow.split(".asdf")[0] + "_" + str(num_part + 1) + ".asdf"
        bags_writer \
            .link(Repartitioner.maybe(
                args.partitions, keymap=lambda x: x[Uast2BagFeatures.Columns.document])) \
            .link(BOWWriter(document_indexer, df_model, bow, args.batch)) \
            .execute()
        bags.unpersist()
    pipeline_graph(args, log, root)
Example #5
    def test_collector(self):
        data = ParquetLoader(session=self.spark, paths=PARQUET_DIR).link(Collector()) \
            .execute()
        self.assertEqual(len(data), 6)
Example #6
    def test_repositories_filter(self):
        start_point = Ignition(self.engine)
        repos = start_point.link(RepositoriesFilter(".*antoniolg.*")).link(
            Collector()).execute()
        self.assertEqual(len(repos), 1)
        self.assertEqual(repos[0].id, "github.com/antoniolg/androidmvp.git")