Example #1
    def test_uast_deserializer(self):
        df = HeadFiles()(self.engine.repositories)
        df_uast = UastExtractor()(df)
        r2d = UastRow2Document()
        row_uast = r2d.documentize(df_uast.first())
        uasts_empty = list(UastDeserializer().deserialize_uast(df.first()))
        uasts = list(UastDeserializer().deserialize_uast(row_uast))
        self.assertTrue(len(uasts_empty) == 0)
        self.assertTrue(len(uasts) > 0)
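
These examples all share the same transformer-pipeline shape: UastRow2Document() rewrites rows coming from the engine into document rows, UastDeserializer() parses the serialized UASTs they carry, and a downstream extractor or saver consumes the parsed trees. The sketch below condenses that shape into one small function; it is a sketch only, the import paths are assumptions about the sourced.ml package layout, and start_point stands for a UAST source built elsewhere (for example by create_uast_source).

# Minimal sketch of the pipeline shape shared by the examples below.
# The import paths are assumptions about the sourced.ml layout and may
# need adjusting for the installed version.
from sourced.ml.extractors import IdentifiersBagExtractor
from sourced.ml.transformers import (Collector, Uast2BagFeatures,
                                     UastDeserializer, UastRow2Document)


def count_identifier_bag_entries(start_point):
    # start_point is assumed to be a UAST source, e.g. from create_uast_source().
    bag_entries = start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures([IdentifiersBagExtractor()])) \
        .link(Collector()) \
        .execute()
    return len(bag_entries)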
Example #2
File: repos2coocc.py Project: warenlg/ml
def repos2coocc(args):
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(docfreq_threshold=args.min_docfreq,
                                           split_stem=args.split)
    session_name = "repos2coocc-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())

    df_model = create_or_load_ordered_df(
        args, ndocs, uast_extractor.link(Uast2BagFeatures(id_extractor)))

    token2index = root.session.sparkContext.broadcast(df_model.order)
    uast_extractor \
        .link(CooccConstructor(token2index=token2index,
                               token_parser=id_extractor.id2bag.token_parser,
                               namespace=id_extractor.NAMESPACE)) \
        .link(CooccModelSaver(args.output, df_model)) \
        .execute()
    pipeline_graph(args, log, root)
Example #3
def repos2df(args):
    log = logging.getLogger("repos2df")
    extractors = create_extractors_from_args(args)
    session_name = "repos2df-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uast_extractor.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quant.levels).save(args.quant)
    df = uast_extractor \
        .link(Uast2BagFeatures(extractors)) \
        .link(BagFeatures2DocFreq()) \
        .execute()
    log.info("Writing docfreq model to %s", args.docfreq_out)
    OrderedDocumentFrequencies().construct(ndocs, df).save(args.docfreq_out)
    pipeline_graph(args, log, root)
Example #4
File: repo2bow.py Project: zurk/vecino
def repo2bow(repository: str,
             repository_format: str,
             docfreq_threshold: int,
             docfreq: DocumentFrequencies,
             languages: List[str] = None,
             blacklist_languages=False,
             engine_kwargs: Dict[str, Any] = None) -> Dict[str, float]:
    log = logging.getLogger("repo2bow")
    token_index = {"i." + key: int(val) for (key, val) in docfreq}
    session_name = "repo2bow-%s" % uuid4()
    engine_args = {
        "repositories": repository,
        "repository_format": repository_format,
    }
    if engine_kwargs is not None:
        engine_args.update(engine_kwargs)
    engine = create_engine(session_name, **engine_args)
    root = Ignition(engine) >> RepositoriesFilter(r"^file://.*") >> HeadFiles()
    if languages is not None:
        file_source = root >> \
                      LanguageExtractor() >> \
                      LanguageSelector(languages=languages, blacklist=blacklist_languages)
    else:
        file_source = root
    bag = (file_source >> UastExtractor() >> Moder("repo") >>
           UastDeserializer() >> UastRow2Document() >> Uast2BagFeatures(
               IdentifiersBagExtractor(docfreq_threshold)) >>
           BagFeatures2TermFreq() >> TFIDF(
               token_index, docfreq.docs,
               engine.session.sparkContext) >> Collector()).execute()
    log.info("extracted %d identifiers", len(bag))
    return {r.token[2:]: r.value for r in bag}
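
Example #4 returns a mapping from identifier to its TF-IDF weight for one repository. A hedged call-site sketch follows; the file paths, the "siva" repository format and the DocumentFrequencies import location are illustrative assumptions rather than values taken from the project.

# Hypothetical call site for repo2bow; paths, format and import path are placeholders.
from sourced.ml.models import DocumentFrequencies

docfreq = DocumentFrequencies().load(source="docfreq.asdf")  # placeholder model path
bow = repo2bow(
    repository="/data/repos/sample",   # placeholder repository location
    repository_format="siva",          # assumed engine input format
    docfreq_threshold=5,
    docfreq=docfreq,
    languages=["Python", "Java"],
)
top10 = sorted(bow.items(), key=lambda kv: kv[1], reverse=True)[:10]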
Example #5
File: test_df_util.py Project: qakart/ml
    def test_error(self):
        with self.assertRaises(ValueError):
            create_or_load_ordered_df(argparse.Namespace(docfreq_in=None), 10, None)

        with self.assertRaises(ValueError):
            session = create_spark("test_df_util")
            uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
                .link(Moder("file")) \
                .link(UastRow2Document()) \
                .link(UastDeserializer()) \
                .link(Uast2BagFeatures(IdentifiersBagExtractor()))
            create_or_load_ordered_df(argparse.Namespace(docfreq_in=None), None, uast_extractor)
Example #6
def repos2roles_and_ids(args):
    log = logging.getLogger("repos2roles_and_ids")
    session_name = "repos2roles_and_ids-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures([RolesAndIdsExtractor(args.split)])) \
        .link(Rower(lambda x: dict(identifier=x[0][0], role=x[1]))) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
Example #7
    def test_create(self):
        session = create_spark("test_df_util")
        uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
            .link(UastRow2Document())
        ndocs = uast_extractor.link(Counter()).execute()
        uast_extractor = uast_extractor.link(UastDeserializer()) \
            .link(Uast2BagFeatures([IdentifiersBagExtractor()]))
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = os.path.join(tmpdir, "df.asdf")
            args = argparse.Namespace(docfreq_in=None, docfreq_out=tmp_path, min_docfreq=1,
                                      vocabulary_size=1000)
            df_model = create_or_load_ordered_df(args, ndocs, uast_extractor)
            self.assertEqual(df_model.docs, ndocs)
            self.assertTrue(os.path.exists(tmp_path))
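
The model written by create_or_load_ordered_df above can be read back with OrderedDocumentFrequencies; its docs and order attributes are the ones the other examples rely on. A short sketch, assuming the sourced.ml.models import path and a placeholder file name:

# Hedged sketch: reading the ordered document-frequency model back from disk.
from sourced.ml.models import OrderedDocumentFrequencies

df_model = OrderedDocumentFrequencies().load(source="df.asdf")  # placeholder path
print("documents:", df_model.docs)
print("vocabulary size:", len(df_model.order))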
Example #8
def repos2roles_and_ids(args):
    log = logging.getLogger("repos2roles_and_ids")
    session_name = "repos2roles_and_ids-%s" % uuid4()
    extractor = RoleIdsExtractor()
    root, start_point = create_uast_source(args, session_name)

    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2Features(extractor)) \
        .link(Rower(lambda x: {"identifier": x["roleids"][0], "role": x["roleids"][1]})) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
Example #9
def repos2bow_entry_template(args,
                             select=HeadFiles,
                             cache_hook=None,
                             save_hook=None):
    log = logging.getLogger("repos2bow")
    extractors = create_extractors_from_args(args)
    session_name = "repos2bow-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name, select=select)
    uast_extractor = start_point.link(Moder(args.mode)) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    if cache_hook is not None:
        uast_extractor.link(cache_hook()).execute()
    # We link UastRow2Document after Cacher here because cache_hook() may want to have all possible
    # Row items.
    uast_extractor = uast_extractor.link(UastRow2Document())
    log.info("Extracting UASTs and indexing documents...")
    document_indexer = Indexer(Uast2BagFeatures.Columns.document)
    uast_extractor.link(document_indexer).execute()
    ndocs = len(document_indexer)
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uast_extractor.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quant.levels).save(args.quant)
    uast_extractor = uast_extractor \
        .link(Uast2BagFeatures(extractors))
    log.info("Calculating the document frequencies...")
    df = uast_extractor.link(BagFeatures2DocFreq()).execute()
    log.info("Writing docfreq to %s", args.docfreq)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)
    bags_writer = uast_extractor \
        .link(BagFeatures2TermFreq()) \
        .link(TFIDF(df_model)) \
        .link(document_indexer) \
        .link(Indexer(Uast2BagFeatures.Columns.token, df_model.order))
    if save_hook is not None:
        bags_writer = bags_writer \
            .link(Repartitioner.maybe(args.partitions * 10, args.shuffle)) \
            .link(save_hook())
    bags_writer.link(BOWWriter(document_indexer, df_model, args.bow, args.batch)) \
        .execute()
    pipeline_graph(args, log, root)
Example #10
    def test_create(self):
        session = create_spark("test_quant_util")
        extractor = ChildrenBagExtractor()
        with tempfile.NamedTemporaryFile(mode="r+b", suffix="-quant.asdf") as tmp:
            path = tmp.name
            uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
                .link(Moder("file")) \
                .link(UastRow2Document()) \
                .link(UastDeserializer())
            create_or_apply_quant(path, [extractor], uast_extractor)
            self.assertIsNotNone(extractor.levels)
            self.assertTrue(os.path.exists(path))
            model_levels = QuantizationLevels().load(source=path)._levels["children"]
            for key in model_levels:
                self.assertListEqual(list(model_levels[key]), list(extractor.levels[key]))
Example #11
def repos2id_distance(args):
    log = logging.getLogger("repos2roles_and_ids")
    extractor = IdentifierDistance(args.split, args.type, args.max_distance)
    session_name = "repos2roles_and_ids-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(extractor)) \
        .link(Rower(lambda x: {"identifier1": x[0][0][0],
                               "identifier2": x[0][0][1],
                               "distance": x[1]})) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
Example #12
def repos2id_sequence(args):
    log = logging.getLogger("repos2id_distance")
    extractor = IdSequenceExtractor(args.split)
    session_name = "repos2roles_and_ids-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    if not args.skip_docname:
        mapper = Rower(lambda x: {"document": x[0][1], "identifiers": x[0][0]})
    else:
        mapper = Rower(lambda x: {"identifiers": x[0][0]})
    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(extractor)) \
        .link(mapper) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
Example #13
def code2vec(args):
    log = logging.getLogger("code2vec")
    session_name = "code2vec-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    res = start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures([UastPathsBagExtractor(args.max_length, args.max_width)])) \
        .link(Collector()) \
        .execute()

    # TODO: Add rest of data pipeline: extract distinct paths and terminal nodes for embedding mapping
    # TODO: Add transformer to write bags and vocabs to a model
    # TODO: Add ML pipeline

    pipeline_graph(args, log, root)
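
The first TODO above asks for the distinct paths and terminal nodes. The Rower lambdas in Examples #6, #11 and #12 index bag-feature items as ((token, document), value); under that assumption, a path-token vocabulary could be started from the collected res as sketched below (an illustration, not part of the original pipeline).

# Hedged sketch for the first TODO: collect distinct path tokens from `res`,
# assuming the ((token, document), value) item layout implied by the Rower
# lambdas used in the other examples.
path_vocab = {}
for (token, _document), _value in res:
    if token not in path_vocab:
        path_vocab[token] = len(path_vocab)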
Example #14
def source2bags(args):
    log = logging.getLogger("bags")
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    if not args.config:
        args.config = []
    try:
        cassandra_utils.configure(args)
        engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
        extractors = [
            __extractors__[s](args.min_docfreq,
                              **__extractors__[s].get_kwargs_fromcmdline(args))
            for s in args.feature
        ]
        pipeline = Engine(engine, explain=args.explain).link(
            DzhigurdaFiles(args.dzhigurda))
        uasts = pipeline.link(UastExtractor(languages=[args.language]))
        if args.persist is not None:
            uasts = uasts.link(Cacher(args.persist))
        uasts.link(MetadataSaver(args.keyspace, args.tables["meta"]))
        uasts = uasts.link(UastDeserializer())
        uasts.link(Repo2Quant(extractors, args.nb_partitions))
        uasts.link(Repo2DocFreq(extractors))
        pipeline.explode()
        bags = uasts.link(Repo2WeightedSet(extractors))
        if args.persist is not None:
            bags = bags.link(Cacher(args.persist))
        batcher = bags.link(BagsBatcher(extractors))
        batcher.link(BagsBatchSaver(args.batches, batcher))
        bags.link(BagsSaver(args.keyspace, args.tables["bags"]))
        bags.explode()
        log.info("Writing %s", args.docfreq)
        batcher.model.save(args.docfreq)
        if args.graph:
            log.info("Dumping the graph to %s", args.graph)
            with open(args.graph, "w") as f:
                pipeline.graph(stream=f)
    finally:
        if args.pause:
            input("Press Enter to exit...")
Example #15
def repos2bow_index_template(args):
    log = logging.getLogger("repos2bow_index")
    extractors = create_extractors_from_args(args)
    session_name = "repos2bow_index_features-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    uast_extractor = start_point.link(Moder(args.mode)) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(UastRow2Document()) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs and indexing documents ...")
    document_indexer = Indexer(Uast2BagFeatures.Columns.document)
    uast_extractor.link(document_indexer).execute()
    document_indexer.save_index(args.cached_index_path)
    ndocs = len(document_indexer)
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    if args.quant:
        create_or_apply_quant(args.quant, extractors, uast_extractor)
    if args.docfreq_out:
        create_or_load_ordered_df(args, ndocs, uast_extractor.link(Uast2BagFeatures(*extractors)))
    pipeline_graph(args, log, root)
Example #16
def repos2coocc_entry(args):
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(docfreq_threshold=args.min_docfreq,
                                           split_stem=args.split)
    session_name = "repos2coocc-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())

    df = uast_extractor \
        .link(Uast2BagFeatures([id_extractor])) \
        .link(BagFeatures2DocFreq()) \
        .execute()

    log.info("Writing document frequency model to %s...", args.docfreq)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)

    token2index = root.session.sparkContext.broadcast(df_model.order)
    uast_extractor \
        .link(CooccConstructor(token2index=token2index,
                               token_parser=id_extractor.id2bag.token_parser,
                               namespace=id_extractor.NAMESPACE)) \
        .link(CooccModelSaver(args.output, df_model)) \
        .execute()
    pipeline_graph(args, log, root)
Example #17
def repos2bow_template(args, cache_hook: Transformer = None,
                       save_hook: Transformer = None):

    log = logging.getLogger("repos2bow")
    extractors = create_extractors_from_args(args)
    session_name = "repos2bow-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    log.info("Loading the document index from %s ...", args.cached_index_path)
    docfreq = DocumentFrequencies().load(source=args.cached_index_path)
    document_index = {key: int(val) for (key, val) in docfreq}

    try:
        if args.quant is not None:
            create_or_apply_quant(args.quant, extractors, None)
        df_model = create_or_load_ordered_df(args, None, None)
    except ValueError:
        return 1
    ec = EngineConstants.Columns

    if args.mode == Moder.Options.repo:
        def keymap(r):
            return r[ec.RepositoryId]
    else:
        def keymap(r):
            return r[ec.RepositoryId] + UastRow2Document.REPO_PATH_SEP + \
                r[ec.Path] + UastRow2Document.PATH_BLOB_SEP + r[ec.BlobId]

    log.info("Caching UASTs to disk after partitioning by document ...")
    start_point = start_point.link(Moder(args.mode)) \
        .link(Repartitioner.maybe(args.num_iterations, keymap=keymap)) \
        .link(Cacher.maybe("DISK_ONLY"))
    for num_part in range(args.num_iterations):
        log.info("Running job %s of %s", num_part + 1, args.num_iterations)
        selected_part = start_point \
            .link(PartitionSelector(num_part))  \
            .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
            .link(Cacher.maybe(args.persist))
        if cache_hook is not None:
            selected_part.link(cache_hook()).execute()
        uast_extractor = selected_part \
            .link(UastRow2Document()) \
            .link(Cacher.maybe(args.persist))
        log.info("Collecting distinct documents ...")
        documents = uast_extractor \
            .link(FieldsSelector([Uast2BagFeatures.Columns.document])) \
            .link(Distinct()) \
            .link(Collector()) \
            .execute()
        selected_part.unpersist()
        documents = {row.document for row in documents}
        reduced_doc_index = {
            key: document_index[key] for key in document_index if key in documents}
        document_indexer = Indexer(Uast2BagFeatures.Columns.document, reduced_doc_index)
        log.info("Processing %s distinct documents", len(documents))
        bags = uast_extractor \
            .link(UastDeserializer()) \
            .link(Uast2BagFeatures(*extractors)) \
            .link(BagFeatures2TermFreq()) \
            .link(Cacher.maybe(args.persist))
        log.info("Extracting UASTs and collecting distinct tokens ...")
        tokens = bags \
            .link(FieldsSelector([Uast2BagFeatures.Columns.token])) \
            .link(Distinct()) \
            .link(Collector()) \
            .execute()
        uast_extractor.unpersist()
        tokens = {row.token for row in tokens}
        reduced_token_freq = {key: df_model[key] for key in df_model.df if key in tokens}
        reduced_token_index = {key: df_model.order[key] for key in df_model.df if key in tokens}
        log.info("Processing %s distinct tokens", len(reduced_token_freq))
        log.info("Indexing by document and token ...")
        bags_writer = bags \
            .link(TFIDF(reduced_token_freq, df_model.docs, root.session.sparkContext)) \
            .link(document_indexer) \
            .link(Indexer(Uast2BagFeatures.Columns.token, reduced_token_index))
        if save_hook is not None:
            bags_writer = bags_writer \
                .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
                .link(save_hook())
        bow = args.bow.split(".asdf")[0] + "_" + str(num_part + 1) + ".asdf"
        bags_writer \
            .link(Repartitioner.maybe(
                args.partitions, keymap=lambda x: x[Uast2BagFeatures.Columns.document])) \
            .link(BOWWriter(document_indexer, df_model, bow, args.batch)) \
            .execute()
        bags.unpersist()
    pipeline_graph(args, log, root)