Example #1
def test_call(self):
    content2ids = ContentToIdentifiers(split=False)
    ids2dataset = IdentifiersToDataset(idfreq=False)
    for data, result in zip(tfidf_data.datasets, tfidf_data.ids_result):
        # Build a DataFrame with the schema the transformer expects:
        # one row per file with content, path, repository_id and lang columns.
        df = self.sc.sparkContext \
            .parallelize(range(len(data["content"]))) \
            .map(lambda x: Row(content=data["content"][x], path=str(data["file"][x]),
                               repository_id=str(data["document"][x]), lang=data["lang"][x])) \
            .toDF()
        # Extract identifiers, flatten them into dataset rows and compare.
        rdd_processed = content2ids(df)
        self.assertEqual(result, set(ids2dataset(rdd_processed).collect()))
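
A minimal, self-contained sketch of the input contract this test exercises: ContentToIdentifiers consumes a DataFrame with content, path, repository_id and lang columns, and IdentifiersToDataset flattens its output into collectable rows. The literal row values, the local SparkSession setup and the sourced.ml.transformers import path are illustrative assumptions, not part of the test fixture.

from pyspark.sql import Row, SparkSession

from sourced.ml.transformers import ContentToIdentifiers, IdentifiersToDataset

spark = SparkSession.builder.master("local[*]").getOrCreate()
# One row per file; the column names match what ContentToIdentifiers expects.
df = spark.createDataFrame([
    Row(content="def add(x, y): return x + y", path="add.py",
        repository_id="repo1", lang="Python"),
])
# Both transformers are callable, as in the test above.
rows = IdentifiersToDataset(idfreq=False)(ContentToIdentifiers(split=False)(df)).collect()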
Example #2
def repos2ids(args):
    log = logging.getLogger("repos2ids")
    session_name = "repos2ids-%s" % uuid4()

    # Read raw files, extract their identifiers and save the result as CSV.
    root, start_point = create_file_source(args, session_name)
    start_point \
        .link(Repartitioner(args.partitions, args.shuffle)) \
        .link(ContentToIdentifiers(args.split)) \
        .link(IdentifiersToDataset(args.idfreq)) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
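
repos2ids itself only reads args.partitions, args.shuffle, args.split, args.idfreq and args.output; a minimal sketch of an argparse setup that satisfies those reads is below. The flag spellings and defaults are assumptions, and create_file_source and pipeline_graph consume additional attributes (such as the input location) that are not modelled here.

import argparse

parser = argparse.ArgumentParser(
    description="Extract identifiers from repositories into a CSV dataset.")
# Attribute names mirror the args.* reads in repos2ids above.
parser.add_argument("--partitions", type=int, default=200)
parser.add_argument("--shuffle", action="store_true")
parser.add_argument("--split", action="store_true")
parser.add_argument("--idfreq", action="store_true")
parser.add_argument("--output", required=True)
repos2ids(parser.parse_args())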
Example #3
def repos2ids(args):
    log = logging.getLogger("repos2ids")
    session_name = "repos2ids-%s" % uuid4()
    # Blacklist the "null" language, i.e. keep only rows with a detected
    # language; extract_uast=False skips UAST extraction since only the
    # file content is needed for identifier extraction.
    language_selector = LanguageSelector(languages=["null"], blacklist=True)
    root, start_point = create_uast_source(args,
                                           session_name,
                                           language_selector=language_selector,
                                           extract_uast=False)
    start_point \
        .link(Repartitioner(args.partitions, args.shuffle)) \
        .link(ContentToIdentifiers(args.split)) \
        .link(IdentifiersToDataset(args.idfreq)) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
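
The two repos2ids variants differ only in how the data source is built: Example #2 reads raw files through create_file_source, while Example #3 goes through create_uast_source with extract_uast=False, using a blacklist LanguageSelector to drop rows whose language was detected as "null". The identifier-extraction stages downstream of the source are identical.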