def test_head_files(self):
    """The HEAD-files DataFrame must expose all the expected columns."""
    head_df = HeadFiles()(self.engine.repositories)
    first_row = head_df.first().asDict()
    present_columns = set(first_row.keys())
    for expected in ("commit_hash", "path", "content", "reference_name"):
        self.assertIn(expected, present_columns)
def test_uast_deserializer(self):
    """Rows without serialized UASTs deserialize to nothing; documentized rows do not."""
    head_df = HeadFiles()(self.engine.repositories)
    uast_df = UastExtractor()(head_df)
    document_row = UastRow2Document().documentize(uast_df.first())
    # A plain HEAD-files row carries no serialized UASTs.
    without_uast = list(UastDeserializer().deserialize_uast(head_df.first()))
    # A documentized row from the UAST DataFrame does.
    with_uast = list(UastDeserializer().deserialize_uast(document_row))
    self.assertTrue(len(without_uast) == 0)
    self.assertTrue(len(with_uast) > 0)
def test_language_selector(self):
    """LanguageSelector must honor both its blacklist and whitelist modes."""
    def distinct_languages(selector):
        # Classify HEAD files, filter with the given selector, and list the
        # distinct languages that survive.
        classified = HeadFiles()(self.engine.repositories).classify_languages()
        filtered = selector(classified)
        return [row.lang for row in filtered.select("lang").distinct().collect()]

    excluding_xml_yaml = LanguageSelector(languages=["XML", "YAML"], blacklist=True)
    self.assertEqual(
        distinct_languages(excluding_xml_yaml),
        ["Markdown", "Gradle", "Text", "INI", "Batchfile", "Python", "Java", "Shell"])

    only_python_java = LanguageSelector(languages=["Python", "Java"], blacklist=False)
    self.assertEqual(distinct_languages(only_python_java), ["Python", "Java"])
def test_parquet(self):
    """LanguageSelector must filter rows in both engine and parquet pipelines."""
    python_java = ["Python", "Java"]
    java_only = ["Java"]
    engine = create_engine("test", SIVA_DIR)

    def collected_languages(selector):
        # Run the standard extraction pipeline through the given selector
        # and return the set of languages of the collected rows.
        rows = (Ignition(engine)
                .link(HeadFiles())
                .link(LanguageExtractor())
                .link(selector)
                .link(Collector())
                .execute())
        return {row.lang for row in rows}

    self.assertEqual(collected_languages(LanguageSelector(python_java)),
                     set(python_java))
    self.assertEqual(collected_languages(LanguageSelector(java_only)),
                     set(java_only))
    # Blacklisting must leave no overlap with the blacklisted languages.
    self.assertEqual(
        set(),
        collected_languages(LanguageSelector(java_only, blacklist=True)) & set(java_only))
    # An empty whitelist selects nothing.
    self.assertEqual(set(), collected_languages(LanguageSelector([])))

    parquet_loader = create_parquet_loader("test_parquet", repositories=PARQUET_DIR)
    df = parquet_loader.execute()
    # The parquet DataFrame has no "lang" column yet, so selecting must fail.
    with self.assertRaises(AttributeError):
        LanguageSelector(python_java)(df)
    df_with_lang = df.withColumn("lang", lit("BestLang"))
    self.assertEqual(
        0, len(LanguageSelector(python_java)(df_with_lang).collect()))
    self.assertEqual(
        df_with_lang.collect(),
        LanguageSelector(["BestLang"])(df_with_lang).collect())
def repo2bow(repository: str, repository_format: str, docfreq_threshold: int,
             docfreq: DocumentFrequencies, languages: List[str] = None,
             blacklist_languages: bool = False,
             engine_kwargs: Dict[str, Any] = None) -> Dict[str, float]:
    """
    Extract the bag-of-words (identifier -> TF-IDF weight) from a single repository.

    :param repository: Path/URL of the repository to process (filtered to ``file://``).
    :param repository_format: Storage format of the repository, forwarded to the engine.
    :param docfreq_threshold: Minimum document frequency for an identifier to be kept
        by the bag extractor.
    :param docfreq: Precomputed document frequencies used for TF-IDF weighting.
    :param languages: If not None, select files by these languages; otherwise take all
        HEAD files without language filtering.
    :param blacklist_languages: Treat ``languages`` as a blacklist instead of a whitelist.
    :param engine_kwargs: Extra keyword arguments merged into the engine configuration.
    :return: Mapping from identifier token (without the internal ``"i."`` prefix)
        to its TF-IDF value.
    """
    log = logging.getLogger("repo2bow")
    # Namespace the docfreq keys with "i." (identifier bag); the prefix is
    # stripped again when building the returned mapping.
    token_index = {"i." + key: int(val) for (key, val) in docfreq}
    # Unique session name so concurrent invocations do not collide.
    session_name = "repo2bow-%s" % uuid4()
    engine_args = {
        "repositories": repository,
        "repository_format": repository_format,
    }
    if engine_kwargs is not None:
        engine_args.update(engine_kwargs)
    engine = create_engine(session_name, **engine_args)
    root = Ignition(engine) >> RepositoriesFilter(r"^file://.*") >> HeadFiles()
    if languages is not None:
        file_source = root >> \
            LanguageExtractor() >> \
            LanguageSelector(languages=languages, blacklist=blacklist_languages)
    else:
        file_source = root
    bag = (file_source >>
           UastExtractor() >>
           Moder("repo") >>
           UastDeserializer() >>
           UastRow2Document() >>
           Uast2BagFeatures(IdentifiersBagExtractor(docfreq_threshold)) >>
           BagFeatures2TermFreq() >>
           TFIDF(token_index, docfreq.docs, engine.session.sparkContext) >>
           Collector()).execute()
    log.info("extracted %d identifiers", len(bag))
    # Drop the "i." namespace prefix from every token.
    return {r.token[2:]: r.value for r in bag}
def test_uast_extractor(self):
    """UastExtractor must add a "uast" column to the HEAD-files DataFrame."""
    head_df = HeadFiles()(self.engine.repositories)
    extracted = UastExtractor()(head_df)
    self.assertIn("uast", extracted.columns)