def test_ImportFromLanguage_Java_repo(
    test_db: contentfiles.ContentFiles, tempdir: pathlib.Path
):
    """End-to-end check that Java methods are imported from a cloned repo.

    Lays out a fake clone named ``Owner_Name`` under ``tempdir`` containing
    two Java sources and a README, runs the importer with the ``JavaMethods``
    preprocessor, and checks that exactly two content files with the expected
    method text end up in ``test_db``.
    """
    repo_root = tempdir / "Owner_Name"
    java_dir = repo_root / "src"
    (repo_root / ".git").mkdir(parents=True)
    java_dir.mkdir(parents=True)

    # A repo will only be imported if there is a repo meta file.
    repo_meta = scrape_repos_pb2.GitHubRepoMetadata(owner="Owner", name="Name")
    pbutil.ToFile(repo_meta, tempdir / "Owner_Name.pbtxt")

    # Populate the test repo: two Java sources and one non-Java file.
    (java_dir / "A.java").write_text(
        " public class A { public static void helloWorld() { "
        'System.out.println("Hello, world!"); } } '
    )
    (java_dir / "B.java").write_text(
        " public class B { private static int foo() {return 5;} } "
    )
    (repo_root / "README.txt").write_text("Hello, world!")

    java_importer = scrape_repos_pb2.ContentFilesImporterConfig(
        source_code_pattern=".*\\.java",
        preprocessor=[
            "datasets.github.scrape_repos.preprocessors.extractors:JavaMethods"
        ],
    )
    language_config = scrape_repos_pb2.LanguageToClone(
        language="foolang",
        query=[],
        destination_directory=str(tempdir),
        importer=[java_importer],
    )

    # NOTE(review): whitespace inside these literals is compared exactly --
    # confirm it matches the extractor's real output.
    expected_methods = {
        'public static void helloWorld(){\n System.out.println("Hello, world!");\n}\n',
        "private static int foo(){\n return 5;\n}\n",
    }

    importer.ImportFromLanguage(test_db, language_config)

    with test_db.Session() as session:
        query = session.query(contentfiles.ContentFile)
        assert query.count() == 2
        assert {cf.text for cf in query} == expected_methods
def test_ImportFromLanguage_no_importer(tempdir: pathlib.Path):
    """Test that error is raised if no importer specified.

    Builds a ``LanguageToClone`` message with an empty ``importer`` list and
    expects ``indexer.ImportFromLanguage`` to raise ``ValueError``.
    """
    language = scrape_repos_pb2.LanguageToClone(
        language='test', query=[], destination_directory=str(tempdir),
        importer=[])
    # Own the pool with a context manager so its worker process is shut down
    # even though the call under test is expected to raise.
    with multiprocessing.Pool(1) as pool:
        with pytest.raises(ValueError):
            indexer.ImportFromLanguage(language, pool)
def test_ImportFromLanguage_no_importer(
    test_db: contentfiles.ContentFiles, tempdir: pathlib.Path
):
    """An empty importer list must make the import fail with ValueError."""
    # A language config that is valid except for having no importers.
    no_importer_config = scrape_repos_pb2.LanguageToClone(
        language="test",
        query=[],
        destination_directory=str(tempdir),
        importer=[],
    )

    with test.Raises(ValueError):
        importer.ImportFromLanguage(test_db, no_importer_config)
def test_ImportFromLanguage_Java_repo(tempdir: pathlib.Path):
    """An end-to-end test of a Java importer.

    Lays out a fake clone named ``Owner_Name`` under ``tempdir / 'src'`` with
    two Java sources and a README, indexes it with the ``JavaMethods``
    preprocessor, and checks that the repo's index directory holds a
    ``DONE.txt`` marker plus two content files with the expected method text.
    """
    clone_root = tempdir / 'src'
    repo_dir = clone_root / 'Owner_Name'
    java_dir = repo_dir / 'src'
    meta_path = clone_root / 'Owner_Name.pbtxt'

    clone_root.mkdir()
    (repo_dir / '.git').mkdir(parents=True)
    java_dir.mkdir(parents=True)

    # A repo will only be imported if there is a repo meta file.
    pbutil.ToFile(
        scrape_repos_pb2.GitHubRepoMetadata(owner='Owner', name='Name'),
        meta_path)

    # Create some files in our test repo.
    (java_dir / 'A.java').write_text(
        ' public class A { public static void helloWorld() { '
        'System.out.println("Hello, world!"); } } '
    )
    (java_dir / 'B.java').write_text(
        ' public class B { private static int foo() {return 5;} } '
    )
    (repo_dir / 'README.txt').write_text('Hello, world!')

    language = scrape_repos_pb2.LanguageToClone(
        language='foolang',
        query=[],
        destination_directory=str(clone_root),
        importer=[
            scrape_repos_pb2.ContentFilesImporterConfig(
                source_code_pattern='.*\\.java',
                preprocessor=[
                    'datasets.github.scrape_repos.preprocessors.'
                    'extractors:JavaMethods'
                ]),
        ]
    )

    # Own the pool with a context manager so the worker process is always
    # shut down, including when the indexer or an assertion below fails.
    with multiprocessing.Pool(1) as pool:
        indexer.ImportFromLanguage(language, pool)

    test_repo = github_repo.GitHubRepo(meta_path)
    assert (test_repo.index_dir / 'DONE.txt').is_file()
    assert len(list(test_repo.index_dir.iterdir())) == 3

    # Named so that it cannot shadow a module called `contentfiles`.
    indexed_files = list(test_repo.ContentFiles())
    assert len(indexed_files) == 2
    # NOTE(review): whitespace inside these literals is compared exactly --
    # confirm it matches the extractor's real output.
    assert {cf.text for cf in indexed_files} == {
        ('public static void helloWorld(){\n'
         ' System.out.println("Hello, world!");\n}\n'),
        'private static int foo(){\n return 5;\n}\n',
    }
def language(
    tempdir: pathlib.Path, query: scrape_repos_pb2.GitHubRepositoryQuery
) -> scrape_repos_pb2.LanguageToClone:
    """Build a Java ``LanguageToClone`` message around a single query.

    Args:
        tempdir: Directory used as the clone destination.
        query: The repository query to place in the message.

    Returns:
        A ``LanguageToClone`` for language ``"java"`` whose only importer
        matches files against the pattern ``.*\\.java``.
    """
    # The annotation previously named LanguageCloneList, but the value built
    # here is a single LanguageToClone message.
    return scrape_repos_pb2.LanguageToClone(
        language="java",
        query=[query],
        destination_directory=str(tempdir),
        importer=[
            scrape_repos_pb2.ContentFilesImporterConfig(
                source_code_pattern=".*\\.java"
            )
        ],
    )
def GetLanguageToClone(
    query_prefix: str, destination_dir: str
) -> scrape_repos_pb2.LanguageToClone:
    """Build a Java clone config with a randomly chosen result ordering.

    Args:
        query_prefix: Text placed at the start of the search query string.
        destination_dir: Value used as the message's destination directory.

    Returns:
        A ``LanguageToClone`` for language ``"java"`` with one query, one
        importer matching ``.*\\.java``, and the module's URL blacklist.
    """
    # See: https://help.github.com/en/articles/sorting-search-results
    sort_orders = ("stars", "forks", "updated")
    sort_by = random.choice(sort_orders)

    search = scrape_repos_pb2.GitHubRepositoryQuery(
        string=f"{query_prefix} language:java sort:{sort_by} fork:false"
    )
    java_files = scrape_repos_pb2.ContentFilesImporterConfig(
        source_code_pattern=".*\\.java"
    )
    return scrape_repos_pb2.LanguageToClone(
        language="java",
        query=[search],
        destination_directory=destination_dir,
        importer=[java_files],
        clone_from_url_blacklist=BLACKLIST_GITHUB_REPOS,
    )