def ImportFromLanguage(db: contentfiles.ContentFiles,
                       language: scrape_repos_pb2.LanguageToClone,
                       pool: multiprocessing.Pool) -> None:
  """Import contentfiles from a language specification.

  Args:
    db: The database to import to.
    language: The language to import.
    pool: A multiprocessing pool.

  Raises:
    ValueError: If importer field not set.
  """
  if not language.importer:
    raise ValueError('LanguageToClone.importer field not set')

  with db.Session() as session:
    # Note: iterdir() already yields paths prefixed with the parent directory,
    # so entries can be passed to ShouldImportRepo() directly. Re-joining them
    # with destination_directory would double the prefix for relative paths.
    repos_to_import = [
        f for f in pathlib.Path(language.destination_directory).iterdir()
        if ShouldImportRepo(session, f)
    ]
  random.shuffle(repos_to_import)
  logging.info('Importing %s %s repos ...',
               humanize.intcomma(len(repos_to_import)),
               language.language.capitalize())
  for metafile in repos_to_import:
    with db.Session(commit=True) as session:
      ImportRepo(session, language, metafile, pool)

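# A minimal usage sketch for ImportFromLanguage(), assuming a corpus that has
# already been cloned to disk and that the ContentFiles constructor accepts a
# database URL (sqlutil-style). The URL, directory, and pattern below are
# hypothetical example values, not the module's defaults.
def _ImportJavaSketch() -> None:
  db = contentfiles.ContentFiles('sqlite:////tmp/java_contentfiles.db')
  language = scrape_repos_pb2.LanguageToClone(
      language='java',
      destination_directory='/tmp/java_repos',  # Hypothetical clone dir.
      importer=[
          scrape_repos_pb2.ContentFilesImporterConfig(
              source_code_pattern='.*\\.java'),
      ],
  )
  with multiprocessing.Pool() as pool:
    ImportFromLanguage(db, language, pool)
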
def MaskOnMinStarCount(db: contentfiles.ContentFiles,
                       min_star_count: int) -> None:
  """Mask by the minimum repo star count.

  Args:
    db: The database to modify.
    min_star_count: The minimum number of stars for a repo to be active.
  """
  with db.Session(commit=not FLAGS.dry_run) as session:
    active_repo_count = (session.query(contentfiles.GitHubRepository).filter(
        contentfiles.GitHubRepository.active).count())
    repos_to_mark_inactive = (session.query(
        contentfiles.GitHubRepository).filter(
            contentfiles.GitHubRepository.active == True).filter(
                contentfiles.GitHubRepository.num_stars < min_star_count))
    repos_to_mark_inactive_count = repos_to_mark_inactive.count()
    app.Log(
        1,
        "Marking %s of %s active repos inactive (%.2f %%)",
        humanize.Commas(repos_to_mark_inactive_count),
        humanize.Commas(active_repo_count),
        (repos_to_mark_inactive_count / active_repo_count) * 100,
    )
    repos_to_mark_inactive.update({"active": False})

def MaskOnMaxRepoCount(db: contentfiles.ContentFiles,
                       max_repo_count: int) -> None:
  """Mask by the maximum number of repos.

  Args:
    db: The database to modify.
    max_repo_count: The maximum number of active repos.
  """
  with db.Session(commit=not FLAGS.dry_run) as session:
    active_repos = (session.query(
        contentfiles.GitHubRepository.clone_from_url).filter(
            contentfiles.GitHubRepository.active == True))
    active_repos_count = active_repos.count()
    repos_to_mark_inactive_count = max(0, active_repos_count - max_repo_count)
    repos_to_mark_inactive = active_repos.order_by(
        db.Random()).limit(repos_to_mark_inactive_count)
    app.Log(
        1,
        "Marking %s of %s active repos inactive (%.2f %%)",
        humanize.Commas(repos_to_mark_inactive_count),
        humanize.Commas(active_repos_count),
        (repos_to_mark_inactive_count / active_repos_count) * 100,
    )
    # Query.update() and Query.delete() cannot be called on a query once
    # limit() has been applied, so materialize the matching URLs and issue an
    # unbounded UPDATE constrained by an IN clause.
    clone_from_urls = {r[0] for r in repos_to_mark_inactive}
    session.query(contentfiles.GitHubRepository).filter(
        contentfiles.GitHubRepository.clone_from_url.in_(
            clone_from_urls)).update({"active": False},
                                     synchronize_session="fetch")

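# The materialize-then-IN pattern above is the standard workaround for
# SQLAlchemy's restriction that Query.update() and Query.delete() cannot be
# called on a query with LIMIT applied. A standalone sketch of the pattern,
# parameterized over a hypothetical mapped class; sql.func.random() is
# backend-dependent (SQLite/Postgres spell it random(), MySQL rand()).
def _DeactivateRandomRepos(session, repo_class, k: int) -> None:
  limited = session.query(repo_class.clone_from_url).order_by(
      sql.func.random()).limit(k)
  # Evaluate the limited query into plain Python values first ...
  urls = {row[0] for row in limited}
  # ... then issue an unbounded UPDATE constrained by an IN clause.
  session.query(repo_class).filter(
      repo_class.clone_from_url.in_(urls)).update(
          {"active": False}, synchronize_session="fetch")
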
def test_Exporter(
    db: contentfiles.ContentFiles, empty_db: contentfiles.ContentFiles
):
    """Test that exporter behaves as expected."""
    exporter = export_java_corpus.Exporter(db, empty_db, static_only=True)
    exporter.start()
    exporter.join()

    # The expected extracted method, assigned once so that the text and
    # charcount assertions share the same literal.
    expected_text = """\
public static void main(String[] args){
  System.out.println("Hello, world");
}
"""

    with empty_db.Session() as s:
        assert s.query(contentfiles.GitHubRepository).count() == 1
        assert s.query(contentfiles.ContentFile).count() == 1
        repo = s.query(contentfiles.GitHubRepository).first()
        assert repo.clone_from_url == "abc"
        contentfile = s.query(contentfiles.ContentFile).first()
        assert contentfile.sha256 != "000"
        assert contentfile.relpath == "foo"
        assert contentfile.text == expected_text
        assert contentfile.charcount == len(expected_text)
        assert contentfile.linecount == 4

def test_ImportFromLanguage_Java_repo(
    test_db: contentfiles.ContentFiles, tempdir: pathlib.Path
):
    """An end-to-end test of a Java importer."""
    (tempdir / "Owner_Name" / ".git").mkdir(parents=True)
    (tempdir / "Owner_Name" / "src").mkdir(parents=True)

    # A repo will only be imported if there is a repo meta file.
    pbutil.ToFile(
        scrape_repos_pb2.GitHubRepoMetadata(owner="Owner", name="Name"),
        tempdir / "Owner_Name.pbtxt",
    )

    # Create some files in our test repo.
    with open(tempdir / "Owner_Name" / "src" / "A.java", "w") as f:
        f.write(
            """
public class A {
  public static void helloWorld() {
    System.out.println("Hello, world!");
  }
}
"""
        )
    with open(tempdir / "Owner_Name" / "src" / "B.java", "w") as f:
        f.write(
            """
public class B {
  private static int foo() {return 5;}
}
"""
        )
    with open(tempdir / "Owner_Name" / "README.txt", "w") as f:
        f.write("Hello, world!")

    language = scrape_repos_pb2.LanguageToClone(
        language="foolang",
        query=[],
        destination_directory=str(tempdir),
        importer=[
            scrape_repos_pb2.ContentFilesImporterConfig(
                source_code_pattern=".*\\.java",
                preprocessor=[
                    "datasets.github.scrape_repos.preprocessors."
                    "extractors:JavaMethods"
                ],
            ),
        ],
    )
    importer.ImportFromLanguage(test_db, language)
    with test_db.Session() as session:
        query = session.query(contentfiles.ContentFile)
        assert query.count() == 2
        assert set([cf.text for cf in query]) == {
            (
                "public static void helloWorld(){\n"
                '  System.out.println("Hello, world!");\n}\n'
            ),
            "private static int foo(){\n  return 5;\n}\n",
        }

def ProcessRepo(
    input_db: contentfiles.ContentFiles,
    output_db: contentfiles.ContentFiles,
    clone_from_url: str,
    static_only: bool,
):
    """Preprocess all content files from a single scraped repo."""
    with input_db.Session(commit=True) as input_session:
        with output_db.Session(commit=True) as output_session:
            with tempfile.TemporaryDirectory(prefix="phd_") as d:
                DoProcessRepo(
                    input_session,
                    output_session,
                    clone_from_url,
                    pathlib.Path(d),
                    static_only,
                )

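# A sketch of a driver that applies ProcessRepo() to every active repo in the
# input database. It is deliberately sequential: the database handles are not
# assumed to be safe to share across worker processes. The helper name is
# hypothetical.
def _ExportAllRepos(
    input_db: contentfiles.ContentFiles,
    output_db: contentfiles.ContentFiles,
    static_only: bool = True,
) -> None:
    # Read the URLs up front so the read session is closed before the
    # per-repo write sessions open.
    with input_db.Session() as s:
        clone_from_urls = [
            row.clone_from_url
            for row in s.query(contentfiles.GitHubRepository.clone_from_url)
            .filter(contentfiles.GitHubRepository.active == True)
        ]
    for clone_from_url in clone_from_urls:
        ProcessRepo(input_db, output_db, clone_from_url, static_only)
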
def test_Exporter_overloaded_method_extraction(
    db: contentfiles.ContentFiles, empty_db: contentfiles.ContentFiles
):
    """Test that each overload of a method is extracted as its own artifact."""
    exporter = export_java_corpus.Exporter(db, empty_db, static_only=True)
    with db.Session(commit=True) as s:
        s.add(
            contentfiles.ContentFile(
                clone_from_url="abc",
                relpath="a/file.txt",
                artifact_index=0,
                sha256="000",
                charcount=200,
                linecount=10,
                text="""
public class HelloWorld {
  private static int foo(int a) {
    return 5;
  }
  private static int foo(float a) {
    return 5;
  }
  private static int foo(double a) {
    return 5;
  }
}
""",
            )
        )
    exporter.start()
    exporter.join()

    with empty_db.Session() as s:
        query = s.query(contentfiles.ContentFile).filter(
            contentfiles.ContentFile.relpath == "a/file.txt"
        )
        assert query.count() == 3
        for cf in query:
            assert "private static int foo(" in cf.text
        indices = {cf.artifact_index for cf in query}
        assert indices == {0, 1, 2}

def ProcessBatch(
    input_db: contentfiles.ContentFiles,
    pp_db: preprocessed.PreprocessedContentFile,
    outdir: pathlib.Path,
    ids: typing.List[int],
):
    """Preprocess and export the batch of content files with the given IDs."""
    with pp_db.Session(commit=True) as pp_session:
        with input_db.Session() as input_session:
            to_preprocess = pp_session.query(
                preprocessed.PreprocessedContentFile
            ).filter(preprocessed.PreprocessedContentFile.id.in_(ids))
            ProcessList(input_session, to_preprocess, outdir)

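# ProcessBatch() expects a bounded list of IDs, so a caller has to chunk the
# full ID list first. A minimal chunking helper; the default batch size is an
# arbitrary example value, not a tuned constant.
def _Chunk(
    ids: typing.List[int], batch_size: int = 512
) -> typing.Iterator[typing.List[int]]:
    """Yield successive batch_size-sized slices of the given ID list."""
    for i in range(0, len(ids), batch_size):
        yield ids[i : i + batch_size]
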
def test_PipelinedScraper_contentfiles_database_ignores_duplicates(
    language: scrape_repos_pb2.LanguageCloneList,
    query: scrape_repos_pb2.GitHubRepositoryQuery,
    connection: MockGitHubConnection,
    db: contentfiles.ContentFiles,
):
    """Test that running the scraper twice does not create duplicate rows."""
    scraper = pipelined_scraper.PipelinedScraper(language, query, connection, db)
    scraper.start()
    scraper.join()
    with db.Session() as session:
        original_contentfile_count = session.query(contentfiles.ContentFile).count()
        assert original_contentfile_count
    # Run the scraper again.
    scraper = pipelined_scraper.PipelinedScraper(language, query, connection, db)
    scraper.start()
    scraper.join()
    with db.Session() as session:
        assert (
            session.query(contentfiles.ContentFile).count()
            == original_contentfile_count
        )

def test_PipelinedScraper_contentfiles_database_repo_contents(
    language: scrape_repos_pb2.LanguageCloneList,
    query: scrape_repos_pb2.GitHubRepositoryQuery,
    connection: MockGitHubConnection,
    db: contentfiles.ContentFiles,
):
    """Test database contents."""
    # This test will fail if the contents of GitHub repository
    # https://github.com/ChrisCummins/empty_repository_for_testing change.
    scraper = pipelined_scraper.PipelinedScraper(language, query, connection, db)
    scraper.start()
    scraper.join()
    with db.Session() as session:
        assert session.query(contentfiles.GitHubRepository).count() == 1
        repo = session.query(contentfiles.GitHubRepository).first()
        assert repo.clone_from_url == (
            "https://github.com/ChrisCummins/empty_repository_for_testing.git"
        )

def ResetExported(db: contentfiles.ContentFiles) -> None:
  """Clear the exported flag on all repos in the database.

  Args:
    db: The database to modify.
  """
  with db.Session(commit=not FLAGS.dry_run) as session:
    exported_repos = session.query(contentfiles.GitHubRepository).filter(
        contentfiles.GitHubRepository.exported == True)
    exported_repos_count = exported_repos.count()
    repos_count = session.query(contentfiles.GitHubRepository).count()
    app.Log(
        1,
        "Marking %s of %s repos as not exported (%.2f %%)",
        humanize.Commas(exported_repos_count),
        humanize.Commas(repos_count),
        (exported_repos_count / repos_count) * 100,
    )
    exported_repos.update({"exported": False})

def MaskOnMaxRepoFileCount(db: contentfiles.ContentFiles,
                           max_repo_file_count: int) -> None:
  """Mask by the maximum repo file count.

  Args:
    db: The database to modify.
    max_repo_file_count: The maximum number of contentfiles in a repo for it
      to be active.
  """
  with db.Session(commit=not FLAGS.dry_run) as session:
    active_repo_count = (session.query(contentfiles.GitHubRepository).filter(
        contentfiles.GitHubRepository.active).count())
    repos_to_mark_inactive = (session.query(
        contentfiles.ContentFile.clone_from_url,
        sql.func.count(contentfiles.ContentFile.clone_from_url),
    ).join(contentfiles.GitHubRepository).filter(
        contentfiles.GitHubRepository.active == True).group_by(
            contentfiles.ContentFile.clone_from_url).having(
                sql.func.count(contentfiles.ContentFile.clone_from_url) >
                max_repo_file_count))
    repos_to_mark_inactive_count = repos_to_mark_inactive.count()
    app.Log(
        1,
        "Marking %s of %s active repos inactive (%.2f %%)",
        humanize.Commas(repos_to_mark_inactive_count),
        humanize.Commas(active_repo_count),
        (repos_to_mark_inactive_count / active_repo_count) * 100,
    )
    # Query.update() cannot be applied to an aggregated GROUP BY / HAVING
    # query, so materialize the URLs and update via an IN clause.
    clone_from_urls = {r.clone_from_url for r in repos_to_mark_inactive}
    session.query(contentfiles.GitHubRepository).filter(
        contentfiles.GitHubRepository.clone_from_url.in_(
            clone_from_urls)).update({"active": False},
                                     synchronize_session="fetch")

def Reset(db: contentfiles.ContentFiles) -> None:
  """Restore active status to database.

  Args:
    db: The database to modify.
  """
  with db.Session(commit=not FLAGS.dry_run) as session:
    inactive_repos = session.query(contentfiles.GitHubRepository).filter(
        contentfiles.GitHubRepository.active == False)
    inactive_repos_count = inactive_repos.count()
    repos_count = session.query(contentfiles.GitHubRepository).count()
    app.Log(
        1,
        "Restoring active status to %s of %s repos (%.2f %%)",
        humanize.Commas(inactive_repos_count),
        humanize.Commas(repos_count),
        (inactive_repos_count / repos_count) * 100,
    )
    inactive_repos.update({"active": True})

    inactive_cf = session.query(contentfiles.ContentFile).filter(
        contentfiles.ContentFile.active == False)
    inactive_cf_count = inactive_cf.count()
    cf_count = session.query(contentfiles.ContentFile).count()
    app.Log(
        1,
        "Restoring active status to %s of %s content files (%.2f %%)",
        humanize.Commas(inactive_cf_count),
        humanize.Commas(cf_count),
        (inactive_cf_count / cf_count) * 100,
    )
    inactive_cf.update({"active": True})

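# The Reset*() and MaskOn*() functions compose into a flag-driven entry point.
# A sketch of how such a main function might chain them; the flag names below
# are hypothetical, not the module's actual flags.
def _MainSketch(db: contentfiles.ContentFiles) -> None:
  if FLAGS.reset:  # Hypothetical flag.
    Reset(db)
  if FLAGS.min_star_count:  # Hypothetical flag.
    MaskOnMinStarCount(db, FLAGS.min_star_count)
  if FLAGS.max_repo_file_count:  # Hypothetical flag.
    MaskOnMaxRepoFileCount(db, FLAGS.max_repo_file_count)
  if FLAGS.max_repo_count:  # Hypothetical flag.
    MaskOnMaxRepoCount(db, FLAGS.max_repo_count)
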
def test_PipelinedScraper_contentfiles_database_contents(
    language: scrape_repos_pb2.LanguageCloneList,
    query: scrape_repos_pb2.GitHubRepositoryQuery,
    connection: MockGitHubConnection,
    db: contentfiles.ContentFiles,
):
    """Test database contents."""
    # This test will fail if the contents of GitHub repository
    # https://github.com/ChrisCummins/empty_repository_for_testing change.
    scraper = pipelined_scraper.PipelinedScraper(language, query, connection, db)
    scraper.start()
    scraper.join()
    with db.Session() as session:
        assert session.query(contentfiles.ContentFile).count() == 1
        contentfile = session.query(contentfiles.ContentFile).first()
        assert contentfile.clone_from_url == (
            "https://github.com/ChrisCummins/empty_repository_for_testing.git"
        )
        assert contentfile.relpath == "HelloWorld.java"
        assert contentfile.artifact_index == 0
        assert contentfile.text == HELLO_WORLD_TEXT
        assert contentfile.charcount == len(HELLO_WORLD_TEXT)
        assert contentfile.linecount == len(HELLO_WORLD_TEXT.split("\n"))

def PopulateBytecodeTable(
    cf: contentfiles.ContentFiles,
    language: str,
    db: bytecode_database.Database,
    pool: typing.Optional[multiprocessing.Pool] = None,
):
    """Process contentfiles of the given language into LLVM bytecodes."""
    # Only one process at a time can run this method.
    mutex = lockfile.AutoLockFile(granularity="function")

    # We use the database URL as the name of the source.
    source_name = cf.url

    # Read source files from the contentfiles database, process them into
    # bytecodes, and, if successful, write them into the database. We process
    # files sorted by their numeric ID in the contentfiles database, so that if
    # the job is interrupted we can resume from the last file processed.
    with db.Session() as s:
        # Get the ID of the last-processed bytecode file to resume from.
        resume_from = int(
            (
                s.query(bytecode_database.LlvmBytecode.relpath)
                .filter(bytecode_database.LlvmBytecode.source_name == cf.url)
                .filter(bytecode_database.LlvmBytecode.language == language)
                # Note the cast to integer: relpath is a string column, and
                # sorting it in its native type would sort lexicographically
                # (e.g. '9' > '10').
                .order_by(
                    sql.cast(bytecode_database.LlvmBytecode.relpath, sql.Integer).desc()
                )
                .limit(1)
                .first()
                or (0,)
            )[0]
        )

    with mutex, cf.Session() as cf_s, sqlutil.BufferedDatabaseWriter(
        db, max_buffer_length=10
    ) as writer:
        # Get the ID of the last contentfile to process.
        n = (
            cf_s.query(contentfiles.ContentFile.id)
            .join(contentfiles.GitHubRepository)
            .filter(contentfiles.GitHubRepository.language == language)
            .order_by(contentfiles.ContentFile.id.desc())
            .limit(1)
            .one_or_none()
            or (0,)
        )[0]
        app.Log(
            1,
            "Starting at row %s / %s",
            humanize.Commas(resume_from),
            humanize.Commas(n),
        )

        # A query to return the <id, text> tuples of files to process.
        q = (
            cf_s.query(contentfiles.ContentFile.id, contentfiles.ContentFile.text)
            .filter(contentfiles.ContentFile.id > resume_from)
            .join(contentfiles.GitHubRepository)
            .filter(contentfiles.GitHubRepository.language == language)
            .order_by(contentfiles.ContentFile.id)
        )

        row_batches = sqlutil.OffsetLimitBatchedQuery(q, batch_size=FLAGS.batch_size)

        for i, batch in zip(range(resume_from, n + 1), row_batches):
            app.Log(
                1,
                "Processing batch of %d contentfiles -> bytecodes, %s / %s (%.1f%%)",
                FLAGS.batch_size,
                humanize.Commas(i),
                humanize.Commas(n),
                (i / n) * 100,
            )
            protos = GetBytecodesFromContentFiles(source_name, language, batch.rows)
            writer.AddMany(
                [
                    bytecode_database.LlvmBytecode(
                        **bytecode_database.LlvmBytecode.FromProto(proto)
                    )
                    for proto in protos
                ]
            )

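# A minimal invocation sketch for PopulateBytecodeTable(). Because the
# function resumes from the highest relpath already recorded for this source,
# re-running it after an interruption is safe. The database URLs below are
# hypothetical, and the ContentFiles / Database constructors are assumed to
# accept a database URL string (sqlutil-style).
def _PopulateSketch() -> None:
    cf = contentfiles.ContentFiles("sqlite:////tmp/contentfiles.db")
    db = bytecode_database.Database("sqlite:////tmp/bytecodes.db")
    PopulateBytecodeTable(cf, language="c", db=db)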