def test_PreprocessContentfiles():
    """Test preprocessing a basic input."""
    pp_cfs = preprocess_java_corpus.PreprocessContentfiles(
        [
            contentfiles.ContentFile(
                text="""
private static int Foobar(int foo) {
  int bar = 10 + 1;
  foo += bar;
  foo *= 2;
  return foo + 10;
}
"""
            )
        ]
    )
    assert len(pp_cfs) == 1
    assert (
        pp_cfs[0].text
        == """\
private static int fn_A(int a){
  int b=10 + 1;
  a+=b;
  a*=2;
  return a + 10;
}
"""
    )
    assert pp_cfs[0].preprocessing_succeeded
def test_PreprocessContentfiles_method_depends_on_java_util():
    """Test that a method which uses java.util.ArrayList works."""
    pp_cfs = preprocess_java_corpus.PreprocessContentfiles(
        [
            contentfiles.ContentFile(
                text="""
private static int Foobar(int a, ArrayList<Integer> _) {
  int b=10 + 1;
  a+=b;
  a*=2;
  return a + 10;
}
"""
            )
        ]
    )
    assert len(pp_cfs) == 1
    assert (
        pp_cfs[0].text
        == """\
private static int fn_A(int a,ArrayList<Integer> b){
  int c=10 + 1;
  a+=c;
  a*=2;
  return a + 10;
}
"""
    )
    assert pp_cfs[0].preprocessing_succeeded
def test_Exporter_overloaded_method_extraction(
    db: contentfiles.ContentFiles, empty_db: contentfiles.ContentFiles
):
    """Test that the exporter splits overloaded methods into separate artifacts."""
    exporter = export_java_corpus.Exporter(db, empty_db, static_only=True)
    with db.Session(commit=True) as s:
        s.add(
            contentfiles.ContentFile(
                clone_from_url="abc",
                relpath="a/file.txt",
                artifact_index=0,
                sha256="000",
                charcount=200,
                linecount=10,
                text="""
public class HelloWorld {
  private static int foo(int a) { return 5; }
  private static int foo(float a) { return 5; }
  private static int foo(double a) { return 5; }
}
""",
            )
        )
    exporter.start()
    exporter.join()
    with empty_db.Session() as s:
        query = s.query(contentfiles.ContentFile).filter(
            contentfiles.ContentFile.relpath == "a/file.txt"
        )
        # Each of the three overloads of foo() becomes its own content file.
        assert query.count() == 3
        for cf in query:
            assert "private static int foo(" in cf.text
        indices = {cf.artifact_index for cf in query}
        assert indices == {0, 1, 2}
def ImportWorker(
    job: scrape_repos_pb2.ImportWorker
) -> typing.List[contentfiles.ContentFile]:
    """Import a content file."""
    # Strip the clone directory prefix (plus the path separator) from the
    # absolute path to recover the repo-relative path.
    relpath = job.abspath[len(str(job.clone_dir)) + 1:]
    outputs: typing.List[contentfiles.ContentFile] = []
    try:
        texts = preprocessors.Preprocess(
            pathlib.Path(job.clone_dir),
            relpath,
            job.all_files_relpaths,
            job.preprocessors,
        )
        for i, text in enumerate(texts):
            sha256 = hashlib.sha256(text.encode('utf-8'))
            outputs.append(
                contentfiles.ContentFile(
                    clone_from_url=job.clone_from_url,
                    relpath=relpath,
                    artifact_index=i,
                    sha256=sha256.digest(),
                    charcount=len(text),
                    linecount=len(text.split('\n')),
                    text=text,
                )
            )
    except UnicodeDecodeError:
        logging.warning('Failed to decode %s', relpath)
    return outputs
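# A hedged usage sketch for ImportWorker(). The field names match those read
# by the function above; the field values and the preprocessor string are
# hypothetical examples, not names confirmed by this repository.
def _ExampleImportWorkerUsage() -> None:
    job = scrape_repos_pb2.ImportWorker(
        clone_from_url='https://github.com/foo/bar.git',  # Hypothetical repo.
        clone_dir='/tmp/clones/foo_bar',
        abspath='/tmp/clones/foo_bar/src/Main.java',
        all_files_relpaths=['src/Main.java'],
        preprocessors=['extractors:JavaMethods'],  # Hypothetical preprocessor.
    )
    # One ContentFile is produced per extracted artifact; a file that fails to
    # decode as UTF-8 yields an empty list.
    for cf in ImportWorker(job):
        print(cf.relpath, cf.artifact_index, cf.charcount)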
@pytest.fixture(scope="function")
def db(tempdir: pathlib.Path) -> contentfiles.ContentFiles:
    """A test fixture yielding a database pre-populated with one repo and file."""
    db_ = contentfiles.ContentFiles(f"sqlite:///{tempdir}/a")
    with db_.Session(commit=True) as session:
        session.add(
            contentfiles.GitHubRepository(
                owner="foo",
                name="bar",
                clone_from_url="abc",
                num_stars=0,
                num_forks=0,
                num_watchers=0,
                active=1,
                exported=0,
                date_scraped=datetime.datetime.utcnow(),
                language="java",
            )
        )
        session.add(
            contentfiles.ContentFile(
                clone_from_url="abc",
                relpath="foo",
                artifact_index=0,
                sha256="000",
                charcount=100,
                linecount=4,
                active=1,
                text="""
import java.util.ArrayList;

public class HelloWorld {
  private int foo(ArrayList<Integer> x) {
    return 5;
  }

  public static void main(String[] args) {
    System.out.println("Hello, world");
  }
}
""",
            )
        )
    return db_
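# The exporter test above also requires an empty_db fixture to export into.
# A minimal sketch, assuming the same ContentFiles constructor; the fixture
# name matches the test signature, but the "b" database filename is
# illustrative.
import pytest


@pytest.fixture(scope="function")
def empty_db(tempdir: pathlib.Path) -> contentfiles.ContentFiles:
    """A test fixture yielding an empty database to export into."""
    return contentfiles.ContentFiles(f"sqlite:///{tempdir}/b")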
def DoProcessRepo(
    input_session: sqlutil.Session,
    output_session: sqlutil.Session,
    clone_from_url: str,
    working_dir: pathlib.Path,
    static_only: bool,
) -> None:
    """Preprocess all content files from a single scraped repo."""
    candidate_contentfiles = input_session.query(
        contentfiles.ContentFile.relpath, contentfiles.ContentFile.text
    ).filter(contentfiles.ContentFile.clone_from_url == clone_from_url)
    contentfiles_to_export = (
        candidate_contentfiles.filter(
            contentfiles.ContentFile.linecount >= FLAGS.min_line_count
        )
        .filter(contentfiles.ContentFile.charcount >= FLAGS.min_char_count)
        .all()
    )
    app.Log(
        2,
        "Exporting %s of %s content files from %s",
        humanize.Commas(len(contentfiles_to_export)),
        humanize.Commas(candidate_contentfiles.count()),
        clone_from_url,
    )

    # Create the directory tree first.
    for relpath, method_text in contentfiles_to_export:
        path = working_dir / relpath
        path.parent.mkdir(parents=True, exist_ok=True)
        fs.Write(path, method_text.encode("utf-8"), overwrite_existing=False)

    # Copy repo to output.
    repo = input_session.query(contentfiles.GitHubRepository).filter(
        contentfiles.GitHubRepository.clone_from_url == clone_from_url
    )
    ImportQueryResults(repo, output_session)

    # Run the preprocessors.
    methods_lists = extractors.BatchedMethodExtractor(
        [text for _, text in contentfiles_to_export]
    )
    relpath_counters = collections.defaultdict(int)
    for (relpath, text), methods in zip(contentfiles_to_export, methods_lists):
        # Attempt to extract all imports for this content file.
        # NOTE(2019-06-28): Disabled import inlining to simplify the synthesis
        # pipeline. We may wish to revisit this at a later date.
        # imports = GetJavaImports(text)
        for i, original_method_text in enumerate(methods):
            # Insert "//import ..." comments before each method so that we know
            # which packages must be imported.
            # NOTE(2019-06-28): Disabled import inlining to simplify the
            # synthesis pipeline. We may wish to revisit this at a later date.
            # method_text = InsertImportCommentHeader(original_method_text, imports)
            method_text = original_method_text
            encoded_text = method_text.encode("ascii", "ignore")
            sha256 = hashlib.sha256(encoded_text).hexdigest()
            method_text = encoded_text.decode("ascii")
            # Add new contentfile.
            output_session.add(
                contentfiles.ContentFile(
                    clone_from_url=clone_from_url,
                    relpath=relpath,
                    artifact_index=relpath_counters[relpath],
                    sha256=sha256,
                    charcount=len(original_method_text),
                    linecount=len(original_method_text.split("\n")),
                    text=method_text,
                )
            )
            relpath_counters[relpath] += 1

    # Mark repo as exported.
    repo.update({"exported": True})
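# A standalone sketch of the per-relpath artifact numbering that
# DoProcessRepo() uses above: each method extracted from the same source file
# receives the next artifact_index for that relpath, which is exactly what
# test_Exporter_overloaded_method_extraction expects (indices {0, 1, 2}).
# The relpaths below are illustrative only.
import collections

relpath_counters = collections.defaultdict(int)
for relpath in ["a/file.txt", "a/file.txt", "a/file.txt", "b/file.txt"]:
    artifact_index = relpath_counters[relpath]  # 0, 1, 2, then 0 for "b".
    relpath_counters[relpath] += 1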