Exemplo n.º 1
0
def test_ImportFromLanguage_Java_repo(
  test_db: contentfiles.ContentFiles, tempdir: pathlib.Path
):
  """Import a small fake Java repo end-to-end and check the stored methods.

  A repo directory with two Java sources and a README is laid out under
  `tempdir`, the importer is run with the JavaMethods preprocessor, and the
  database is expected to hold exactly the two extracted method bodies.
  """
  repo_dir = tempdir / "Owner_Name"
  src_dir = repo_dir / "src"
  (repo_dir / ".git").mkdir(parents=True)
  src_dir.mkdir(parents=True)

  # Without a repo meta file next to the repo directory nothing is imported.
  repo_meta = scrape_repos_pb2.GitHubRepoMetadata(owner="Owner", name="Name")
  pbutil.ToFile(repo_meta, tempdir / "Owner_Name.pbtxt")

  # Populate the test repo: two Java sources and one non-Java file.
  java_a = """
public class A {
  public static void helloWorld() {
    System.out.println("Hello, world!");
  }
}
"""
  java_b = """
public class B {
  private static int foo() {return 5;}
}
"""
  (src_dir / "A.java").write_text(java_a)
  (src_dir / "B.java").write_text(java_b)
  (repo_dir / "README.txt").write_text("Hello, world!")

  java_importer = scrape_repos_pb2.ContentFilesImporterConfig(
    source_code_pattern=".*\\.java",
    preprocessor=[
      "datasets.github.scrape_repos.preprocessors.extractors:JavaMethods"
    ],
  )
  language = scrape_repos_pb2.LanguageToClone(
    language="foolang",
    query=[],
    destination_directory=str(tempdir),
    importer=[java_importer],
  )
  importer.ImportFromLanguage(test_db, language)

  # One content file per Java method, with the text as the preprocessor
  # emits it.
  expected_methods = {
    'public static void helloWorld(){\n  System.out.println("Hello, world!");\n}\n',
    "private static int foo(){\n  return 5;\n}\n",
  }
  with test_db.Session() as session:
    query = session.query(contentfiles.ContentFile)
    assert query.count() == 2
    assert {cf.text for cf in query} == expected_methods
Exemplo n.º 2
0
def test_ImportFromLanguage_no_importer(tempdir: pathlib.Path):
  """Test that ValueError is raised if the language has no importer.

  Args:
    tempdir: Scratch directory used as the language's destination directory.
  """
  language = scrape_repos_pb2.LanguageToClone(
      language='test',
      query=[],
      destination_directory=str(tempdir),
      importer=[])
  # Own the pool with a context manager so its worker process is always
  # terminated, and build it outside pytest.raises so only an error from
  # ImportFromLanguage itself can satisfy the expectation.
  with multiprocessing.Pool(1) as pool:
    with pytest.raises(ValueError):
      indexer.ImportFromLanguage(language, pool)
Exemplo n.º 3
0
def test_ImportFromLanguage_no_importer(
  test_db: contentfiles.ContentFiles, tempdir: pathlib.Path
):
  """An empty importer list must make ImportFromLanguage raise ValueError."""
  no_importer_config = scrape_repos_pb2.LanguageToClone(
    language="test",
    query=[],
    destination_directory=str(tempdir),
    importer=[],
  )
  with test.Raises(ValueError):
    importer.ImportFromLanguage(test_db, no_importer_config)
Exemplo n.º 4
0
def test_ImportFromLanguage_Java_repo(tempdir: pathlib.Path):
  """An end-to-end test of a Java importer.

  Lays out a fake cloned repo under `tempdir / 'src'`, indexes it with the
  JavaMethods preprocessor, and checks that the repo's index directory holds
  one content file per extracted Java method.

  Args:
    tempdir: Scratch directory in which the fake repo is created.
  """
  clone_root = tempdir / 'src'
  repo_dir = clone_root / 'Owner_Name'
  meta_path = clone_root / 'Owner_Name.pbtxt'

  clone_root.mkdir()
  (repo_dir / '.git').mkdir(parents=True)
  (repo_dir / 'src').mkdir(parents=True)

  # A repo will only be imported if there is a repo meta file.
  pbutil.ToFile(scrape_repos_pb2.GitHubRepoMetadata(
      owner='Owner',
      name='Name'),
      meta_path)

  # Create some files in our test repo.
  with open(repo_dir / 'src' / 'A.java', 'w') as f:
    f.write("""
public class A {
  public static void helloWorld() {
    System.out.println("Hello, world!");
  }
}
""")
  with open(repo_dir / 'src' / 'B.java', 'w') as f:
    f.write("""
public class B {
  private static int foo() {return 5;}
}
""")
  with open(repo_dir / 'README.txt', 'w') as f:
    f.write('Hello, world!')

  language = scrape_repos_pb2.LanguageToClone(
      language='foolang',
      query=[],
      destination_directory=str(clone_root),
      importer=[
        scrape_repos_pb2.ContentFilesImporterConfig(
            source_code_pattern='.*\\.java',
            preprocessor=["datasets.github.scrape_repos.preprocessors."
                          "extractors:JavaMethods"]),
      ]
  )
  # Own the pool with a context manager so its worker process is terminated
  # once indexing returns instead of leaking for the rest of the test run.
  with multiprocessing.Pool(1) as pool:
    indexer.ImportFromLanguage(language, pool)

  test_repo = github_repo.GitHubRepo(meta_path)
  assert (test_repo.index_dir / 'DONE.txt').is_file()
  # NOTE(review): presumably DONE.txt plus one entry per extracted method.
  assert len(list(test_repo.index_dir.iterdir())) == 3
  # Named to avoid shadowing a `contentfiles` module, if one is imported.
  indexed_files = list(test_repo.ContentFiles())
  assert len(indexed_files) == 2
  assert {cf.text for cf in indexed_files} == {
    ('public static void helloWorld(){\n'
     '  System.out.println("Hello, world!");\n}\n'),
    'private static int foo(){\n  return 5;\n}\n',
  }
Exemplo n.º 5
0
def language(
  tempdir: pathlib.Path, query: scrape_repos_pb2.GitHubRepositoryQuery
) -> scrape_repos_pb2.LanguageToClone:
  """Build a Java LanguageToClone config for the given query.

  Args:
    tempdir: Directory used as the destination directory.
    query: The single repository query to attach to the language.

  Returns:
    A LanguageToClone for "java" with one importer matching `.java` files.
  """
  # The return annotation previously named LanguageCloneList, but the value
  # constructed here is a single LanguageToClone message.
  return scrape_repos_pb2.LanguageToClone(
    language="java",
    query=[query],
    destination_directory=str(tempdir),
    importer=[
      scrape_repos_pb2.ContentFilesImporterConfig(
        source_code_pattern=".*\\.java"
      )
    ],
  )
Exemplo n.º 6
0
def GetLanguageToClone(
  query_prefix: str, destination_dir: str
) -> scrape_repos_pb2.LanguageToClone:
  """Build a Java LanguageToClone config with a randomly chosen sort order.

  Args:
    query_prefix: Text placed at the start of the GitHub search string.
    destination_dir: Value used for the destination directory field.

  Returns:
    A LanguageToClone for "java" with a single non-fork repository query, an
    importer matching `.java` files, and the module's clone URL blacklist.
  """
  # See: https://help.github.com/en/articles/sorting-search-results
  sort_by = random.choice(["stars", "forks", "updated"])
  search_string = f"{query_prefix} language:java sort:{sort_by} fork:false"

  java_query = scrape_repos_pb2.GitHubRepositoryQuery(string=search_string)
  java_importer = scrape_repos_pb2.ContentFilesImporterConfig(
    source_code_pattern=".*\\.java"
  )
  return scrape_repos_pb2.LanguageToClone(
    language="java",
    query=[java_query],
    destination_directory=destination_dir,
    importer=[java_importer],
    clone_from_url_blacklist=BLACKLIST_GITHUB_REPOS,
  )