Пример #1
0
def _CreateTestRepo(root_dir: pathlib.Path, owner: str, name: str) -> None:
  """Lay out a skeleton repository on disk for indexer tests.

  Creates the `<owner>_<name>/.git` and `<owner>_<name>/src` directories under
  `root_dir`, then writes a GitHubRepoMetadata proto for the repo to
  `<root_dir>/<owner>_<name>.pbtxt`.

  Args:
    root_dir: Directory in which the repo and its metafile are created.
    owner: The repository owner.
    name: The repository name.
  """
  repo_id = f'{owner}_{name}'
  repo_dir = root_dir / repo_id
  for subdir in ('.git', 'src'):
    (repo_dir / subdir).mkdir(parents=True)
  metadata = scrape_repos_pb2.GitHubRepoMetadata(owner=owner, name=name)
  pbutil.ToFile(metadata, root_dir / f'{repo_id}.pbtxt')
Пример #2
0
def CloneFromMetafile(metafile: pathlib.Path) -> None:
  """Clone the repository described by a metafile, next to that metafile.

  The clone is placed in `<metafile dir>/<owner>_<name>`. If that directory
  already contains a `.git` directory, nothing is done. A recursive clone
  (with submodules) is attempted first; if it fails with a submodule-related
  error, a plain clone is attempted instead. On final failure a warning is
  logged and any partial clone is removed.

  Args:
    metafile: Path to a GitHubRepoMetadata proto file.
  """
  meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
  # Both fields are required to build the clone directory name, so reject the
  # metafile if either one is missing.
  if not (meta.owner and meta.name):
    logging.error('Metafile missing owner and name fields %s', metafile)
    return
  clone_dir = metafile.parent / f'{meta.owner}_{meta.name}'
  logging.debug('%s', meta)
  if (clone_dir / '.git').is_dir():
    return

  # Remove anything left over from a previous attempt.
  subprocess.check_call(['rm', '-rf', str(clone_dir)])

  cmd = ['timeout', f'{FLAGS.repository_clone_timeout_minutes}m',
         '/usr/bin/git', 'clone', meta.clone_from_url, str(clone_dir)]
  logging.debug('$ %s', ' '.join(cmd))

  # Try to checkout the repository and submodules.
  p = subprocess.Popen(cmd + ['--recursive'], stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE, universal_newlines=True)
  _, stderr = p.communicate()
  if p.returncode and 'submodule' in stderr:
    # Remove the partial clone left by the failed recursive attempt.
    subprocess.check_call(['rm', '-rf', str(clone_dir)])
    # Try again, but this time without cloning submodules.
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    _, stderr = p.communicate()

  if p.returncode:
    # Give up.
    logging.warning('\nClone failed %s:\n%s', meta.clone_from_url, stderr)
    # Remove anything left over.
    subprocess.check_call(['rm', '-rf', str(clone_dir)])
Пример #3
0
def test_ImportFromLanguage_Java_repo(
  test_db: contentfiles.ContentFiles, tempdir: pathlib.Path
):
  """End-to-end check that the Java importer stores one entry per method."""
  repo_dir = tempdir / "Owner_Name"
  src_dir = repo_dir / "src"
  (repo_dir / ".git").mkdir(parents=True)
  src_dir.mkdir(parents=True)

  # Without a repo meta file the repo is not considered for import.
  repo_meta = scrape_repos_pb2.GitHubRepoMetadata(owner="Owner", name="Name")
  pbutil.ToFile(repo_meta, tempdir / "Owner_Name.pbtxt")

  # Populate the test repo: two Java sources and one non-Java file.
  (src_dir / "A.java").write_text(
    """
public class A {
  public static void helloWorld() {
    System.out.println("Hello, world!");
  }
}
"""
  )
  (src_dir / "B.java").write_text(
    """
public class B {
  private static int foo() {return 5;}
}
"""
  )
  (repo_dir / "README.txt").write_text("Hello, world!")

  java_importer = scrape_repos_pb2.ContentFilesImporterConfig(
    source_code_pattern=".*\\.java",
    preprocessor=[
      "datasets.github.scrape_repos.preprocessors.extractors:JavaMethods"
    ],
  )
  language = scrape_repos_pb2.LanguageToClone(
    language="foolang",
    query=[],
    destination_directory=str(tempdir),
    importer=[java_importer],
  )
  importer.ImportFromLanguage(test_db, language)

  expected_texts = {
    (
      "public static void helloWorld(){\n"
      '  System.out.println("Hello, world!");\n}\n'
    ),
    "private static int foo(){\n  return 5;\n}\n",
  }
  with test_db.Session() as session:
    query = session.query(contentfiles.ContentFile)
    assert query.count() == 2
    assert {cf.text for cf in query} == expected_texts
Пример #4
0
def ShouldImportRepo(session: orm.session.Session,
                     metafile: pathlib.Path) -> bool:
    """Decide whether the repository behind a metafile needs importing.

    All of the following must hold for the answer to be True:
      * The metafile exists and parses as a GitHubRepoMetadata proto.
      * The `<owner>_<name>` directory beside the metafile has a `.git`
        directory.
      * The repo is not yet present in the contentfiles database.

    Args:
      session: A database session used for the lookup.
      metafile: Path to the repo metafile.

    Returns:
      True if the repository should be imported, else False.
    """
    if not metafile.is_file():
        return False
    if not pbutil.ProtoIsReadable(metafile,
                                  scrape_repos_pb2.GitHubRepoMetadata()):
        return False

    meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
    repo_dir = metafile.parent / f'{meta.owner}_{meta.name}'
    if not (repo_dir / '.git').is_dir():
        return False

    already_imported = contentfiles.GitHubRepository.IsInDatabase(
        session, meta)
    return not already_imported
Пример #5
0
def GetRepositoryMetadata(
    repo: Repository.Repository) -> scrape_repos_pb2.GitHubRepoMetadata:
  """Get metadata about a GitHub repository.

  The scrape timestamp is taken from the current UTC time; every other field
  is copied from the corresponding attribute of `repo`.

  Args:
    repo: A Repository instance.

  Returns:
    A GitHubRepoMetadata instance.
  """
  # NOTE: the return annotation is the message type itself, not a call, so no
  # proto instance is constructed when this function is defined.
  meta = scrape_repos_pb2.GitHubRepoMetadata()
  meta.scraped_utc_epoch_ms = labdate.MillisecondsTimestamp(
      labdate.GetUtcMillisecondsNow())
  meta.owner = repo.owner.login
  meta.name = repo.name
  meta.num_watchers = repo.watchers_count
  meta.num_forks = repo.forks_count
  meta.num_stars = repo.stargazers_count
  meta.clone_from_url = repo.clone_url
  return meta
Пример #6
0
def _CreateTestRepo(root_dir: pathlib.Path, owner: str,
                    name: str) -> github_repo.GitHubRepo:
    """Build a skeleton repository on disk and return a handle to it.

    Creates the `<owner>_<name>/.git` and `<owner>_<name>/src` directories
    under `root_dir` and writes a GitHubRepoMetadata proto to
    `<root_dir>/<owner>_<name>.pbtxt`.

    Args:
      root_dir: Directory in which the repo and its metafile are created.
      owner: The repository owner.
      name: The repository name.

    Returns:
      A GitHubRepo constructed from the newly written metafile.
    """
    repo_id = f"{owner}_{name}"
    repo_dir = root_dir / repo_id
    for subdir in (".git", "src"):
        (repo_dir / subdir).mkdir(parents=True)

    metadata = scrape_repos_pb2.GitHubRepoMetadata(owner=owner, name=name)
    pbutil.ToFile(metadata, root_dir / f"{repo_id}.pbtxt")
    return github_repo.GitHubRepo(root_dir / f"{repo_id}.pbtxt")
Пример #7
0
def test_ImportFromLanguage_Java_repo(tempdir: pathlib.Path):
  """An end-to-end test of a Java importer.

  Builds a fake cloned repo under `<tempdir>/src`, runs the indexer over it,
  and checks that the repo's index directory holds a DONE.txt marker and one
  content file per Java method.
  """
  (tempdir / 'src').mkdir()
  (tempdir / 'src' / 'Owner_Name' / '.git').mkdir(parents=True)
  (tempdir / 'src' / 'Owner_Name' / 'src').mkdir(parents=True)

  # A repo will only be imported if there is a repo meta file.
  pbutil.ToFile(scrape_repos_pb2.GitHubRepoMetadata(
      owner='Owner',
      name='Name'),
      tempdir / 'src' / 'Owner_Name.pbtxt')

  # Create some files in our test repo.
  with open(tempdir / 'src' / 'Owner_Name' / 'src' / 'A.java', 'w') as f:
    f.write("""
public class A {
  public static void helloWorld() {
    System.out.println("Hello, world!");
  }
}
""")
  with open(tempdir / 'src' / 'Owner_Name' / 'src' / 'B.java', 'w') as f:
    f.write("""
public class B {
  private static int foo() {return 5;}
}
""")
  with open(tempdir / 'src' / 'Owner_Name' / 'README.txt', 'w') as f:
    f.write('Hello, world!')

  language = scrape_repos_pb2.LanguageToClone(
      language='foolang',
      query=[],
      destination_directory=str(tempdir / 'src'),
      importer=[
        scrape_repos_pb2.ContentFilesImporterConfig(
            source_code_pattern='.*\\.java',
            preprocessor=["datasets.github.scrape_repos.preprocessors."
                          "extractors:JavaMethods"]),
      ]
  )
  # Own the pool with a context manager so its worker process is released
  # even if the import raises, instead of leaking for the rest of the run.
  with multiprocessing.Pool(1) as pool:
    indexer.ImportFromLanguage(language, pool)

  test_repo = github_repo.GitHubRepo(tempdir / 'src' / 'Owner_Name.pbtxt')
  assert (test_repo.index_dir / 'DONE.txt').is_file()
  assert len(list(test_repo.index_dir.iterdir())) == 3
  contentfiles = list(test_repo.ContentFiles())
  assert len(contentfiles) == 2
  assert {cf.text for cf in contentfiles} == {
    ('public static void helloWorld(){\n'
     '  System.out.println("Hello, world!");\n}\n'),
    'private static int foo(){\n  return 5;\n}\n',
  }
Пример #8
0
def ImportRepo(session: orm.session.Session,
               language: scrape_repos_pb2.LanguageToClone,
               metafile: pathlib.Path, pool: multiprocessing.Pool) -> None:
    """Import the content files of one cloned repository into the database.

    For each importer of `language`, files under the clone directory that
    match the importer's source_code_pattern (excluding anything inside
    `.git`) are located with `find`, handed to ImportWorker through the pool,
    and the resulting outputs are added to the session.

    Processing stops entirely as soon as an importer has no
    source_code_pattern or matches no files.

    Args:
      session: A database session to import to.
      language: The language specification for the repo.
      metafile: The repo metafile.
      pool: A multiprocessing pool.
    """
    meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
    clone_dir = metafile.parent / f'{meta.owner}_{meta.name}'
    repo = contentfiles.GitHubRepository.GetOrAdd(session, meta)
    repo.language = language.language

    for importer in language.importer:
        pattern = importer.source_code_pattern
        if not pattern:
            logging.error('No source_code_pattern specified! Stopping now.')
            return

        # Drop a leading '^' anchor, then prefix the pattern with the clone
        # directory to form the regex passed to find.
        relative_pattern = pattern[1:] if pattern.startswith('^') else pattern
        path_regex = f'{clone_dir}/{relative_pattern}'
        find_cmd = ['find', str(clone_dir), '-type', 'f',
                    '-regex', path_regex,
                    '-not', '-path', '*/.git/*']
        logging.debug('$ %s', ' '.join(find_cmd))
        find_output = subprocess.check_output(find_cmd,
                                              universal_newlines=True)
        paths = find_output.rstrip().split('\n')
        # Empty output splits into a single empty string: nothing matched.
        if paths == ['']:
            logging.debug('No files to import from %s', clone_dir)
            return

        logging.info("Importing %s '%s' files from %s ...",
                     humanize.intcomma(len(paths)), pattern, clone_dir)
        all_files_relpaths = public.GetAllFilesRelativePaths(clone_dir)
        jobs = [
            scrape_repos_pb2.ImportWorker(
                clone_from_url=meta.clone_from_url,
                clone_dir=str(clone_dir),
                abspath=abspath,
                all_files_relpaths=all_files_relpaths,
                preprocessors=importer.preprocessor,
            ) for abspath in paths
        ]
        progress = progressbar.ProgressBar(max_value=len(jobs))
        for worker_outputs in progress(
                pool.imap_unordered(ImportWorker, jobs)):
            for contentfile in worker_outputs:
                session.add(contentfile)
Пример #9
0
  def ProcessRepo(self, repo: Repository.Repository) -> None:
    """Write the metafile for one repo unless a readable one already exists.

    Repos whose lower-cased clone URL appears in the language's
    clone_from_url_blacklist are logged but not written.

    Args:
      repo: A GitHub Repository instance.
    """
    meta_path = self.GetRepoMetaPath(repo)
    already_scraped = pbutil.ProtoIsReadable(
      meta_path, scrape_repos_pb2.GitHubRepoMetadata()
    )
    if already_scraped:
      return

    meta = GetRepositoryMetadata(repo)
    app.Log(2, "%s", meta)

    # Only persist metadata for URLs that are not blacklisted.
    if meta.clone_from_url.lower() not in self.language.clone_from_url_blacklist:
      pbutil.ToFile(meta, meta_path)
Пример #10
0
  def MakeRepositoryMetas(self,
                          repos: typing.List[Repository.Repository]) -> None:
    """Write a metafile for each repository that does not yet have one.

    The metafile for a repo lives at
    `<destination_directory>/<owner login>_<repo name>.pbtxt`. The counter
    `self.i` is incremented once per repository, whether or not a metafile is
    written.

    Args:
      repos: A list of GitHub Repository instances.
    """
    logging.debug('Scraping %s repositories', humanize.intcomma(len(repos)))
    for repo in repos:
      self.i += 1
      repo_id = '_'.join((repo.owner.login, repo.name))
      meta_path = pathlib.Path(
          str(self.destination_directory / repo_id) + '.pbtxt')
      if pbutil.ProtoIsReadable(meta_path,
                                scrape_repos_pb2.GitHubRepoMetadata()):
        # A readable metafile already exists; nothing to do for this repo.
        continue
      meta = GetRepositoryMetadata(repo)
      logging.debug('%s', meta)
      pbutil.ToFile(meta, meta_path)
Пример #11
0
  def __init__(self, metafile: pathlib.Path):
    """Instantiate a github repo.

    Sets `metafile`, `meta` (the parsed proto), `name` (`<owner>_<name>`),
    `clone_dir` (`<metafile dir>/<name>`) and `index_dir`
    (`<metafile dir>.index/<name>`).

    Args:
      metafile: The path to the github meta file proto.

    Raises:
      ValueError: In case the metafile cannot be read.
    """
    self.metafile: pathlib.Path = metafile
    try:
      self.meta: scrape_repos_pb2.GitHubRepoMetadata = pbutil.FromFile(
          metafile, scrape_repos_pb2.GitHubRepoMetadata())
    except pbutil.DecodeError as e:
      # Chain explicitly so the decode error is recorded as the direct cause.
      raise ValueError(
          f"Failed to read metafile '{self.metafile}' {e}") from e
    self.name: str = f'{self.meta.owner}_{self.meta.name}'
    self.clone_dir: pathlib.Path = metafile.parent / self.name
    # The index lives in a sibling of the metafile's directory, named by
    # appending '.index' to that directory's path.
    self.index_dir: pathlib.Path = (
        pathlib.Path(str(metafile.parent) + '.index') / self.name)
Пример #12
0
def CloneFromMetafile(metafile: pathlib.Path) -> None:
    """Clone the repository described by a metafile.

    Does nothing if the clone directory already contains a `.git` directory.
    Otherwise a shallow recursive clone (with submodules) is attempted first,
    then a shallow non-recursive clone. Before each attempt, and after the
    final failure, whatever is in the clone directory is removed. If both
    attempts fail a warning is logged.

    Args:
      metafile: Path to a GitHubRepoMetadata proto file.
    """
    meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
    clone_dir = GetCloneDir(metafile)
    if not clone_dir:
        app.Error("Failed to determine clone directory")
        # Without a clone directory there is nothing to clone into; stop here
        # instead of failing on `None / ".git"` below.
        return
    app.Log(2, "%s", meta)
    if (clone_dir / ".git").is_dir():
        return

    timeout_seconds = FLAGS.repository_clone_timeout_minutes * 60
    last_error = None
    # First try with submodules, then fall back to a plain clone.
    for recursive in (True, False):
        # Remove anything left over from a previous attempt.
        subprocess.check_call(["rm", "-rf", str(clone_dir)])
        try:
            git_clone.GitClone(
                meta.clone_from_url,
                clone_dir,
                shallow=True,
                recursive=recursive,
                timeout=timeout_seconds,
            )
        except git_clone.RepoCloneFailed as e:
            # Keep a reference: `e` is unbound once the except block exits.
            last_error = e
        else:
            return

    # Give up. Pass the error so both placeholders in the message are filled.
    app.Warning("\nClone failed %s:\n%s", meta.clone_from_url, last_error)
    # Remove anything left over.
    subprocess.check_call(["rm", "-rf", str(clone_dir)])
Пример #13
0
def IsRepoMetaFile(f: str):
  """Report whether a path is a file readable as a GitHubRepoMetadata proto.

  Args:
    f: The path to check.

  Returns:
    The falsy result of the file check if `f` is not a file, otherwise the
    result of attempting to read `f` as a GitHubRepoMetadata message.
  """
  is_file = fs.isfile(f)
  if not is_file:
    return is_file
  return pbutil.ProtoIsReadable(f, scrape_repos_pb2.GitHubRepoMetadata())
Пример #14
0
def GetCloneDir(metafile: pathlib.Path) -> Optional[pathlib.Path]:
    """Return the directory a metafile's repository is cloned into.

    Args:
      metafile: Path to a GitHubRepoMetadata proto file.

    Returns:
      `<metafile dir>/<owner>_<name>`, or None (after logging an error) if the
      metafile lacks an owner or a name.
    """
    meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
    # Both fields are required to build the directory name, so reject the
    # metafile if either one is missing.
    if not (meta.owner and meta.name):
        app.Error("Metafile missing owner and name fields %s", metafile)
        return None
    return metafile.parent / f"{meta.owner}_{meta.name}"