Exemplo n.º 1
0
def get_modified_datasets(
    *,
    since: datetime | None = None,
    until: datetime | None = None,
) -> set[str]:
    """Retrieve the datasets modified in the ``CONP-PCNO/conp-dataset`` repository.

    The commits made between `since` and `until` are listed through the GitHub
    API and every touched path located under ``projects/`` is collected.

    ``GITHUB_ACCESS_TOKEN`` should be set as an environment variable. When it is
    missing a critical message is logged and the query is attempted anonymously.

    On success, the start time of this call is saved to ``.conp-archive`` so
    that the next call without `since` resumes from there.

    Parameters
    ----------
    since : Optional[datetime], optional
        Start date from which commits are retrieved, by default the date of the
        previous crawl (read from ``.conp-archive``); if never crawled, or if
        that file cannot be parsed, one week ago.
    until : Optional[datetime], optional
        Latest date at which commits are retrieved, by default `now`.

    Returns
    -------
    set[str]
        Base names of the modified paths found under ``projects/``.
    """
    now = datetime.now().astimezone()

    if since is None:
        since = now - timedelta(weeks=1)
        if os.path.exists(".conp-archive"):
            with open(".conp-archive") as fin:
                stamp = fin.read().strip()
            try:
                since = datetime.fromisoformat(stamp)
            except ValueError:
                # An empty or corrupted checkpoint must not block every future
                # crawl: fall back to the "never crawled" default.
                logger.warning(
                    "Invalid timestamp in .conp-archive, falling back to one week ago."
                )

    if until is None:
        until = now

    gh_access_token = os.environ.get("GITHUB_ACCESS_TOKEN")
    if gh_access_token is None:
        # The program is not stopped since GitHub allows 60 requests per hour
        # without authentication. However the program will most likely fail.
        logger.critical("GITHUB_ACCESS_TOKEN is not defined.")

    logger.info("Retrieving modified datasets since %s", since)
    repo = Github(gh_access_token).get_repo("CONP-PCNO/conp-dataset")
    commits = repo.get_commits(since=since, until=until)

    # The commit list is fetched lazily while iterating, so this loop is where
    # the API requests can fail.
    modified_datasets: set[str] = set()
    for commit in commits:
        for file_ in commit.files:
            if file_.filename.startswith("projects/"):
                modified_datasets.add(os.path.basename(file_.filename))

    # Record the checkpoint only once the commits were fully retrieved;
    # otherwise a failed crawl would hide these commits from the next run.
    with open(".conp-archive", "w") as fout:
        fout.write(now.isoformat())

    return modified_datasets
Exemplo n.º 2
0
    with open(FILE_CONTRIBUTIONS_COMMITS, "r") as fh:
        checked_commits = set(json.load(fh))

except Exception:
    checked_commits = set()

# Walk every branch of the repository and register each relevant commit that
# has not been seen yet in `checked_commits`.
for branch in repo.get_branches():
    # Ignore branches from `IGNORE_BRANCHES` list
    if branch.name in IGNORE_BRANCHES:
        print("Skipping branch: {}".format(branch.name))
        continue

    print("Reading branch: {}".format(branch.name))

    # Only commits between READ_SINCE and READ_UNTIL are requested for the branch.
    for commit in repo.get_commits(branch.name,
                                   since=READ_SINCE,
                                   until=READ_UNTIL):

        # Normalise the message (trimmed, lower-cased) so that the checks
        # below are case-insensitive and ignore surrounding whitespace.
        message = commit.commit.message.strip().lower()

        # Ignore commits with messages starting with `merge` or
        # ending with `(#digits)` (considered to be pull requests)
        if (message.startswith("merge") or re.search(r"\(#\d+\)$", message)):
            continue

        # NOTE: this is the SHA of the commit's tree, not of the commit itself.
        # NOTE(review): presumably chosen so that identical snapshots reachable
        # from several branches are only handled once -- confirm.
        sha = commit.commit.tree.sha

        # Skip anything already present in `checked_commits` (either there
        # before the loop started or added by an earlier iteration).
        if sha in checked_commits:
            continue

        checked_commits.add(sha)
Exemplo n.º 3
0
def refresh_leaderboard_commits(id):
    """Populate a leaderboard with its repository's commits, or bring it up to date.

    Commits are listed from GitHub in the order the API returns them, and
    listing stops at the most recent commit already stored for the leaderboard
    (by ``timestamp``). Each listed commit that is not stored yet and has a
    GitHub author is saved as a ``models.Commit``.

    Parameters
    ----------
    id
        Primary key of the ``models.Leaderboard`` to refresh. A 404 is raised
        by ``get_object_or_404`` when it does not exist.

    Returns
    -------
    bool or dict
        ``False`` when the leaderboard is closed. Otherwise a dict with
        ``"total"``, the number of commits listed from GitHub before reaching
        the latest stored commit, and ``"new"``, the number of commits created.
    """
    leaderboard = get_object_or_404(models.Leaderboard, id=id)
    if leaderboard.closed:
        return False

    # Use the leaderboard owner's GitHub token when one is stored; otherwise
    # fall back to anonymous access.
    token = (
        SocialToken.objects.filter(
            account__user=leaderboard.owner, account__provider="github"
        )
        .values_list("token", flat=True)
        .first()
    )

    # "https://host/owner/name/" -> "owner/name". Stripping the slashes also
    # copes with a URL that has no path, where indexing the last character
    # would raise IndexError.
    repo_str = urlparse(leaderboard.repo_url).path.strip("/")

    client = Github(token) if token else Github()
    repo = client.get_repo(repo_str)

    latest_commit = (
        models.Commit.objects.filter(leaderboard=leaderboard)
        .order_by("-timestamp")
        .first()
    )
    latest_sha = latest_commit.nodeid if latest_commit is not None else None

    # Stop listing as soon as the latest stored commit shows up in GitHub's
    # response, so a refresh does not page through the whole history again.
    # With no stored commit (latest_sha is None) the full history is listed.
    fetched_commits = []
    for commit in repo.get_commits():
        if commit.sha == latest_sha:
            break
        fetched_commits.append(commit)

    updated = 0
    for commit in fetched_commits:
        # Safety net: a listed commit may already be stored, e.g. when stored
        # timestamps do not follow the order GitHub returns commits in.
        if models.Commit.objects.filter(
            nodeid=commit.sha, leaderboard=leaderboard
        ).exists():
            continue

        # Some commits come back without an author. Skip only those commits
        # rather than aborting, so they cannot block the import of the
        # remaining ones.
        if not commit.author:
            continue

        models.Commit.objects.create(
            leaderboard=leaderboard,
            user=commit.author.login,
            nodeid=commit.sha,
            message=commit.commit.message,
            url=commit.url,
            html_url=commit.html_url,
            timestamp=commit.commit.author.date,
        )
        updated += 1

    return {"total": len(fetched_commits), "new": updated}