def get_modified_datasets(
    *,
    since: datetime | None = None,
    until: datetime | None = None,
) -> set[str]:
    """Retrieve the modified datasets.

    Requires to set GITHUB_ACCESS_TOKEN as an environment variable.

    Parameters
    ----------
    since : Optional[datetime], optional
        Start date from which commits are retrieved, by default date of the
        previous crawl, if never crawled set to one week ago.
    until : Optional[datetime], optional
        Latest date at which commit are retrieved, by default `now`

    Returns
    -------
    set[str]
        Path of the dataset folders.
    """
    now = datetime.now().astimezone()
    if since is None:
        if os.path.exists(".conp-archive"):
            # Resume from the timestamp recorded by the previous crawl.
            with open(".conp-archive") as fin:
                since = datetime.fromisoformat(fin.read().rstrip("\n"))
        else:
            since = now - timedelta(weeks=1)
    if until is None:
        until = now

    gh_access_token = os.environ.get("GITHUB_ACCESS_TOKEN")
    if gh_access_token is None:
        # The program is not stopped since GitHub allows 60 queries per hour
        # without authentication. However the program will most likely fail.
        logger.critical("GITHUB_ACCESS_TOKEN is not defined.")

    logger.info(f"Retrieving modified datasets since {since}")
    repo = Github(gh_access_token).get_repo("CONP-PCNO/conp-dataset")
    commits = repo.get_commits(since=since, until=until)

    # Record this crawl's timestamp so the next run only scans newer commits.
    # NOTE(review): written before the (lazy) commit listing is consumed, so a
    # later API failure still advances the archive date — confirm acceptable.
    with open(".conp-archive", "w") as fout:
        fout.write(now.isoformat())

    # Any file touched under projects/ marks its dataset as modified.
    modified_datasets: set[str] = {
        os.path.basename(file_.filename)
        for commit in commits
        for file_ in commit.files
        if file_.filename.startswith("projects/")
    }
    return modified_datasets
with open(FILE_CONTRIBUTIONS_COMMITS, "r") as fh: checked_commits = set(json.load(fh)) except Exception: checked_commits = set() for branch in repo.get_branches(): # Ignore branches from `IGNORE_BRANCHES` list if branch.name in IGNORE_BRANCHES: print("Skipping branch: {}".format(branch.name)) continue print("Reading branch: {}".format(branch.name)) for commit in repo.get_commits(branch.name, since=READ_SINCE, until=READ_UNTIL): message = commit.commit.message.strip().lower() # Ignore commits with messages starting with `merge` or # ending with `(#digits)` (considered to be pull request)` if (message.startswith("merge") or re.search(r"\(#\d+\)$", message)): continue sha = commit.commit.tree.sha if sha in checked_commits: continue checked_commits.add(sha)
def refresh_leaderboard_commits(id):
    """Populate a leaderboard with commits; also works if it is out of date.

    Parameters
    ----------
    id :
        Primary key of the ``models.Leaderboard`` to refresh.

    Returns
    -------
    bool | dict
        ``False`` when the leaderboard is closed, otherwise a dict with the
        number of commits fetched from GitHub (``"total"``) and the number of
        ``models.Commit`` rows newly created (``"new"``).
    """
    leaderboard = get_object_or_404(models.Leaderboard, id=id)
    if leaderboard.closed:
        return False

    # Use the leaderboard owner's GitHub token; fall back to unauthenticated
    # access (lower API rate limit) when no token is linked.
    token = SocialToken.objects.filter(
        account__user=leaderboard.owner, account__provider="github"
    ).values_list("token")
    if token:
        token = token[0][0]
    else:
        token = ""

    # Repo path is the URL path without the leading (and any trailing) slash,
    # e.g. https://github.com/owner/name/ -> "owner/name".
    # endswith() also avoids an IndexError when the path is empty.
    repo_str = urlparse(leaderboard.repo_url).path[1:]
    if repo_str.endswith("/"):
        repo_str = repo_str[:-1]

    if token:
        repo = Github(token).get_repo(repo_str)
    else:
        repo = Github().get_repo(repo_str)

    stored_commits = models.Commit.objects.filter(leaderboard=leaderboard).order_by(
        "-timestamp"
    )
    if stored_commits.exists():
        latest_commit = stored_commits[0]
    else:
        latest_commit = None

    # Collect commits newer than the latest one already stored. GitHub returns
    # commits newest-first, so stop at the first sha we already know.
    # BUG FIX: the original loop never broke out — after passing the known
    # sha it kept paginating through the repo's entire history and appended
    # every older commit as well.
    github_commits = []
    for commit in repo.get_commits():
        if latest_commit and commit.sha == latest_commit.nodeid:
            break
        github_commits.append(commit)

    updated = 0
    for commit in github_commits:
        if models.Commit.objects.filter(
            nodeid=commit.sha, leaderboard=leaderboard
        ).exists():
            # This if statement might be pointless because of the latest_commit
            # check above. Someone needs to test this and remove it if that is
            # the case.
            continue  # skip if commit object already exists
        # Missing author bug for some reason.
        # NOTE(review): this aborts the whole loop on the first author-less
        # commit — confirm whether `continue` (skip just that commit) was the
        # actual intent.
        if not commit.author:
            break
        models.Commit.objects.create(
            leaderboard=leaderboard,
            user=commit.author.login,
            nodeid=commit.sha,
            message=commit.commit.message,
            url=commit.url,
            html_url=commit.html_url,
            timestamp=commit.commit.author.date,
        )
        updated += 1

    return {"total": len(github_commits), "new": updated}