Example #1
def get_forks_info(origin: Repository, client: Client) -> List[ForkInfo]:
    """
    Requests forks and wraps them in 'ForkInfo' objects
    """
    result = []
    forks = origin.get_forks()
    log.info('got list of forks, total %d', forks.totalCount)
    client.count_rate_limit(1)
    try:
        rate_limits_check(client, forks.totalCount + pagination_correction(forks.totalCount, 30))
    except RateLimitError:
        return result
    for fork in forks:
        try:
            log.info('comparing fork: %s', fork.full_name)
            comparison = origin.compare(origin.owner.login + ":master", fork.owner.login + ":master")
            fi = ForkInfo(
                fork.html_url,
                abs(datetime.now() - fork.updated_at).days,
                fork.stargazers_count,
                comparison.ahead_by,
                comparison.behind_by,
            )
            result.append(fi)
        except UnknownObjectException as e:
            log.exception('possibly removed fork or user: %s, %d, message: %s', fork.html_url, e.status,
                          e.data.get('message', ''))
        except GithubException as e:
            message = e.data.get('message', '')
            if e.status == 404 and 'No common ancestor between ' in message:  # that can be handled
                log.error('404 %s', message)
                handle_github_exception(result, fork)
            else:
                log.exception('github error')
    client.count_rate_limit(forks.totalCount + pagination_correction(forks.totalCount, 30))
    return result
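
A minimal usage sketch for get_forks_info(), assuming PyGithub supplies the repository object; Client, ForkInfo, and the rate-limit helpers are project-specific, so the Client constructor call and the token/repository placeholders below are assumptions rather than the project's documented API.

from github import Github

gh = Github("<access-token>")               # PyGithub client; token is a placeholder
origin = gh.get_repo("owner/repository")    # placeholder upstream repository slug
client = Client(gh)                         # assumed constructor of the project's Client wrapper
for fi in get_forks_info(origin, client):   # empty list if the rate-limit check fails
    print(fi)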
Example #2
def __get_fork(fork_username: str, repo: _GithubRepository) -> _GithubRepository:
    forks = list(
        filter(lambda fork: fork.owner.login == fork_username, repo.get_forks())
    )
    if not forks:
        raise GithubAPIException("Requested fork doesn't exist")
    return forks[0]
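
A brief usage sketch, assuming __get_fork is reachable as a plain function (inside a class the double-underscore name would be mangled) and that the repository comes from an authenticated PyGithub client; the user and repository slugs are placeholders.

from github import Github

gh = Github("<access-token>")             # token is a placeholder
repo = gh.get_repo("owner/repository")    # upstream repository (placeholder slug)
fork = __get_fork("some-user", repo)      # raises GithubAPIException if "some-user" has no fork
print(fork.full_name)

Note that repo.get_forks() pages through every fork, so this lookup costs O(forks) API requests; fetching gh.get_repo(f"{fork_username}/{repo.name}") directly can be cheaper, at the cost of assuming the fork keeps the upstream repository name.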
Example #3
def get_forks_over_time(repo: Repository.Repository) -> pd.DataFrame:
    # TODO: for ~10k forks repositories, this operation is too costly for doing
    # it as part of each analyzer invocation. Move this to the fetcher, and
    # persist the data.
    log.info("fetch fork time series for repo %s", repo)

    reqlimit_before = GHUB.get_rate_limit().core.remaining
    log.info("GH request limit before operation: %s", reqlimit_before)

    forks = []
    for count, fork in enumerate(repo.get_forks(), 1):
        # Collect each fork's `Repository` object; log progress every 200 forks.
        forks.append(fork)
        if count % 200 == 0:
            log.info("%s forks fetched", count)

    reqlimit_after = GHUB.get_rate_limit().core.remaining
    log.info("GH request limit after operation: %s", reqlimit_after)
    log.info("http requests made (approximately): %s",
             reqlimit_before - reqlimit_after)
    log.info("current fork count: %s", len(forks))

    # The GitHub API returns ISO 8601 timestamp strings encoding the timezone
    # via the Z suffix, i.e. Zulu time, i.e. UTC. pygithub doesn't parse that
    # timezone. That is, whereas the API returns `starred_at` in UTC, the
    # datetime obj created by pygithub is a naive one. Correct for that.
    forktimes_aware = [
        pytz.timezone("UTC").localize(f.created_at) for f in forks
    ]

    # Create sorted pandas DatetimeIndex
    dtidx = pd.to_datetime(forktimes_aware)
    dtidx = dtidx.sort_values()

    # Each timestamp corresponds to *1* fork event. Build cumulative sum over
    # time.
    df = pd.DataFrame(
        data={"fork_events": [1] * len(forks)},
        index=dtidx,
    )
    df.index.name = "time"
    df["forks_cumulative"] = df["fork_events"].cumsum()
    df = df.drop(columns=["fork_events"]).astype(int)
    log.info("forks df: \n%s", df)
    return df
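
A hedged sketch of consuming the returned frame, for example by resampling the cumulative series to weekly resolution for plotting; only the forks_cumulative column name comes from the function above, while the resampling rule and the matplotlib usage are assumptions.

import matplotlib.pyplot as plt

df = get_forks_over_time(repo)   # repo: a PyGithub Repository object
# Weekly resolution: take the highest cumulative value seen in each week and
# carry it forward over weeks without any fork events.
weekly = df["forks_cumulative"].resample("1W").max().ffill()
weekly.plot(drawstyle="steps-post")
plt.ylabel("forks (cumulative)")
plt.show()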