Пример #1
0
def get_user_data(user_data: tuple):
    """Get all missing data from a user.

    Looks up the contributor by login, fetches the matching GitHub user and
    copies the user's location onto the contributor if one is set.

    Args:
        user_data: Tuple whose first element is a contributor object exposing
            a ``login`` attribute.

    Returns:
        dict: Always contains a ``message`` key. An ``error`` key holding the
        formatted traceback is added when an unexpected error occurred.
    """
    # Pre-bind every name the handlers and the ``finally`` clause touch, so a
    # failure early in the ``try`` body cannot cause an UnboundLocalError
    # that would hide the real problem.
    session = None
    login = None
    try:
        contributor = user_data[0]
        login = contributor.login

        session = new_session()
        contributor = Contributor.get_contributor(login, session, True)

        user = call_github_function(github.github, 'get_user', [login])
        if user.location:
            contributor.location = user.location

        session.add(contributor)
        session.commit()
        response = {'message': f'Scanned user {login}'}

    except GithubException as e:
        # Forbidden or not found (Just made private or deleted)
        if e.status == 404:
            response = {'message': f'User {login} not found.'}
        else:
            # Any other GitHub error previously left ``response`` unbound and
            # crashed on ``return``; report it and answer with an error dict.
            sentry.captureException()
            response = {
                'message': f'Github error while getting user data for {login}:\n',
                'error': traceback.format_exc(),
            }

    except BaseException:
        # Catch any exception and print it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        # The session only exists if ``new_session()`` succeeded.
        if session is not None:
            session.close()

    return response
Пример #2
0
    def get_commits_to_scan(self):
        """Collect the commits reachable from HEAD that still need scanning.

        The commit graph is walked breadth-first, starting at the object the
        repository's HEAD points to. Commits already stored for this
        repository are not returned; for a completely scanned repository the
        walk stops at the first such commit.

        Side effects: for every returned commit with exactly one parent, the
        diff against that parent is stored in ``self.diffs`` under the commit
        hash. If more than 100000 commits pile up, the repository is flagged
        as ``too_big`` and added to the session.

        Returns:
            list: Commits to scan, in visiting order. Empty if the repository
            is too big. If HEAD cannot be resolved, ``self.scanned_commits``
            is returned instead.
        """
        try:
            self.queue.appendleft(self.git_repo.head.get_object())
        except GitError:
            sentry.captureException(
                extra={
                    'message': 'GitError during repo cloning. Probably empty',
                    'clone_url': self.repository.clone_url,
                },
            )
            return self.scanned_commits

        pending = []
        # Hashes of every node visited so far, so that each is handled once.
        visited = set()

        # appendleft() + pop() makes the queue FIFO, i.e. a breadth-first walk.
        while self.queue:
            commit = self.queue.pop()
            commit_hash = commit.hex
            if commit_hash in visited:
                continue
            visited.add(commit_hash)

            already_stored = commit_hash in self.repository.commits_by_hash
            # A stored commit in a completely scanned repository ends the walk.
            if already_stored and self.repository.completely_scanned:
                break

            # In every other case the parents still have to be visited.
            for parent in commit.parents:
                self.queue.appendleft(parent)

            # Stored commit of a partially scanned repository: nothing to scan.
            if already_stored:
                continue

            parents = commit.parents
            if len(parents) == 1:
                self.diffs[commit_hash] = commit.tree.diff_to_tree(parents[0].tree)
            pending.append(commit)

            if len(pending) > 100000:
                sentry.captureMessage(
                    'Repository too big',
                    extra={'url': self.repository.clone_url},
                    level='info',
                    tags={'type': 'too_big', 'entity': 'repository'},
                )
                self.repository.too_big = True
                self.session.add(self.repository)
                return []

        return pending
Пример #3
0
 def run(self):
     """Process incoming tasks until a poison pill (``None``) is received.

     Each task is taken from ``self.task_queue``, called without arguments,
     and its return value is put on ``self.result_queue``. The loop pauses
     one second after every successful task. A ``KeyboardInterrupt`` stops
     the loop; any other exception is reported to sentry and the loop
     continues with the next task. No result is queued for a failed task.
     """
     while True:
         try:
             next_task = self.task_queue.get()
             # Poison pill received: Shutdown
             if next_task is None:
                 self.task_queue.task_done()
                 break
             try:
                 answer = next_task()
             finally:
                 # Mark the task as done even when it raised. Otherwise the
                 # queue's unfinished-task count never drops to zero and
                 # anything waiting on the queue's ``join()`` blocks forever.
                 self.task_queue.task_done()
             self.result_queue.put(answer)
             time.sleep(1)
         except KeyboardInterrupt:
             break
         except BaseException:
             sentry.captureException()
     return
Пример #4
0
    def scan_commit(self, git_commit, existing_commits):
        """Record a single git commit for the current repository.

        A commit whose hash is already a key of ``existing_commits`` is only
        linked to ``self.repository`` and re-added to the session. Otherwise
        a new ``Commit`` is built from the git metadata (emails, diff stats,
        author and commit timestamps) and added to the session. Any error
        during that creation is reported to sentry and swallowed.

        Args:
            git_commit: Git commit object providing ``hex``, ``author``,
                ``committer``, ``commit_time`` and ``commit_time_offset``.
            existing_commits: Mapping of commit hash to already stored commit.
        """
        commit_hash = git_commit.hex

        # Known commit: attach it to this repository and stop here.
        if commit_hash in existing_commits:
            known_commit = existing_commits[commit_hash]
            known_commit.repositories.append(self.repository)
            self.session.add(known_commit)
            return

        def aware_datetime(unix_time, utc_delta):
            """Build a datetime in the fixed-offset timezone ``utc_delta``."""
            return datetime.fromtimestamp(unix_time, timezone(utc_delta))

        # Unknown commit: gather all information and create it.
        try:
            author_mail = self.emails[git_commit.author.email]
            committer_mail = self.emails[git_commit.committer.email]
            new_commit = Commit(
                commit_hash,
                self.repository,
                author_mail,
                committer_mail,
            )

            # Diffs are only available for some commits; skip stats otherwise.
            commit_diff = self.diffs.get(commit_hash)
            if commit_diff:
                new_commit.additions = commit_diff.stats.insertions
                new_commit.deletions = commit_diff.stats.deletions

            author = git_commit.author
            if author:
                authored_at = author.time
                author_delta = timedelta(minutes=author.offset)
                new_commit.creation_time = aware_datetime(authored_at, author_delta)

            committed_at = git_commit.commit_time
            commit_delta = timedelta(minutes=git_commit.commit_time_offset)
            new_commit.commit_time = aware_datetime(committed_at, commit_delta)
            new_commit.commit_time_offset = commit_delta

            self.session.add(new_commit)
        except BaseException:
            sentry.captureException(
                extra={
                    'message': 'Error during Commit creation',
                    'clone_url': self.repository.clone_url,
                    'hex': commit_hash,
                },
            )
Пример #5
0
def _prefetch_all_pages(paginated, kind: str, user_login: str, skip: bool) -> bool:
    """Grow ``paginated`` until exhausted; return True if the user is too big.

    Args:
        paginated: Paginated GitHub result exposing ``_couldGrow``/``_grow``.
        kind: Label used in the progress log message ('owned' or 'starred').
        user_login: Login of the user, used for logging only.
        skip: When true, stop as soon as the number of fetch rounds exceeds
            ``config['aggregator']['max_repositories_for_user']``.
    """
    fetched = 0
    while paginated._couldGrow():
        fetched += 1
        call_github_function(paginated, '_grow')

        # Debug messages to see that the repositories are still collected.
        if fetched % 100 == 0:
            logger.info(f'{fetched} {kind} repos for user {user_login}.')

        # The user is too big. Just drop him.
        if skip and fetched > int(
                config['aggregator']['max_repositories_for_user']):
            return True

    return False


def get_user_repos(user_login: str, skip=True):
    """Get all relevant Information for a single user.

    Collects the user's owned and starred repositories, registers them in the
    database and determines which of them should be scanned.

    Args:
        user_login: GitHub login of the user.
        skip: When true, users exceeding the configured repository maximum
            are flagged as ``too_big`` and not processed any further.

    Returns:
        dict: On success a dict with ``message`` and ``tasks`` (full names of
        the repositories to scan). On an unexpected error a dict with
        ``message`` and ``error`` (formatted traceback). For up-to-date or
        too big users, the value of ``user_up_to_date_message`` or
        ``user_too_big_message`` is returned.
    """
    # Pre-bind the session so the ``finally`` clause is safe even when
    # ``new_session()`` itself fails.
    session = None
    try:
        session = new_session()
        contributor = Contributor.get_contributor(user_login, session, True)
        # Checks for already scanned users.
        if not contributor.should_scan():
            return user_up_to_date_message(user_login)
        if contributor.too_big:
            return user_too_big_message(user_login)

        user = call_github_function(github.github, 'get_user', [user_login])
        owned = user.get_repos()
        starred = user.get_starred()
        repos_to_scan = set()

        # Prefetch all owned repositories, then all starred ones. The starred
        # ones are skipped once the user is already known to be too big.
        user_too_big = _prefetch_all_pages(owned, 'owned', user_login, skip)
        if not user_too_big:
            user_too_big = _prefetch_all_pages(
                starred, 'starred', user_login, skip)

        # User has too many repositories. Flag him and return
        if user_too_big:
            contributor.too_big = True
            sentry.captureMessage(
                'User too big',
                extra={'url': contributor.login},
                level='info',
                tags={
                    'type': 'too_big',
                    'entity': 'user'
                },
            )
            session.add(contributor)
            session.commit()
            return user_too_big_message(user_login)

        # Check own repositories. We assume that we are collaborating in those
        for github_repo in owned:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )
            if github_repo.fork and not repository.is_invalid():
                check_fork(github_repo, session, repository, repos_to_scan,
                           user_login)
            session.add(repository)

            if not repository.should_scan():
                continue

            session.commit()
            repos_to_scan.add(github_repo.full_name)

        # Check stars and if the user collaborated to them.
        for github_repo in starred:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )

            if github_repo.fork and not repository.is_invalid():
                check_fork(github_repo, session, repository, repos_to_scan,
                           user_login)
            session.add(repository)

            if not repository.should_scan():
                continue

            repos_to_scan.add(github_repo.full_name)

        session.commit()

        rate = github.github.get_rate_limit().core
        message = f'Got repositories for {user.login}. '
        message += f'{user.login}. {rate.remaining} of 5000 remaining.'
        response = {
            'message': message,
            'tasks': list(repos_to_scan),
        }
    except BaseException:
        # Catch any exception and print it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {user_login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        # Also runs for the early returns above.
        if session is not None:
            session.close()

    return response
Пример #6
0
def get_github_repository(full_name: str):
    """Get all information from a single repository.

    Fetches the repository from GitHub, clones it, scans its commits and
    updates the stored repository metadata. The local clone is removed again
    afterwards and the database session is always closed.

    Args:
        full_name: Full name of the repository as passed to GitHub's
            ``get_repo``.

    Returns:
        dict: A dict with a ``message`` key on success or when the repository
        is skipped (broken, too big, access blocked). On errors the value of
        ``error_message('Error in get_repository:\\n')`` is returned.
    """
    # Pre-bind everything the ``finally`` clause inspects, so cleanup works
    # no matter how far the ``try`` body got.
    session = None
    owner = None
    github_repo = None
    try:
        session = new_session()
        # Sleep for a random time to avoid hitting the abuse detection.
        sleeptime = randrange(1, 15)
        sleep(sleeptime)

        github_repo = call_github_function(github.github, 'get_repo',
                                           [full_name], {'lazy': False})

        repository = Repository.get_or_create(
            session,
            github_repo.ssh_url,
            name=github_repo.name,
            full_name=github_repo.full_name,
        )

        if repository.broken:
            return {'message': f'Skip broken repo {github_repo.ssh_url}'}
        elif github_repo.size > int(config['aggregator']['max_repository_size']):
            repository.too_big = True
            session.add(repository)
            session.commit()
            sentry.captureMessage('Repo filesize too big', level='info',
                                  extra={'repo': repository.clone_url})

            return {'message': f'Repo too big (filesize): {github_repo.ssh_url}'}

        owner = get_github_object(github_repo, 'owner')
        git_repo = get_git_repository(
            github_repo.ssh_url,
            owner.login,
            github_repo.name,
        )
        scanner = CommitScanner(git_repo, session, github_repo)
        commit_count = scanner.scan_repository()

        breadcrumbs.record(
            data={'action': 'Commits scanned. Set repo metadata and debug output'},
            category='info',
        )

        # Re-read the repository by its clone url after the scan.
        repository = session.query(Repository).get(github_repo.ssh_url)
        rate = github.github.get_rate_limit().core
        reset_time = rate.reset.strftime("%H:%M")
        current_time = datetime.now().strftime('%H:%M')

        message = f'{current_time}: '
        message += f'Scanned {repository.clone_url} with {commit_count} commits.\n'
        message += f'{rate.remaining} of 5000 remaining. Reset at {reset_time}\n'

        response = {'message': message}

        repository.updated_at = datetime.now()
        session.add(repository)
        session.commit()

    except GithubException as e:
        # 451: Access denied. Repository probably gone private.
        # 404: User or repository just got deleted
        if e.status in (451, 404):
            repository = session.query(Repository) \
                .filter(Repository.full_name == full_name) \
                .one_or_none()

            if repository:
                repository.broken = True
                session.add(repository)
                session.commit()
            response = {'message': 'Repository access blocked.'}
        # Catch any other GithubException
        else:
            sentry.captureException()
            response = error_message('Error in get_repository:\n')

    except (GitError, UnicodeDecodeError):
        # These errors are answered with an error message but are not
        # reported to sentry.
        response = error_message('Error in get_repository:\n')

    except BaseException:
        # Catch any exception and print it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = error_message('Error in get_repository:\n')

    finally:
        try:
            # Cleanup is only attempted once both the owner and the GitHub
            # repository are known.
            if owner is not None and github_repo is not None:
                delete_git_repository(owner.login, github_repo.name)
        finally:
            # Close the session even if removing the clone failed.
            if session is not None:
                session.close()

    return response