Exemplo n.º 1
0
def get_organization_memberships():
    """Refresh the organization memberships of all stored contributors.

    Contributors whose ``last_full_scan`` lies within the last two days are
    skipped. For every other contributor the organizations returned by the
    ``get_orgs`` call are attached, ``last_full_scan`` is updated and the
    change is committed per contributor.

    The session is always closed, even if a Github call or a commit fails.
    """
    session = new_session()
    try:
        tz = pytz.timezone('Europe/Berlin')
        # NOTE(review): the threshold is timezone-aware, while
        # `last_full_scan` is written below with the naive
        # `datetime.utcnow()` -- confirm the column yields aware datetimes.
        threshold = datetime.now(tz) - timedelta(days=2)

        contributors = session.query(Contributor).all()
        for contributor in contributors:
            if contributor.last_full_scan and contributor.last_full_scan > threshold:
                continue
            logger.info(
                f'Checking {contributor.login}. {github.github.rate_limiting[0]} remaining.'
            )

            github_user = call_github_function(github.github, 'get_user',
                                               [contributor.login])

            github_orgs = call_github_function(github_user, 'get_orgs')
            for org in github_orgs:
                organization = Organization.get_organization(
                    org.login, org.url, session)
                # A rescan sees the same organizations again; only attach
                # the ones that are not linked to the contributor yet.
                if organization not in contributor.organizations:
                    contributor.organizations.append(organization)
            contributor.last_full_scan = datetime.utcnow()
            session.add(contributor)
            session.commit()
    finally:
        session.close()
Exemplo n.º 2
0
def get_github_repository_users(full_name: str):
    """Run a contributor scan for every collaborator of a repository.

    The repository `full_name` is looked up, its collaborator listing is
    fetched completely and the collaborator logins are handed to a
    `github_contributor` manager that owns an initially empty
    `github_repository` sub manager.
    """
    github_repo = call_github_function(github.github, 'get_repo', [full_name])
    paginated = call_github_function(github_repo, 'get_collaborators')

    # Keep requesting data until the listing reports it cannot grow anymore.
    while True:
        if not paginated._couldGrow():
            break
        call_github_function(paginated, '_grow')

    logins = []
    for collaborator in paginated:
        logins.append(collaborator.login)

    repository_manager = Manager('github_repository', [])
    contributor_manager = Manager('github_contributor', logins,
                                  repository_manager)
    contributor_manager.start()
    contributor_manager.run()
Exemplo n.º 3
0
def get_user_data(user_data: tuple):
    """Fetch missing data (currently the location) for a single user.

    Args:
        user_data: Tuple whose first element is an object with a `login`
            attribute.

    Returns:
        A dict with a `message` key. On unexpected errors it additionally
        contains an `error` key with the formatted traceback.
    """
    # Bind these up front so the handlers and the cleanup below never hit an
    # unbound name when the failure happens very early.
    session = None
    login = None
    try:
        contributor = user_data[0]
        login = contributor.login

        session = new_session()
        contributor = Contributor.get_contributor(login, session, True)

        user = call_github_function(github.github, 'get_user', [login])
        if user.location:
            contributor.location = user.location

        session.add(contributor)
        session.commit()
        response = {'message': f'Scanned user {login}'}

    except GithubException as e:
        if e.status == 404:
            # Not found (just made private or deleted).
            response = {'message': f'User {login} not found.'}
        else:
            # Any other Github error previously left `response` unbound and
            # crashed on return; report it like every other failure.
            sentry.captureException()
            response = {
                'message': f'Error while getting repos for {login}:\n',
                'error': traceback.format_exc(),
            }

    except BaseException:
        # Catch any exception and report it, as we won't get any information
        # due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        if session is not None:
            session.close()

    return response
Exemplo n.º 4
0
def check_fork(github_repo, session, repository, scan_list, user_login=None):
    """Resolve the parent of a forked repository and queue it for scanning.

    Args:
        github_repo: Github repository object of the (potential) fork.
        session: Database session used to look up/create the parent.
        repository: Stored repository belonging to `github_repo`.
        scan_list: Set that receives the full names of parents to scan.
        user_login: Not used by this function.

    Returns:
        Always `None`; results are written to `repository` and `scan_list`.
    """
    if repository.completely_scanned:
        # Already scanned and not a fork: nothing to do.
        if not repository.fork:
            return
        # Already scanned fork: queue the parent if it is due.
        # NOTE(review): this case continues with the parent handling below
        # instead of returning -- confirm that this is intended.
        if repository.parent.should_scan():
            scan_list.add(github_repo.parent.full_name)

    # Load the parent from Github; a blocked or missing parent means the
    # repository is not treated as a fork.
    try:
        get_github_object(github_repo, 'parent')
        call_github_function(github_repo.parent, '_completeIfNeeded')
    except GithubException as error:
        if error.status in (451, 404):
            repository.fork = False
            return

    github_parent = github_repo.parent
    parent_repository = Repository.get_or_create(
        session,
        github_parent.ssh_url,
        name=github_parent.name,
        full_name=github_parent.full_name,
    )

    # Identical names: likely not a spite/hate fork.
    has_same_name = github_parent.name == github_repo.name
    if has_same_name:
        # Only set the parent if the stored repository has none yet.
        if not repository.parent:
            repository.parent = parent_repository

        # Flag the repository as a fork and queue the parent when due.
        repository.fork = True
        if parent_repository.should_scan():
            scan_list.add(parent_repository.full_name)

    session.add(repository)
Exemplo n.º 5
0
def get_user_with_followers(name: str):
    """Scan a user together with all followers and followed users.

    The logins of the user, the followers and the followed users are handed
    to a `github_contributor` manager. Afterwards `last_full_scan` is
    updated for every scanned contributor that is not flagged `too_big`.

    Args:
        name: Github login of the user to start from.
    """
    user = call_github_function(github.github, 'get_user', [name])
    followers = call_github_function(user, 'get_followers')
    following = call_github_function(user, 'get_following')

    # Collect the logins of the user, the followers and the followed people.
    # Followed users are deduplicated as we have to make as few API calls as
    # possible; a set keeps the membership test O(1) per entry.
    user_logins = [user.login]
    user_logins.extend(follower.login for follower in followers)
    known_logins = set(user_logins)
    for followed in following:
        login = followed.login
        if login not in known_logins:
            known_logins.add(login)
            user_logins.append(login)

    sub_manager = Manager('github_repository', [])
    manager = Manager('github_contributor', user_logins, sub_manager)
    manager.start()
    manager.run()

    # Create the session outside of the `try` so the cleanup never touches
    # an unbound name when the session cannot be created.
    session = new_session()
    try:
        for login in user_logins:
            contributor = session.query(Contributor) \
                .filter(Contributor.login.ilike(login)) \
                .one()
            if not contributor.too_big:
                contributor.last_full_scan = datetime.utcnow()
                session.add(contributor)
        session.commit()
    finally:
        session.close()
Exemplo n.º 6
0
def get_user(login: str):
    """Scan a single user by login name and record the scan time.

    The user is looked up on Github and handed to a `github_contributor`
    manager. Afterwards `last_full_scan` of the matching stored contributor
    is set to the current UTC time.

    Args:
        login: Github login of the user.
    """
    user = call_github_function(github.github, 'get_user', [login])
    sub_manager = Manager('github_repository', [])
    manager = Manager('github_contributor', [user.login], sub_manager)
    manager.start()
    manager.run()

    # Create the session outside of the `try`: if it cannot be created there
    # is nothing to close, and the original error is not masked by an
    # unbound `session` in the cleanup.
    session = db.get_session()
    try:
        contributor = session.query(Contributor) \
            .filter(Contributor.login.ilike(login)) \
            .one()
        contributor.last_full_scan = datetime.utcnow()
        session.add(contributor)
        session.commit()
    finally:
        session.close()
Exemplo n.º 7
0
    def get_github_author(self, email, git_commit, do_commit=True):
        """Link `email` to the Github account that authored `git_commit`.

        Nothing happens when there is no Github repository, when the email
        already has a contributor or when it is flagged as unknown.

        Args:
            email: Email entity whose `contributor` may get assigned.
            git_commit: Commit whose `hex` is used for the Github lookup.
            do_commit: Forwarded to `Contributor.get_contributor`.
        """
        if not self.github_repo:
            return
        if email.contributor is not None:
            return
        if email.unknown:
            return

        remote_commit = call_github_function(
            self.github_repo, 'get_commit', [git_commit.hex])

        author = remote_commit.author
        if not author or author is NotSet:
            return

        # Workaround for issue https://github.com/PyGithub/PyGithub/issues/279
        if author._url.value is None:
            sentry.captureMessage('Author has no _url', level='info')
            return

        email.contributor = Contributor.get_contributor(
            author.login,
            self.session,
            do_commit=do_commit,
        )
Exemplo n.º 8
0
def get_github_organization(name: str, members=False):
    """Scan the repositories (and optionally the members) of an organization.

    Every repository of the organization is stored; forks are passed to
    `check_fork`. Repositories that should be scanned are handed to a
    `github_repository` manager.

    Args:
        name: Name of the Github organization.
        members: If truthy, the logins of all organization members are
            handed to the `github_contributor` manager as well.
    """
    session = new_session()
    repos_to_scan = set()
    try:
        orga = call_github_function(github.github, 'get_organization', [name])

        # Get orga repos
        orga_repos = call_github_function(orga, 'get_repos')
        while orga_repos._couldGrow():
            call_github_function(orga_repos, '_grow')

        # Check orga repos
        for github_repo in orga_repos:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )
            if github_repo.fork:
                check_fork(github_repo, session, repository, repos_to_scan)
            session.add(repository)

            if not repository.should_scan():
                continue

            session.commit()
            repos_to_scan.add(github_repo.full_name)

        # Persist repositories that were added after the last in-loop commit
        # (those that do not need a scan); otherwise they would be dropped.
        session.commit()
    finally:
        # The session is not needed by the managers below.
        session.close()

    member_list = set()
    if members:
        # Get members
        github_members = call_github_function(orga, 'get_members')
        while github_members._couldGrow():
            call_github_function(github_members, '_grow')
        member_list = {m.login for m in github_members}

    # Create and start manager with orga repos and member_list
    sub_manager = Manager('github_repository', repos_to_scan)
    manager = Manager('github_contributor', member_list, sub_manager)
    manager.start()
    manager.run()
Exemplo n.º 9
0
def _prefetch_exceeds_limit(paginated, kind, user_login, skip):
    """Fetch the remaining data of `paginated`; return True if the user is too big.

    Args:
        paginated: Github listing that is grown until it cannot grow anymore.
        kind: Label used in the progress log message (`owned`/`starred`).
        user_login: Login of the user, only used for logging.
        skip: If falsy, the configured limit is never applied.
    """
    fetched = 0
    while paginated._couldGrow():
        # NOTE(review): the counter increases once per `_grow` call, not per
        # repository -- confirm the limit is meant to apply to this count.
        fetched += 1
        call_github_function(paginated, '_grow')

        # Debug messages to see that the repositories are still collected.
        if fetched % 100 == 0:
            logger.info(f'{fetched} {kind} repos for user {user_login}.')

        # The user is too big. Stop fetching.
        if skip and fetched > int(
                config['aggregator']['max_repositories_for_user']):
            return True

    return False


def _queue_repositories(github_repos, session, repos_to_scan, user_login,
                        commit_each):
    """Store each repository and collect the full names of those to scan.

    Args:
        github_repos: Iterable of Github repository objects.
        session: Database session the repositories are added to.
        repos_to_scan: Set that receives the full names to scan.
        user_login: Forwarded to `check_fork`.
        commit_each: If truthy, commit before queueing each repository.
    """
    for github_repo in github_repos:
        repository = Repository.get_or_create(
            session,
            github_repo.ssh_url,
            name=github_repo.name,
            full_name=github_repo.full_name,
        )
        if github_repo.fork and not repository.is_invalid():
            check_fork(github_repo, session, repository, repos_to_scan,
                       user_login)
        session.add(repository)

        if not repository.should_scan():
            continue

        if commit_each:
            session.commit()
        repos_to_scan.add(github_repo.full_name)


def get_user_repos(user_login: str, skip=True):
    """Collect all owned and starred repositories of a user that need a scan.

    Args:
        user_login: Github login of the user.
        skip: If truthy, users exceeding
            `config['aggregator']['max_repositories_for_user']` are flagged
            as too big and not processed.

    Returns:
        A dict with a `message` key. On success it also contains `tasks`
        (full names of the repositories to scan); on unexpected errors it
        contains `error` with the formatted traceback. For up-to-date or
        too big users the result of `user_up_to_date_message` /
        `user_too_big_message` is returned.
    """
    # Bound before the `try` so the cleanup works even if the session could
    # not be created.
    session = None
    try:
        session = new_session()
        contributor = Contributor.get_contributor(user_login, session, True)
        # Checks for already scanned users.
        if not contributor.should_scan():
            return user_up_to_date_message(user_login)
        if contributor.too_big:
            return user_too_big_message(user_login)

        user = call_github_function(github.github, 'get_user', [user_login])
        owned = user.get_repos()
        starred = user.get_starred()
        repos_to_scan = set()

        # Prefetch all owned, then all starred repositories. The starred
        # prefetch is skipped once the owned ones already exceed the limit.
        user_too_big = (
            _prefetch_exceeds_limit(owned, 'owned', user_login, skip)
            or _prefetch_exceeds_limit(starred, 'starred', user_login, skip)
        )

        # User has too many repositories. Flag him and return
        if user_too_big:
            contributor.too_big = True
            sentry.captureMessage(
                'User too big',
                extra={'url': contributor.login},
                level='info',
                tags={
                    'type': 'too_big',
                    'entity': 'user'
                },
            )
            session.add(contributor)
            session.commit()
            return user_too_big_message(user_login)

        # Check own repositories. We assume that we are collaborating in those
        _queue_repositories(owned, session, repos_to_scan, user_login,
                            commit_each=True)

        # Check stars and if the user collaborated to them.
        _queue_repositories(starred, session, repos_to_scan, user_login,
                            commit_each=False)

        session.commit()

        rate = github.github.get_rate_limit().core
        message = f'Got repositories for {user.login}. '
        message += f'{user.login}. {rate.remaining} of 5000 remaining.'
        response = {
            'message': message,
            'tasks': list(repos_to_scan),
        }
    except BaseException:
        # Catch any exception and report it, as we won't get any information
        # due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {user_login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        if session is not None:
            session.close()

    return response
Exemplo n.º 10
0
def get_github_repository(full_name: str):
    """Clone and scan a single repository and store the scan result.

    Repositories flagged as broken and repositories whose size exceeds
    `config['aggregator']['max_repository_size']` are skipped. The cloned
    git repository is deleted again once the scan finished or failed.

    Args:
        full_name: Full name of the Github repository.

    Returns:
        A dict with a `message` key, or the result of `error_message` when
        the scan failed.
    """
    # Bound up front so the cleanup in `finally` can tell what was actually
    # created instead of inspecting `locals()`.
    session = None
    owner = None
    github_repo = None
    try:
        session = new_session()
        # Sleep for a random time to avoid hitting the abuse detection.
        sleeptime = randrange(1, 15)
        sleep(sleeptime)

        github_repo = call_github_function(github.github, 'get_repo',
                                           [full_name], {'lazy': False})

        repository = Repository.get_or_create(
            session,
            github_repo.ssh_url,
            name=github_repo.name,
            full_name=github_repo.full_name,
        )

        if repository.broken:
            return {'message': f'Skip broken repo {github_repo.ssh_url}'}
        elif github_repo.size > int(config['aggregator']['max_repository_size']):
            repository.too_big = True
            session.add(repository)
            session.commit()
            sentry.captureMessage('Repo filesize too big', level='info',
                                  extra={'repo': repository.clone_url})

            return {'message': f'Repo too big (filesize): {github_repo.ssh_url}'}

        owner = get_github_object(github_repo, 'owner')
        git_repo = get_git_repository(
            github_repo.ssh_url,
            owner.login,
            github_repo.name,
        )
        scanner = CommitScanner(git_repo, session, github_repo)
        commit_count = scanner.scan_repository()

        breadcrumbs.record(
            data={'action': 'Commits scanned. Set repo metadata and debug output'},
            category='info',
        )

        repository = session.query(Repository).get(github_repo.ssh_url)
        rate = github.github.get_rate_limit().core
        reset_time = rate.reset.strftime("%H:%M")
        current_time = datetime.now().strftime('%H:%M')

        message = f'{current_time}: '
        message += f'Scanned {repository.clone_url} with {commit_count} commits.\n'
        message += f'{rate.remaining} of 5000 remaining. Reset at {reset_time}\n'

        response = {'message': message}

        repository.updated_at = datetime.now()
        session.add(repository)
        session.commit()

    except GithubException as e:
        # 451: Access denied. Repository probably gone private.
        # 404: User or repository just got deleted
        if e.status in (451, 404):
            repository = session.query(Repository) \
                .filter(Repository.full_name == full_name) \
                .one_or_none()

            if repository:
                repository.broken = True
                session.add(repository)
                session.commit()
            response = {'message': 'Repository access blocked.'}
        # Catch any other GithubException
        else:
            sentry.captureException()
            response = error_message('Error in get_repository:\n')

    except (GitError, UnicodeDecodeError):
        response = error_message('Error in get_repository:\n')

    except BaseException:
        # Catch any exception and report it, as we won't get any information
        # due to threading otherwise.
        sentry.captureException()
        response = error_message('Error in get_repository:\n')

    finally:
        try:
            # Only clean up the clone if we got far enough to know where it is.
            if owner is not None and github_repo is not None:
                delete_git_repository(owner.login, github_repo.name)
        finally:
            # Close the session even if removing the clone failed.
            if session is not None:
                session.close()

    return response