def get_organization_memberships():
    """Refresh the organization memberships of all stored contributors.

    Contributors whose ``last_full_scan`` is less than two days old are
    skipped. For every other contributor the organizations are requested
    from GitHub and attached to the contributor, ``last_full_scan`` is
    updated and the change is committed.
    """
    session = new_session()
    try:
        tz = pytz.timezone('Europe/Berlin')
        now = datetime.now(tz)
        # NOTE(review): ``now`` is timezone-aware, while ``last_full_scan`` is
        # written below from the naive ``datetime.utcnow()``. Confirm that the
        # column hands back aware datetimes, otherwise the comparison in the
        # loop raises a TypeError.
        scan_threshold = now - timedelta(days=2)

        contributors = session.query(Contributor).all()
        for contributor in contributors:
            if contributor.last_full_scan and contributor.last_full_scan > scan_threshold:
                continue

            logger.info(
                f'Checking {contributor.login}. {github.github.rate_limiting[0]} remaining.'
            )

            github_user = call_github_function(github.github, 'get_user',
                                               [contributor.login])
            github_orgs = call_github_function(github_user, 'get_orgs')
            for org in github_orgs:
                organization = Organization.get_organization(
                    org.login, org.url, session)
                # A refresh must not attach the same organization twice.
                if organization not in contributor.organizations:
                    contributor.organizations.append(organization)

            contributor.last_full_scan = datetime.utcnow()
            session.add(contributor)
            session.commit()
    finally:
        # Always hand the connection back, even if a GitHub call failed.
        session.close()
def get_github_repository_users(full_name: str):
    """Run a contributor scan for every collaborator of a repository.

    Args:
        full_name: Name of the repository as passed to GitHub's ``get_repo``.
    """
    github_repo = call_github_function(github.github, 'get_repo', [full_name])
    paginated = call_github_function(github_repo, 'get_collaborators')

    # Fetch every remaining page through the wrapped call before iterating.
    while paginated._couldGrow():
        call_github_function(paginated, '_grow')

    logins = []
    for collaborator in paginated:
        logins.append(collaborator.login)

    repository_manager = Manager('github_repository', [])
    contributor_manager = Manager('github_contributor', logins, repository_manager)
    contributor_manager.start()
    contributor_manager.run()
def get_user_data(user_data: tuple):
    """Get all missing data from a user.

    Currently this copies the GitHub ``location`` of the user onto the
    stored contributor.

    Args:
        user_data: Tuple whose first element exposes a ``login`` attribute.

    Returns:
        A dict with a ``message`` key. When an unexpected error occurred it
        additionally carries the formatted traceback under ``error``.
    """
    # Pre-bind everything the handlers and the ``finally`` clause touch, so a
    # failure early in the ``try`` body cannot end in an UnboundLocalError.
    session = None
    login = None
    try:
        contributor = user_data[0]
        login = contributor.login
        session = new_session()
        contributor = Contributor.get_contributor(login, session, True)

        user = call_github_function(github.github, 'get_user', [login])
        if user.location:
            contributor.location = user.location
        session.add(contributor)
        session.commit()
        response = {'message': f'Scanned user {login}'}
    except GithubException as e:
        # Forbidden or not found (Just made private or deleted)
        if e.status == 404:
            response = {'message': f'User {login} not found.'}
        else:
            # Any other GitHub error used to leave ``response`` unbound and
            # crashed on ``return``; report it like every other failure.
            sentry.captureException()
            response = {
                'message': f'Error while getting repos for {login}:\n',
                'error': traceback.format_exc(),
            }
    except BaseException:
        # Catch any exception and print it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        if session is not None:
            session.close()

    return response
def check_fork(github_repo, session, repository, scan_list, user_login=None):
    """Resolve the parent of a forked repository and queue it if necessary.

    Args:
        github_repo: GitHub repository object of the fork.
        session: Database session the repositories belong to.
        repository: Stored repository matching ``github_repo``.
        scan_list: Set of repository full names to scan; extended in place.
        user_login: Not used inside this function.
    """
    if repository.completely_scanned:
        # A repository that is already scanned and is no fork needs nothing.
        if not repository.fork:
            return
        # It is a known fork: its parent may still be due for a scan.
        if repository.parent.should_scan():
            scan_list.add(github_repo.parent.full_name)

    # Load the parent from GitHub to find out whether this is a valid fork.
    try:
        get_github_object(github_repo, 'parent')
        call_github_function(github_repo.parent, '_completeIfNeeded')
    except GithubException as e:
        # 451/404: the parent is not reachable, so don't treat this as a fork.
        if e.status in (451, 404):
            repository.fork = False
            return

    github_parent = github_repo.parent
    parent_repository = Repository.get_or_create(
        session,
        github_parent.ssh_url,
        name=github_parent.name,
        full_name=github_parent.full_name,
    )

    # Identical names: likely not a spite/hate fork.
    names_match = github_repo.parent.name == github_repo.name
    if names_match:
        # Only set the parent if the fork doesn't have one yet.
        if not repository.parent:
            repository.parent = parent_repository

        # Flag the repository as fork and queue the parent when it is due.
        repository.fork = True
        if parent_repository.should_scan():
            scan_list.add(parent_repository.full_name)

    session.add(repository)
def get_user_with_followers(name: str):
    """Scan a user together with all followers and followed users.

    After the scan finished, ``last_full_scan`` is updated for every scanned
    contributor that is not flagged as ``too_big``.

    Args:
        name: GitHub login of the user to start from.
    """
    user = call_github_function(github.github, 'get_user', [name])
    followers = call_github_function(user, 'get_followers')
    following = call_github_function(user, 'get_following')

    # Collect the user, the followers and the followed users, in that order.
    # Deduplicate by login as we have to make as few API calls as possible;
    # the set keeps the membership test constant-time.
    user_logins = []
    seen_logins = set()
    for group in ([user], followers, following):
        for github_user in group:
            if github_user.login in seen_logins:
                continue
            seen_logins.add(github_user.login)
            user_logins.append(github_user.login)

    sub_manager = Manager('github_repository', [])
    manager = Manager('github_contributor', user_logins, sub_manager)
    manager.start()
    manager.run()

    # Create the session before entering ``try`` so ``finally`` never touches
    # an unbound name when session creation itself fails.
    session = new_session()
    try:
        for login in user_logins:
            contributor = session.query(Contributor) \
                .filter(Contributor.login.ilike(login)) \
                .one()
            if not contributor.too_big:
                contributor.last_full_scan = datetime.utcnow()
                session.add(contributor)
        session.commit()
    finally:
        session.close()
def get_user(login: str):
    """Scan a single user by login name and mark the scan as done.

    Args:
        login: GitHub login of the user.
    """
    user = call_github_function(github.github, 'get_user', [login])

    sub_manager = Manager('github_repository', [])
    manager = Manager('github_contributor', [user.login], sub_manager)
    manager.start()
    manager.run()

    # Acquire the session outside of ``try``: if this call raises, there is
    # nothing to close and the original error is not masked by an
    # UnboundLocalError from the ``finally`` clause.
    session = db.get_session()
    try:
        contributor = session.query(Contributor) \
            .filter(Contributor.login.ilike(login)) \
            .one()
        contributor.last_full_scan = datetime.utcnow()
        session.add(contributor)
        session.commit()
    finally:
        session.close()
def get_github_author(self, email, git_commit, do_commit=True):
    """Attach the GitHub author of a commit to ``email`` as its contributor.

    Args:
        email: Email entity; read via ``contributor`` and ``unknown``,
            written via ``contributor``.
        git_commit: Commit whose ``hex`` is looked up on GitHub.
        do_commit: Forwarded to ``Contributor.get_contributor``.
    """
    # Without a GitHub repository there is nothing to look up.
    if not self.github_repo:
        return
    # The contributor is already known or the email is flagged as unknown.
    if email.contributor is not None or email.unknown:
        return

    github_commit = call_github_function(self.github_repo, 'get_commit',
                                         [git_commit.hex])
    author = github_commit.author
    if not author or author is NotSet:
        return

    # Workaround for issue https://github.com/PyGithub/PyGithub/issues/279
    if author._url.value is None:
        sentry.captureMessage('Author has no _url', level='info')
        return

    email.contributor = Contributor.get_contributor(
        author.login,
        self.session,
        do_commit=do_commit,
    )
def get_github_organization(name: str, members=False):
    """Scan the repositories of an organization, optionally with its members.

    Args:
        name: Name of the organization as passed to GitHub's
            ``get_organization``.
        members: If truthy, the logins of the organization members are
            handed to the contributor manager as well.
    """
    session = new_session()
    repos_to_scan = set()
    try:
        orga = call_github_function(github.github, 'get_organization', [name])

        # Get orga repos and load every page up front.
        orga_repos = call_github_function(orga, 'get_repos')
        while orga_repos._couldGrow():
            call_github_function(orga_repos, '_grow')

        # Check orga repos
        for github_repo in orga_repos:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )
            if github_repo.fork:
                check_fork(github_repo, session, repository, repos_to_scan)
            session.add(repository)

            if not repository.should_scan():
                continue

            session.commit()
            repos_to_scan.add(github_repo.full_name)

        # Persist repositories that were added but skipped by the scan check.
        session.commit()
    finally:
        # The session is not needed by the managers; release it right away.
        session.close()

    member_list = set()
    if members:
        # Get members. A separate name keeps the ``members`` flag intact.
        github_members = call_github_function(orga, 'get_members')
        while github_members._couldGrow():
            call_github_function(github_members, '_grow')
        member_list = {member.login for member in github_members}

    # Create and start manager with orga repos and member_list
    sub_manager = Manager('github_repository', repos_to_scan)
    manager = Manager('github_contributor', member_list, sub_manager)
    manager.start()
    manager.run()
def _prefetch_user_repos(paginated, kind, user_login, limit):
    """Load all pages of ``paginated``; return True once ``limit`` fetches are exceeded."""
    fetches = 0
    while paginated._couldGrow():
        fetches += 1
        call_github_function(paginated, '_grow')
        # Debug messages to see that the repositories are still collected.
        if fetches % 100 == 0:
            logger.info(f'{fetches} {kind} repos for user {user_login}.')
        # The user is too big. Stop fetching.
        if limit is not None and fetches > limit:
            return True
    return False


def _register_user_repo(github_repo, session, repos_to_scan, user_login):
    """Get or create the stored repository, resolve its fork state and add it to the session."""
    repository = Repository.get_or_create(
        session,
        github_repo.ssh_url,
        name=github_repo.name,
        full_name=github_repo.full_name,
    )
    if github_repo.fork and not repository.is_invalid():
        check_fork(github_repo, session, repository,
                   repos_to_scan, user_login)
    session.add(repository)
    return repository


def get_user_repos(user_login: str, skip=True):
    """Collect the owned and starred repositories of a user that need a scan.

    Args:
        user_login: GitHub login of the user.
        skip: If truthy, users exceeding the configured
            ``max_repositories_for_user`` are flagged as too big and skipped.

    Returns:
        On success a dict with ``message`` and ``tasks`` (full names of the
        repositories to scan). For skipped users the value of
        ``user_up_to_date_message`` or ``user_too_big_message``. On any error
        a dict with ``message`` and ``error`` (formatted traceback).
    """
    # Pre-bind so ``finally`` is safe even if ``new_session`` itself raises.
    session = None
    try:
        session = new_session()
        contributor = Contributor.get_contributor(user_login, session, True)
        # Checks for already scanned users.
        if not contributor.should_scan():
            return user_up_to_date_message(user_login)
        if contributor.too_big:
            return user_too_big_message(user_login)

        user = call_github_function(github.github, 'get_user', [user_login])
        owned = user.get_repos()
        starred = user.get_starred()
        repos_to_scan = set()

        # Parse the limit once; ``None`` disables the size check entirely.
        limit = None
        if skip:
            limit = int(config['aggregator']['max_repositories_for_user'])

        # Prefetch owned, then starred repositories. The starred ones are not
        # fetched anymore once the owned ones already exceeded the limit.
        user_too_big = (
            _prefetch_user_repos(owned, 'owned', user_login, limit)
            or _prefetch_user_repos(starred, 'starred', user_login, limit)
        )

        # User has too many repositories. Flag him and return
        if user_too_big:
            contributor.too_big = True
            sentry.captureMessage(
                'User too big',
                extra={'url': contributor.login},
                level='info',
                tags={
                    'type': 'too_big',
                    'entity': 'user'
                },
            )
            session.add(contributor)
            session.commit()
            return user_too_big_message(user_login)

        # Check own repositories. We assume that we are collaborating in those
        for github_repo in owned:
            repository = _register_user_repo(
                github_repo, session, repos_to_scan, user_login)
            if not repository.should_scan():
                continue
            session.commit()
            repos_to_scan.add(github_repo.full_name)

        # Check stars and if the user collaborated to them.
        for github_repo in starred:
            repository = _register_user_repo(
                github_repo, session, repos_to_scan, user_login)
            if not repository.should_scan():
                continue
            repos_to_scan.add(github_repo.full_name)

        session.commit()

        rate = github.github.get_rate_limit().core
        message = f'Got repositories for {user.login}. '
        message += f'{user.login}. {rate.remaining} of 5000 remaining.'
        response = {
            'message': message,
            'tasks': list(repos_to_scan),
        }
    except BaseException:
        # Catch any exception and print it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {user_login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        if session is not None:
            session.close()

    return response
def get_github_repository(full_name: str):
    """Clone and scan a single repository and store the result.

    Args:
        full_name: Name of the repository as passed to GitHub's ``get_repo``.

    Returns:
        A dict with a ``message`` key for scanned, skipped or blocked
        repositories, or the value of ``error_message`` on failure.
    """
    # Pre-bind everything the ``finally`` clause inspects. This replaces the
    # former ``locals()`` introspection and keeps cleanup safe when an early
    # statement of the ``try`` body fails.
    session = None
    owner = None
    github_repo = None
    try:
        session = new_session()
        # Sleep for a random time to avoid hitting the abuse detection.
        sleep(randrange(1, 15))

        github_repo = call_github_function(github.github, 'get_repo',
                                           [full_name], {'lazy': False})

        repository = Repository.get_or_create(
            session,
            github_repo.ssh_url,
            name=github_repo.name,
            full_name=github_repo.full_name,
        )

        if repository.broken:
            return {'message': f'Skip broken repo {github_repo.ssh_url}'}
        elif github_repo.size > int(config['aggregator']['max_repository_size']):
            repository.too_big = True
            session.add(repository)
            session.commit()
            sentry.captureMessage('Repo filesize too big', level='info',
                                  extra={'repo': repository.clone_url})
            return {'message': f'Repo too big (filesize): {github_repo.ssh_url}'}

        owner = get_github_object(github_repo, 'owner')
        git_repo = get_git_repository(
            github_repo.ssh_url,
            owner.login,
            github_repo.name,
        )
        scanner = CommitScanner(git_repo, session, github_repo)
        commit_count = scanner.scan_repository()

        breadcrumbs.record(
            data={'action': 'Commits scanned. Set repo metadata and debug output'},
            category='info',
        )

        # Load the repository again after the scan before touching it.
        repository = session.query(Repository).get(github_repo.ssh_url)
        rate = github.github.get_rate_limit().core
        reset_time = rate.reset.strftime("%H:%M")
        current_time = datetime.now().strftime('%H:%M')

        message = f'{current_time}: '
        message += f'Scanned {repository.clone_url} with {commit_count} commits.\n'
        message += f'{rate.remaining} of 5000 remaining. Reset at {reset_time}\n'
        response = {'message': message}

        repository.updated_at = datetime.now()
        session.add(repository)
        session.commit()
    except GithubException as e:
        # 451: Access denied. Repository probably gone private.
        # 404: User or repository just got deleted
        if e.status == 451 or e.status == 404:
            repository = session.query(Repository) \
                .filter(Repository.full_name == full_name) \
                .one_or_none()
            if repository:
                repository.broken = True
                session.add(repository)
                session.commit()
            response = {'message': 'Repository access blocked.'}
        # Catch any other GithubException
        else:
            sentry.captureException()
            response = error_message('Error in get_repository:\n')
    except (GitError, UnicodeDecodeError):
        response = error_message('Error in get_repository:\n')
    except BaseException:
        # Catch any exception and print it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = error_message('Error in get_repository:\n')
    finally:
        try:
            # Only remove the local clone if we got far enough to create it.
            if owner is not None and github_repo is not None:
                delete_git_repository(owner.login, github_repo.name)
        finally:
            # Close the session even if removing the clone failed.
            if session is not None:
                session.close()

    return response