def clean_db():
    """Remove commit data from forked repositories and delete orphaned commits."""
    logger.info("Removing commits from fork repos.")
    session = new_session()
    try:
        all_repositories = session.query(Repository) \
            .filter(Repository.fork.is_(True)) \
            .filter(Repository.commits != None) \
            .options(joinedload(Repository.commits)) \
            .all()

        logger.info(f'Found {len(all_repositories)}')
        repositories_count = 0
        for repository in all_repositories:
            repository.commits = []
            session.add(repository)
            repositories_count += 1
            if repositories_count % 100 == 0:
                logger.info(f'Removed {repositories_count}')
                session.commit()

        logger.info("Remove unattached commits")
        # Bulk delete of orphaned commits. The relationship-based filter
        # can't be evaluated in Python, so skip session synchronization.
        session.query(Commit) \
            .filter(Commit.repositories == None) \
            .delete(synchronize_session=False)
        session.commit()
    finally:
        session.close()
def complete_repos():
    """Complete unfinished repositories."""
    logger.info("Get unfinished or out of date repositories.")

    rescan_interval = int(config['aggregator']['repository_rescan_interval'])
    rescan_threshold = datetime.utcnow() - timedelta(seconds=rescan_interval)

    session = new_session()
    repos = session.query(Repository) \
        .filter(Repository.fork.is_(False)) \
        .filter(Repository.broken.is_(False)) \
        .filter(Repository.too_big.is_(False)) \
        .filter(or_(
            Repository.completely_scanned.is_(False),
            Repository.updated_at <= rescan_threshold,
        )) \
        .all()
    logger.info(f'Found {len(repos)}')

    full_names = [r.full_name for r in repos]
    repos_to_scan = set(full_names)

    manager = Manager('github_repository', repos_to_scan)
    manager.start()
    manager.run()
    session.close()
def get_user_data(user_data: tuple):
    """Get all missing data from a user."""
    login = None
    session = new_session()
    try:
        contributor = user_data[0]
        login = contributor.login
        contributor = Contributor.get_contributor(login, session, True)
        user = call_github_function(github.github, 'get_user', [login])
        if user.location:
            contributor.location = user.location
        session.add(contributor)
        session.commit()
        response = {'message': f'Scanned user {login}'}
    except GithubException as e:
        # Forbidden or not found (just made private or deleted).
        if e.status == 404:
            response = {'message': f'User {login} not found.'}
        else:
            sentry.captureException()
            response = {
                'message': f'Error while getting user {login}:\n',
                'error': traceback.format_exc(),
            }
    except BaseException:
        # Catch any exception and log it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        session.close()

    return response
def get_organization_memberships():
    """Refresh all user organizations."""
    session = new_session()
    try:
        # Compare naive UTC datetimes throughout; last_full_scan is stored
        # via utcnow(), so a timezone-aware `now` would raise a TypeError.
        now = datetime.utcnow()
        contributors = session.query(Contributor).all()
        for contributor in contributors:
            if contributor.last_full_scan and contributor.last_full_scan > now - timedelta(days=2):
                continue

            logger.info(
                f'Checking {contributor.login}. {github.github.rate_limiting[0]} remaining.'
            )
            github_user = call_github_function(github.github, 'get_user', [contributor.login])
            github_orgs = call_github_function(github_user, 'get_orgs')
            for org in github_orgs:
                organization = Organization.get_organization(org.login, org.url, session)
                contributor.organizations.append(organization)

            contributor.last_full_scan = datetime.utcnow()
            session.add(contributor)
            session.commit()
    finally:
        session.close()
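# A minimal, self-contained sketch of the pitfall handled above: comparing a
# timezone-aware datetime with the naive timestamps that utcnow() produces
# raises a TypeError, so the scan-interval check must stick to one kind.
def _naive_vs_aware_sketch():
    """Demonstrate why last_full_scan is compared against naive UTC times."""
    aware = datetime.now(pytz.timezone('Europe/Berlin'))  # offset-aware
    naive = datetime.utcnow()                             # offset-naive
    try:
        naive < aware
    except TypeError as error:
        # "can't compare offset-naive and offset-aware datetimes"
        print(error)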
def user(login):
    """Plot all graphs for a specific github user."""
    try:
        session = new_session()
        plot_user(login, session)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)
def user_for_repositories(login, repositories):
    """Get statistics of a user for specific repositories."""
    try:
        session = new_session()
        plot_employee(login, repositories, session)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)
def comparison(users, repos):
    """Get statistics of several users for specific repositories."""
    try:
        if not users or not repos:
            logger.info("Users and Repos are required parameters.")
            return
        session = new_session()
        plot_comparison(users, repos, session)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)
def get_punchcard_data(contributors_commits):
    """Compute and store punchcard data for a batch of contributors."""
    try:
        session = new_session()
        for contributor, commit_hashes in contributors_commits:
            # Query the contributor again with the current session.
            contributor = session.query(Contributor).get(contributor.login)

            result = contributor.analysis_result
            if result is None:
                result = AnalysisResult()
                contributor.analysis_result = result

            if result.intermediate_results is None:
                result.intermediate_results = {}

            commits_changed = (len(commit_hashes) != result.commit_count)
            if 'punchcard' not in result.intermediate_results or commits_changed:
                # Deepcopy the intermediate result, otherwise the jsonb won't refresh.
                new_intermediate = deepcopy(result.intermediate_results)

                commits = session.query(Commit) \
                    .filter(Commit.sha.in_(commit_hashes)) \
                    .all()

                # Compute the final punchcard evaluation.
                plotter = CommitPunchcard(commits, '/', '')
                plotter.preprocess()

                # Standardize the data by dividing each count by the mean.
                df = plotter.data
                mean = df['count'].mean()
                df['count'] = df['count'] / mean

                # Save the standardized intermediate result into the database.
                vector = df['count'].values.tolist()
                new_intermediate['punchcard'] = vector
                result.intermediate_results = new_intermediate
                result.last_change = datetime.now()
                result.commit_count = len(commits)

                session.add(result)
                session.add(contributor)
                session.commit()
    finally:
        session.close()

    return {'message': 'Success'}
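# A minimal sketch of the standardization step above: dividing each punchcard
# bucket by the mean count turns absolute commit numbers into a scale-free
# activity profile, so contributors with very different total activity become
# comparable. (Toy data; the real vectors come from CommitPunchcard.)
def _standardization_sketch():
    """Show the mean-normalization on a tiny toy punchcard."""
    import pandas as pd

    df = pd.DataFrame({'count': [4, 0, 8, 2, 6]})
    df['count'] = df['count'] / df['count'].mean()
    # The mean is 4, so the result is [1.0, 0.0, 2.0, 0.5, 1.5]
    return df['count'].values.tolist()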
def update_contributors(update_all):
    """Complete contributors."""
    session = new_session()
    logger.info('Start Scan.')

    # Look at the last two years.
    time_span = datetime.now() - timedelta(days=2 * 365)
    results = session.query(Contributor, func.array_agg(Commit.sha)) \
        .join(Email, Contributor.login == Email.contributor_login) \
        .join(Commit, or_(
            Commit.author_email_address == Email.email,
            Commit.committer_email_address == Email.email,
        )) \
        .filter(Commit.commit_time >= time_span) \
        .filter(Contributor.location.is_(None)) \
        .group_by(Contributor.login) \
        .all()

    logger.info(f'Scanning {len(results)} contributors.')

    if update_all:
        contributors_to_scan = results
        logger.info(f'Scanning {len(contributors_to_scan)} contributors')
    else:
        count = 0
        contributors_to_scan = []
        for contributor, commits in results:
            if len(commits) > 100 and len(commits) < 20000:
                contributors_to_scan.append((contributor, commits))
            count += 1
            if count % 5000 == 0:
                logger.info(
                    f'Found {count} contributors ({len(contributors_to_scan)} big)'
                )

    manager = ListManager('github_user', contributors_to_scan)
    manager.start()
    manager.run()
def get_github_organization(name: str, members=False):
    """Get all collaborators of an organization."""
    session = new_session()
    orga = call_github_function(github.github, 'get_organization', [name])

    # Get all organization repositories.
    orga_repos = call_github_function(orga, 'get_repos')
    while orga_repos._couldGrow():
        call_github_function(orga_repos, '_grow')

    # Check which organization repositories need scanning.
    repos_to_scan = set()
    for github_repo in orga_repos:
        repository = Repository.get_or_create(
            session,
            github_repo.ssh_url,
            name=github_repo.name,
            full_name=github_repo.full_name,
        )
        if github_repo.fork:
            check_fork(github_repo, session, repository, repos_to_scan)
        session.add(repository)

        if not repository.should_scan():
            continue

        session.commit()
        repos_to_scan.add(github_repo.full_name)

    member_list = set()
    if members:
        # Get all organization members.
        members = call_github_function(orga, 'get_members')
        while members._couldGrow():
            call_github_function(members, '_grow')
        member_list = set([m.login for m in members])

    session.close()

    # Create and start the manager with the orga repos and the member list.
    sub_manager = Manager('github_repository', repos_to_scan)
    manager = Manager('github_contributor', member_list, sub_manager)
    manager.start()
    manager.run()
def get_user_with_followers(name: str):
    """Get all relevant information about all friends of a specific user."""
    user = call_github_function(github.github, 'get_user', [name])
    followers = call_github_function(user, 'get_followers')
    following = call_github_function(user, 'get_following')

    # Gather all following and followed people into one list.
    # Deduplicate it, as we have to make as few API calls as possible.
    user_list = [user]
    for follower in followers:
        user_list.append(follower)

    for followed in following:
        if not any(u.login == followed.login for u in user_list):
            user_list.append(followed)

    user_logins = [u.login for u in user_list]

    sub_manager = Manager('github_repository', [])
    manager = Manager('github_contributor', user_logins, sub_manager)
    manager.start()
    manager.run()

    try:
        session = new_session()
        for login in user_logins:
            contributor = session.query(Contributor) \
                .filter(Contributor.login.ilike(login)) \
                .one()
            if not contributor.too_big:
                contributor.last_full_scan = datetime.utcnow()
                session.add(contributor)
        session.commit()
    finally:
        session.close()
def get_user_repos(user_login: str, skip=True):
    """Get all relevant information for a single user."""
    session = new_session()
    try:
        contributor = Contributor.get_contributor(user_login, session, True)

        # Check for an already scanned user.
        if not contributor.should_scan():
            return user_up_to_date_message(user_login)
        if contributor.too_big:
            return user_too_big_message(user_login)

        user = call_github_function(github.github, 'get_user', [user_login])
        owned = user.get_repos()
        starred = user.get_starred()
        repos_to_scan = set()

        # Prefetch all owned repositories.
        # Note: each _grow() call fetches one page of results, so these
        # counters track pages, not individual repositories.
        user_too_big = False
        owned_repos = 0
        while owned._couldGrow() and not user_too_big:
            owned_repos += 1
            call_github_function(owned, '_grow')

            # Debug messages to see that the repositories are still being collected.
            if owned_repos % 100 == 0:
                logger.info(f'{owned_repos} owned repos for user {user_login}.')

            # The user is too big. Just drop him.
            if skip and owned_repos > int(config['aggregator']['max_repositories_for_user']):
                user_too_big = True

        # Prefetch all starred repositories.
        starred_repos = 0
        while starred._couldGrow() and not user_too_big:
            starred_repos += 1
            call_github_function(starred, '_grow')

            # Debug messages to see that the repositories are still being collected.
            if starred_repos % 100 == 0:
                logger.info(f'{starred_repos} starred repos for user {user_login}.')

            # The user is too big. Just drop him.
            if skip and starred_repos > int(config['aggregator']['max_repositories_for_user']):
                user_too_big = True

        # The user has too many repositories. Flag him and return.
        if user_too_big:
            contributor.too_big = True
            sentry.captureMessage(
                'User too big',
                extra={'url': contributor.login},
                level='info',
                tags={'type': 'too_big', 'entity': 'user'},
            )
            session.add(contributor)
            session.commit()
            return user_too_big_message(user_login)

        # Check the owned repositories. We assume that the user collaborates in those.
        for github_repo in owned:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )
            if github_repo.fork and not repository.is_invalid():
                check_fork(github_repo, session, repository, repos_to_scan, user_login)
            session.add(repository)

            if not repository.should_scan():
                continue

            session.commit()
            repos_to_scan.add(github_repo.full_name)

        # Check the starred repositories and whether the user collaborated in them.
        for github_repo in starred:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )
            if github_repo.fork and not repository.is_invalid():
                check_fork(github_repo, session, repository, repos_to_scan, user_login)
            session.add(repository)

            if not repository.should_scan():
                continue

            repos_to_scan.add(github_repo.full_name)

        session.commit()

        rate = github.github.get_rate_limit().core
        message = f'Got repositories for {user.login}. '
        message += f'{rate.remaining} of 5000 remaining.'
        response = {
            'message': message,
            'tasks': list(repos_to_scan),
        }
    except BaseException:
        # Catch any exception and log it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {user_login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        session.close()

    return response
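# The _couldGrow()/_grow() loops above drive PyGithub's private pagination
# API page by page. A sketch of the same "user too big" guard using only the
# public API; totalCount resolves with a single extra request. The token and
# limit below are placeholders, not values from this project.
def _repo_count_guard_sketch():
    """Check a user's repository count before paginating everything."""
    from github import Github

    gh = Github('<token>')  # hypothetical token
    user = gh.get_user('octocat')
    max_repos = 1000        # stand-in for max_repositories_for_user

    return user.get_repos().totalCount > max_repos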
def analyse_punch_card(existing, method, eps=150, min_samples=5):
    """Cluster contributors by their commit punchcards."""
    session = new_session()
    logger.info('Start Scan.')

    # If the `existing` parameter is given, we only work with
    # the existing intermediate AnalysisResults.
    if not existing:
        # Only look at commits of the last year.
        time_span = datetime.now() - timedelta(days=365)
        results = session.query(Contributor, func.array_agg(Commit.sha)) \
            .join(Email, Contributor.login == Email.contributor_login) \
            .join(Commit, or_(
                Commit.author_email_address == Email.email,
                Commit.committer_email_address == Email.email,
            )) \
            .filter(Commit.commit_time >= time_span) \
            .group_by(Contributor.login) \
            .all()

        logger.info(f'Scanning {len(results)} contributors.')

        count = 0
        big_contributors = []
        for contributor, commits in results:
            if len(commits) > 100 and len(commits) < 20000:
                big_contributors.append((contributor, commits))
            count += 1
            if count % 50000 == 0:
                logger.info(
                    f'Scanned {count} contributors ({len(big_contributors)} big)'
                )

        # Finished searching for contributors with enough commits.
        logger.info(f'Analysing {len(big_contributors)} contributors.')

        # Chunk the contributor list into chunks of 100.
        chunks = create_chunks(big_contributors, 100)

        manager = ListManager('analyse_punchcard', chunks)
        manager.start()
        manager.run()

    # Only look at intermediate results with a reasonable commit count.
    analysis_results = session.query(AnalysisResult) \
        .filter(AnalysisResult.intermediate_results != None) \
        .filter(AnalysisResult.commit_count > 100) \
        .filter(AnalysisResult.commit_count < 20000) \
        .options(joinedload('contributor')) \
        .all()

    if existing:
        logger.info(f'Analysing {len(analysis_results)} results.')
    logger.info(f'Using {method} clustering')

    vectorized_data = []
    contributors = []
    for result in analysis_results:
        if 'punchcard' in result.intermediate_results:
            vectorized_data.append(result.intermediate_results['punchcard'])
            contributors.append(result.contributor)

    # Cluster using the DBSCAN algorithm.
    if method == 'dbscan':
        metric = 'l1'
        cluster_result = DBSCAN(
            eps=eps,
            min_samples=min_samples,
            metric=metric,
            n_jobs=-1,
        ).fit(vectorized_data)
        core_samples_mask = np.zeros_like(cluster_result.labels_, dtype=bool)
        core_samples_mask[cluster_result.core_sample_indices_] = True

    # Cluster using the Mean-Shift algorithm.
    elif method == 'mean-shift':
        quantile = 0.1
        n_samples = -1
        logger.info('Computing bandwidth.')
        bandwidth = estimate_bandwidth(
            vectorized_data,
            quantile=quantile,
            n_samples=n_samples,
            n_jobs=-1,
        )
        logger.info('Bandwidth computed.')
        cluster_result = MeanShift(
            bandwidth=bandwidth,
            bin_seeding=True,
            n_jobs=-1,
        ).fit(vectorized_data)

    # Cluster using the Affinity Propagation algorithm.
    elif method == 'affinity':
        preference = None
        cluster_result = AffinityPropagation(preference=preference) \
            .fit(vectorized_data)

    # Count the number of entities per label.
    labels = cluster_result.labels_
    unique, counts = np.unique(labels, return_counts=True)
    occurrences = dict(zip(unique, counts))

    contributor_by_label = {}
    for index, label in enumerate(labels):
        if contributor_by_label.get(label) is None:
            contributor_by_label[label] = []
        contributor_by_label[label].append(contributors[index].login)

    # Prepare the plot dir for prototype plotting.
    plot_dir = config['plotting']['plot_dir']
    plot_dir = os.path.join(plot_dir, 'analysis', 'analyse_punch', method)
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    logger.info(f'Found {len(occurrences)}')

    # Get the mean-center prototypes for each label and plot them.
    prototypes = get_mean_center_prototypes(
        cluster_result,
        vectorized_data,
        min_samples,
    )

    logger.info(f'Found {len(prototypes)} valid clusters')

    for label, prototype in prototypes.items():
        if method == 'dbscan':
            name = f'{metric}-{min_samples}-{eps}-{label}'
        else:
            name = f'{label}'
        path = os.path.join(plot_dir, name)
        title = f'Prototype for {name} with {occurrences[label]} elements'
        plotter = CommitPunchcard([], path, title)
        plotter.preprocess()
        plotter.data['count'] = np.array(prototype) * 5
        plotter.plot()

    if method == 'dbscan':
        logger.info(f'DBSCAN with EPS: {eps} and {min_samples} min samples.')
    logger.info('Amount of entities in clusters. -1 is an outlier:')
    logger.info(pformat(occurrences))
    logger.info(pformat(contributor_by_label))
    logger.info(f'{len(analysis_results)} contributors are relevant.')
    if method == 'dbscan':
        core_samples = cluster_result.core_sample_indices_
        logger.info(f'Core samples: {len(core_samples)}')

    return
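# A minimal, runnable sketch of the DBSCAN call used above, on toy punchcard
# vectors (the real ones have one bucket per weekday/hour pair). With the l1
# metric and a small eps, the three similar profiles form one cluster and the
# outlier gets the label -1.
def _dbscan_sketch():
    """Cluster four toy vectors and return their labels and core samples."""
    data = np.array([
        [1.0, 0.0, 2.0],
        [1.1, 0.0, 1.9],
        [0.9, 0.1, 2.1],
        [5.0, 5.0, 5.0],
    ])
    result = DBSCAN(eps=0.5, min_samples=2, metric='l1').fit(data)
    # result.labels_ -> [0, 0, 0, -1]; result.core_sample_indices_ -> [0, 1, 2]
    return result.labels_, result.core_sample_indices_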
def get_github_repository(full_name: str):
    """Get all information from a single repository."""
    session = new_session()
    try:
        # Sleep for a random time to avoid hitting the abuse detection.
        sleeptime = randrange(1, 15)
        sleep(sleeptime)

        github_repo = call_github_function(github.github, 'get_repo',
                                           [full_name], {'lazy': False})

        repository = Repository.get_or_create(
            session,
            github_repo.ssh_url,
            name=github_repo.name,
            full_name=github_repo.full_name,
        )

        if repository.broken:
            return {'message': f'Skip broken repo {github_repo.ssh_url}'}
        elif github_repo.size > int(config['aggregator']['max_repository_size']):
            repository.too_big = True
            session.add(repository)
            session.commit()
            sentry.captureMessage(
                'Repo filesize too big',
                level='info',
                extra={'repo': repository.clone_url},
            )
            return {'message': f'Repo too big (filesize): {github_repo.ssh_url}'}

        owner = get_github_object(github_repo, 'owner')
        git_repo = get_git_repository(
            github_repo.ssh_url,
            owner.login,
            github_repo.name,
        )
        scanner = CommitScanner(git_repo, session, github_repo)
        commit_count = scanner.scan_repository()

        breadcrumbs.record(
            data={'action': 'Commits scanned. Set repo metadata and debug output'},
            category='info',
        )

        repository = session.query(Repository).get(github_repo.ssh_url)

        rate = github.github.get_rate_limit().core
        reset_time = rate.reset.strftime('%H:%M')
        current_time = datetime.now().strftime('%H:%M')
        message = f'{current_time}: '
        message += f'Scanned {repository.clone_url} with {commit_count} commits.\n'
        message += f'{rate.remaining} of 5000 remaining. Reset at {reset_time}\n'
        response = {'message': message}

        repository.updated_at = datetime.now()
        session.add(repository)
        session.commit()
    except GithubException as e:
        # 451: Access denied. The repository probably went private.
        # 404: The user or repository just got deleted.
        if e.status == 451 or e.status == 404:
            repository = session.query(Repository) \
                .filter(Repository.full_name == full_name) \
                .one_or_none()
            if repository:
                repository.broken = True
                session.add(repository)
                session.commit()
            response = {'message': 'Repository access blocked.'}
        # Catch any other GithubException.
        else:
            sentry.captureException()
            response = error_message('Error in get_repository:\n')
    except (GitError, UnicodeDecodeError):
        response = error_message('Error in get_repository:\n')
    except BaseException:
        # Catch any exception and log it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = error_message('Error in get_repository:\n')
    finally:
        if 'owner' in locals() and 'github_repo' in locals():
            delete_git_repository(owner.login, github_repo.name)
        session.close()

    return response
def analyse_travel_path(existing):
    """Analyze the travel paths detected for contributors."""
    session = new_session()
    logger.info('Start Scan.')

    # Look at the last two years.
    time_span = datetime.now() - timedelta(days=2 * 365)

    if not existing:
        results = session.query(Contributor, func.array_agg(Commit.sha)) \
            .join(Email, Contributor.login == Email.contributor_login) \
            .join(Commit, or_(
                Commit.author_email_address == Email.email,
                Commit.committer_email_address == Email.email,
            )) \
            .filter(Commit.commit_time >= time_span) \
            .group_by(Contributor.login) \
            .all()

        logger.info(f'Scanning {len(results)} contributors.')

        count = 0
        big_contributors = []
        for contributor, commits in results:
            if len(commits) > 100 and len(commits) < 20000:
                big_contributors.append((contributor, commits))
            count += 1
            if count % 5000 == 0:
                logger.info(
                    f'Scanned {count} contributors ({len(big_contributors)} big)'
                )

        # Finished searching for contributors with enough commits.
        logger.info(f'Analysing {len(big_contributors)} contributors.')

        # Chunk the contributor list into chunks of 100.
        chunks = create_chunks(big_contributors, 100)

        manager = ListManager('analyse_travel_path', chunks)
        manager.start()
        manager.run()

    # Only look at results with a reasonable commit count.
    results = session.query(AnalysisResult) \
        .filter(AnalysisResult.timezone_switches != None) \
        .filter(and_(
            AnalysisResult.commit_count != None,
            AnalysisResult.commit_count > 100,
            AnalysisResult.commit_count < 20000,
        )) \
        .options(joinedload('contributor')) \
        .all()

    changed = 0
    unchanged = 0
    distribution = {}
    for result in results:
        amount = result.timezone_switches
        if amount > 1:
            changed += 1
        else:
            unchanged += 1

        if distribution.get(amount) is None:
            distribution[amount] = 1
        else:
            distribution[amount] += 1

    # Timezones that are too ambiguous or unspecific to be useful.
    ignored_timezones = set([
        'GB', 'WET', 'MET', 'CET', 'EET', 'NZ', 'MST7MDT', 'PST8PDT',
        'CST6CDT', 'W-SU', 'ROK', 'NZ-CHAT', 'GB-Eire', 'ROC', 'EST5EDT',
        'PRC',
    ])
    for i in range(0, 16):
        ignored_timezones.add(f'GMT-{i}')
        ignored_timezones.add(f'GMT+{i}')

    correct = 0
    considered_contributors = 0
    survey_results = {}
    detected_timezones = {}
    for result in results:
        contributor = result.contributor
        home = set(result.intermediate_results['home']['set'])
        if 'full_set' in result.intermediate_results['home']:
            full_set = set(result.intermediate_results['home']['full_set'])
        else:
            full_set = set()

        if result.different_timezones is not None:
            if result.different_timezones not in detected_timezones:
                detected_timezones[result.different_timezones] = 0
            detected_timezones[result.different_timezones] += 1

        if contributor.location is None:
            continue

        for item in timezone_evaluations:
            if element_in_string(contributor.location, item['search']):
                survey_string = ', '.join(item['search'])
                if survey_string not in survey_results:
                    survey_results[survey_string] = {}
                    survey_results[survey_string]['set'] = set(home)
                    survey_results[survey_string]['amount'] = 0
                    survey_results[survey_string]['correct'] = 0
                    survey_results[survey_string]['timezone_amount'] = 0
                    survey_results[survey_string]['match'] = item['timezone']
                    survey_results[survey_string]['full_set'] = full_set

                survey_results[survey_string]['set'] = \
                    survey_results[survey_string]['set'] | home
                survey_results[survey_string]['amount'] += 1
                survey_results[survey_string]['timezone_amount'] += \
                    len(home - ignored_timezones)
                survey_results[survey_string]['ratio'] = \
                    survey_results[survey_string]['timezone_amount'] / \
                    survey_results[survey_string]['amount']
                considered_contributors += 1

                if 'full_set' in item:
                    survey_results[survey_string]['full_set'] = \
                        survey_results[survey_string]['full_set'] | full_set

                if item['timezone'] in home:
                    correct += 1
                    survey_results[survey_string]['correct'] += 1
                break

    logger.info(f'Looked at {len(results)} contributors.')
    logger.info(f'{len(results)} are relevant.')
    logger.info(f'Detected a change in {changed} of those.')
    logger.info(f'Detected no change in {unchanged} of those.')
    logger.info('Distribution of users by amount of different timezones:')
    logger.info(pformat(distribution))
    logger.info('Distribution of users by amount of detected timezones:')
    logger.info(pformat(detected_timezones))
    logger.info(
        f'Verified contributors {correct} of {considered_contributors}: '
        f'{correct/considered_contributors}'
    )

    print(
        'Strings query;Considered contributors;Expected timezone;'
        'Home location in subset;Mean size of subset;Max size of subset'
    )
    for key, result in survey_results.items():
        print(
            f"{key};{result['amount']};{result['match']};{result['correct']};"
            f"{result['ratio']:.2f};{len(result['full_set'])}"
        )

    return
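# create_chunks is used above to split the contributor list into batches for
# the ListManager. A minimal sketch of what such a helper presumably does
# (assumption: plain fixed-size slicing):
def _create_chunks_sketch(items, size):
    """Split `items` into consecutive lists of at most `size` elements.

    _create_chunks_sketch(list(range(7)), 3) -> [[0, 1, 2], [3, 4, 5], [6]]
    """
    return [items[i:i + size] for i in range(0, len(items), size)]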
def analyse_contributer_travel_path(contributors_commits):
    """Analyse the travel path of a few contributors."""
    try:
        session = new_session()
        count = 0
        for contributor, commit_hashes in contributors_commits:
            # Query the contributor again with the current session.
            contributor = session.query(Contributor).get(contributor.login)

            result = contributor.analysis_result
            if result is None:
                result = AnalysisResult()
                contributor.analysis_result = result
                session.add(contributor)
                session.add(result)

            commits_changed = (len(commit_hashes) != result.commit_count)

            # Look at the jsonb intermediate_results to see if we already wrote the data into it.
            json_results = result.intermediate_results
            if json_results is None:
                json_results = {}
                result.intermediate_results = json_results

            if result.different_timezones is None \
                    or result.timezone_switches is None \
                    or commits_changed \
                    or json_results.get('travel') is None \
                    or json_results.get('home') is None:
                commits = session.query(Commit) \
                    .filter(Commit.sha.in_(commit_hashes)) \
                    .all()

                plotter = TravelPath(commits, '/')
                plotter.preprocess()

                # Deepcopy the intermediate result, otherwise the jsonb won't refresh.
                json_results = deepcopy(result.intermediate_results)
                for timezone_set in plotter.data:
                    del timezone_set['start']
                    del timezone_set['end']
                    timezone_set['set'] = list(timezone_set['set'])
                    timezone_set['full_set'] = list(timezone_set['full_set'])
                json_results['home'] = plotter.home_zone
                json_results['travel'] = plotter.data
                result.intermediate_results = json_results

                result.timezone_switches = len(plotter.data)
                result.different_timezones = plotter.different_timezones
                result.last_change = datetime.now()
                result.commit_count = len(commits)
                session.add(result)

            count += 1
            if count % 50 == 0:
                session.commit()

        session.commit()
    finally:
        session.close()

    return {'message': 'Success'}
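# Why the deepcopy dance above is needed: SQLAlchemy only detects a change to
# a JSON/JSONB column when the attribute is bound to a *new* object. A
# minimal, runnable sketch (in-memory SQLite, plain JSON instead of JSONB)
# of the in-place alternative, flag_modified:
def _jsonb_refresh_sketch():
    """Persist an in-place mutation of a JSON column via flag_modified."""
    from sqlalchemy import Column, Integer, JSON, create_engine
    from sqlalchemy.orm import Session, declarative_base
    from sqlalchemy.orm.attributes import flag_modified

    Base = declarative_base()

    class Result(Base):
        __tablename__ = 'results'
        id = Column(Integer, primary_key=True)
        data = Column(JSON, default=dict)

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        result = Result(data={})
        session.add(result)
        session.commit()

        result.data['home'] = 'Europe/Berlin'  # in-place: not detected
        flag_modified(result, 'data')          # mark the attribute dirty
        session.commit()

        session.refresh(result)
        return result.data                     # {'home': 'Europe/Berlin'}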