def get_records(repo_name):
    """Get the list of commit records for a repo.

    Fetches the commit data with get_commits(), checks the first entry with
    validate_response_found(), and converts every entry with get_record(),
    passing along the current UTC time and the value returned by
    curr_commit_master().

    Args:
        repo_name: Name of the repo to query.

    Returns:
        A list with one record per entry in the response, or None when the
        response is empty, cannot be indexed, or fails validation.
    """
    data = get_commits(repo_name, gh_username, gh_oauth_key)
    try:
        validate_response_found(data[0])
    except (ValueError, KeyError, IndexError):
        # IndexError: the response was an empty list. Treat it like the other
        # "nothing usable" cases instead of aborting the whole run.
        return None
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [get_record(dct, repo_name, curr_time, curr_commit) for dct in data]


print("%s\tGetting commit info from GitHub API and pushing to BigQuery table" % curr_time_utc())
num_done = 0
num_repos = len(repos)
for repo_name in repos:
    records = get_records(repo_name)
    num_done = num_done + 1
    if records is not None:
        print("%s\tPushing %s commit records for repo %s/%s: %s"
              % (curr_time_utc(), len(records), num_done, num_repos, repo_name))
        push_bq_records(client, dataset, table, records)
    else:
        # Nothing usable came back for this repo; log it and move on
        print("%s\tPushing 0 commit records for repo %s/%s: %s"
              % (curr_time_utc(), num_done, num_repos, repo_name))
# Fragment of a per-file processing loop; the code before and after this span is not visible here.
# Initializes the batch lists, then for each `rec` from `reader`: prints a progress summary when
# num_done % 1000 == 0, pushes the buffered records to BigQuery and clears the buffers when
# num_done % 10 == 0, increments num_done, and reads the record's repo name, file name and path.
# NOTE(review): the flattened layout hides whether the `recs_to_add_sc` push and the clear() calls
# are nested under the `len(recs_to_add_loc) > 0` check -- confirm against the original indentation.
recs_to_add_sc = [] skipped_sha = [] for rec in reader: if num_done % 1000 == 0: print( 'Finished %s files. Got results for %s. Skipped %s already done, %s previously skipped, %s with empty content, %s with invalid file extension, and %s with no CLOC result.' % (num_done, num_success, num_skipped_already_done, num_skipped_skipped, num_skipped_empty_content, num_skipped_file_extension, num_skipped_no_result)) # Push batch of records if num_done % 10 == 0: if len(recs_to_add_loc) > 0: push_bq_records(bq_client, out_ds, table_loc_ungrouped, recs_to_add_loc) push_bq_records(bq_client, out_ds, table_sc_ungrouped, recs_to_add_sc) if len(skipped_sha) > 0: push_bq_records(bq_client, out_ds, table_skip, [{ 'sha': sha } for sha in skipped_sha]) recs_to_add_loc.clear() recs_to_add_sc.clear() skipped_sha.clear() num_done = num_done + 1 repo = rec["repo_name"] filename = rec["file_name"] path = rec["path"]
# Fragment, truncated at the end: the body of the last `except RuntimeError:` is outside this span.
# For each file info record whose (repo_name, path, sha) is not in `existing_contents`, appends the
# result of get_contents_record() to the batch. When num_done % 100 == 0 the whole batch is pushed
# to BigQuery; if that raises RuntimeError, the records are pushed one at a time with a 2.1 second
# sleep before each attempt.
for record in file_info_records: # Skip if already done if (record["repo_name"], record["path"], record["sha"]) in existing_contents: num_skipped_already_done = num_skipped_already_done + 1 continue recs_to_push.append(get_contents_record(record)) num_done = num_done + 1 if num_done % 100 == 0: print("%s\tFinished %s/%s records. Pushing %s records to BigQuery." % (curr_time_utc(), num_done, num_to_do, len(recs_to_push))) try: # Push the entire batch push_bq_records(client, dataset, table_contents, recs_to_push, print_failed_records=False) except RuntimeError: # Try records individually print( "Batch push failed. Trying records individually every 2 seconds due to BigQuery rate limit." ) for rec in recs_to_push: sleep(2.1) try: push_bq_records(client, dataset, table_contents, [rec], print_failed_records=False) except RuntimeError:
# Look up the initial commit for every file info record that is not already
# in the table, and push the results to BigQuery in batches.
print("%s\tGetting file initial commit times from GitHub API and pushing to table" % curr_time_utc())
num_done = 0
num_skipped_already_done = 0
num_to_do = len(file_info_records) - num_already_done
recs_to_push = []
for record in file_info_records:
    # Skip if already done
    if (record["repo_name"], record["path"], record["sha"]) in existing_records:
        num_skipped_already_done += 1
        continue
    # A failed lookup only drops this record; it still counts as processed below
    try:
        recs_to_push.append(get_init_commit(record))
    except ValueError as e:
        print("Caught ValueError; skipping repo %s and path %s. Error:\n%s"
              % (record["repo_name"], record["path"], e))
    except pycurl.error as e:
        print("Caught pycurl.error; skipping repo %s and path %s. Error:\n%s"
              % (record["repo_name"], record["path"], e))
    num_done += 1
    if num_done % 100 == 0:
        print("%s\tFinished %s/%s records. Pushing %s records to BigQuery."
              % (curr_time_utc(), num_done, num_to_do, len(recs_to_push)))
        # The batch can be empty if every lookup since the last push failed
        if recs_to_push:
            push_bq_records(client, dataset, table_init_commit, recs_to_push,
                            print_failed_records=True)
            recs_to_push.clear()

# Final batch
print("%s\tFinished %s/%s records. Pushing %s records to BigQuery."
      % (curr_time_utc(), num_done, num_to_do, len(recs_to_push)))
if recs_to_push:
    push_bq_records(client, dataset, table_init_commit, recs_to_push,
                    print_failed_records=True)
# Create the destination table before any rows are written
create_bq_table(client, bq_ds, bq_tb, schema)

# Expand each article record into one row per GitHub repo name found in it,
# pushing the accumulated rows to the BQ table every 100 articles
print('\nExtracting GitHub repo names from articles...')
num_done = 0
num_found = 0
recs_to_push = []
for record in records:
    metadata = parse_record(record)
    repos = gh_repos_from_metadata(metadata)
    # No repo info for this article means there is nothing to expand
    repo_names = [] if repos is None else repos['repos']
    num_found = num_found + len(repo_names)
    for name in repo_names:
        row = metadata.copy()
        row['repo_name'] = name
        row['repo_source'] = repos['source']
        recs_to_push.append(row)
    num_done = num_done + 1
    if num_done % 100 != 0:
        continue
    print("Analyzed %s papers. Found %s valid repo names." % (num_done, num_found))
    if len(recs_to_push) > 0:
        push_bq_records(client, bq_ds, bq_tb, recs_to_push)
        recs_to_push.clear()

# Whatever is left over after the last full batch
if len(recs_to_push) > 0:
    push_bq_records(client, bq_ds, bq_tb, recs_to_push)

print("\n\nAll done.")
# Fragment: begins inside a dict literal whose opening, and the enclosing definition, are outside
# this span. The visible tail maps repo fields to accessor calls on `r`, plus `curr_commit` and
# `curr_time`.
# The script that follows calls get_record() for each repo, skips repos that raise
# UnicodeEncodeError, pushes the accumulated records when num_done % 100 == 0, and pushes once
# more after the loop.
# NOTE(review): the final push_bq_records call runs even when `records` is empty -- confirm that
# push_bq_records tolerates an empty list.
'api_url': r.get_gh_api_url(), 'html_url': r.get_html_url(), 'description': r.get_description(), 'is_fork': r.is_fork(), 'stargazers_count': r.get_stargazers_count(), 'watchers_count': r.get_watchers_count(), 'forks_count': r.get_forks_count(), 'open_issues_count': r.get_open_issues_count(), 'subscribers_count': r.get_subscribers_count(), 'curr_commit_master': curr_commit, 'time_accessed': curr_time} print("Getting repo info from GitHub API") records = [] num_done = 0 for repo_name in repos: try: records.append(get_record(repo_name)) except UnicodeEncodeError: print("Skipping repo %s" % repo_name) num_done = num_done + 1 if num_done % 100 == 0: print("Finished %s repos. Pushing records." % num_done) push_bq_records(client, dataset, table, records) records.clear() push_bq_records(client, dataset, table, records) # Last batch
# Fragment: begins inside a dict literal whose opening, and the enclosing definition, are outside
# this span. The visible tail copies fields from `pr_data` and adds `curr_commit` and `curr_time`.
# The script that follows builds one record per pull request returned by get_pull_requests() for
# each repo and pushes them to BigQuery with max_batch = 10.
# NOTE(review): `records` comes from a list comprehension, so it is never None and the
# "Pushing 0 pull request records" else branch cannot run.
# NOTE(review): the KeyError handler reads `pr`, which is only the comprehension variable here;
# under Python 3 it is not bound outside the comprehension, so the handler would raise NameError
# unless `pr` is defined elsewhere in the file -- confirm.
'html_url': pr_data['html_url'], 'title': pr_data['title'], 'body': pr_data['body'], 'user_login': pr_data['user']['login'], 'user_id': pr_data['user']['id'], 'curr_commit_master': curr_commit, 'time_accessed': curr_time} print("Getting pull request info from GitHub API") num_done = 0 num_repos = len(repos) for repo_name in repos: num_done = num_done + 1 try: records = [get_record(repo_name, pr) for pr in get_pull_requests(repo_name, gh_username, gh_oauth_key, "all")] if records is not None: print("%s\tPushing %s pull request records for repo %s/%s: %s" % (curr_time_utc(), len(records), num_done, num_repos, repo_name)) push_bq_records(client = client, dataset = dataset, table = table, records = records, max_batch = 10) else: print("%s\tPushing 0 pull request records for repo %s/%s: %s" % (curr_time_utc(), num_done, num_repos, repo_name)) except KeyError: print("Skipping repo %s: %s" % (repo_name, pr['message'])) except UnicodeEncodeError as e: print("Skipping repo %s: %s" % (repo_name, str(e)))
# Fragment: begins inside a function body whose `def` line is outside this span (presumably
# get_file_info_records, which the loop below calls -- confirm).
# The body fetches file info for `repo_name` with get_file_info() and returns one dict per entry,
# copying the entry's name, path, sha, size, URLs and type and adding the values returned by
# curr_commit_master() and curr_time_utc().
# The script that follows builds these records for each repo and pushes them to BigQuery,
# printing a progress line per repo.
data = get_file_info(repo_name, gh_username, gh_oauth_key) curr_time = curr_time_utc() curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key) return [{ 'repo_name': repo_name, 'file_name': record['name'], 'path': record['path'], 'sha': record['sha'], 'size': record['size'], 'api_url': record['url'], 'html_url': record['html_url'], 'git_url': record['git_url'], 'download_url': record['download_url'], 'type': record['type'], 'curr_commit_master': curr_commit, 'time_accessed': curr_time } for record in data] print("%s\tGetting file info from GitHub API and pushing to file info table" % curr_time_utc()) num_done = 0 num_repos = len(repos) for repo_name in repos: file_info_records = get_file_info_records(repo_name) num_done = num_done + 1 print("%s\tPushing %s file info records for repo %s/%s: %s" % (curr_time_utc(), len(file_info_records), num_done, num_repos, repo_name)) push_bq_records(client, dataset, table, file_info_records)