Example #1
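# Build one license record for a repo, stamped with the current master commit and access time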
def get_record(repo_name):
    lic = get_license(repo_name, gh_username, gh_oauth_key)
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return {
        'repo_name': repo_name,
        'license': lic,
        'curr_commit_master': curr_commit,
        'time_accessed': curr_time
    }
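Every example on this page stamps its records with curr_time_utc() and curr_commit_master(); neither helper is shown here. A minimal sketch of the timestamp helper, assuming it simply returns the current UTC time as an ISO-8601 string (the real one may format differently):

from datetime import datetime, timezone

def curr_time_utc():
    # Hypothetical sketch: current UTC time as an ISO-8601 string,
    # e.g. '2024-01-01T12:00:00+00:00'.
    return datetime.now(timezone.utc).isoformat()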
Example #2
def get_records(repo_name):
    data = get_commits(repo_name, gh_username, gh_oauth_key)
    try:
        validate_response_found(data[0])
    except ValueError:
        return None
    except KeyError:
        return None
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [get_record(dct, repo_name, curr_time, curr_commit) for dct in data]
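validate_response_found is expected to raise when the API answers with an error object instead of a list of commits: ValueError for an explicit "Not Found" message, and KeyError because indexing an error dict with data[0] already fails. A sketch of such a check, under the assumption that GitHub's error responses carry a "message" field:

def validate_response_found(first_record):
    # Hypothetical sketch: treat a record carrying a "message" field
    # (e.g. {"message": "Not Found"}) as an error response.
    if "message" in first_record:
        raise ValueError("GitHub API error: %s" % first_record["message"])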
Example #3
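# Build one record per language, with its byte count, for a repo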
def get_records(repo_name):
    data = get_language_bytes(repo_name, gh_username, gh_oauth_key)
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [{
        'repo_name': repo_name,
        'language_name': key,
        'language_bytes': data[key],
        'curr_commit_master': curr_commit,
        'time_accessed': curr_time
    } for key in data.keys()]
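The driver loop for this function is not shown in this excerpt; modeled on the loops in the later examples (and assuming the same repos, client, dataset and table globals), it would look roughly like:

# Hypothetical driver, patterned after the loops in the later examples.
for repo_name in repos:
    records = get_records(repo_name)
    if records:
        push_bq_records(client, dataset, table, records)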
Example #4
def get_record(repo_name, pr_data):
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return {'repo_name': repo_name,
            'pr_id': pr_data['id'],
            'state': pr_data['state'],
            'api_url': pr_data['url'],
            'html_url': pr_data['html_url'],
            'title': pr_data['title'],
            'body': pr_data['body'],
            'user_login': pr_data['user']['login'],
            'user_id': pr_data['user']['id'],
            'curr_commit_master': curr_commit,
            'time_accessed': curr_time}
Example #5
def get_record(repo_name):
    r = repo.Repo(repo_name, gh_username, gh_oauth_key)
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return {'repo_name': r.get_repo_name(),
            'api_url': r.get_gh_api_url(),
            'html_url': r.get_html_url(),
            'description': r.get_description(),
            'is_fork': r.is_fork(),
            'stargazers_count': r.get_stargazers_count(),
            'watchers_count': r.get_watchers_count(),
            'forks_count': r.get_forks_count(),
            'open_issues_count': r.get_open_issues_count(),
            'subscribers_count': r.get_subscribers_count(),
            'curr_commit_master': curr_commit,
            'time_accessed': curr_time}
Example #6
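# Build one file-info record per entry in the repo contents listing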
def get_file_info_records(repo_name):
    data = get_file_info(repo_name, gh_username, gh_oauth_key)
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [{
        'repo_name': repo_name,
        'file_name': record['name'],
        'path': record['path'],
        'sha': record['sha'],
        'size': record['size'],
        'api_url': record['url'],
        'html_url': record['html_url'],
        'git_url': record['git_url'],
        'download_url': record['download_url'],
        'type': record['type'],
        'curr_commit_master': curr_commit,
        'time_accessed': curr_time
    } for record in data]
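get_file_info is not shown, but the fields consumed above (name, path, sha, size, url, html_url, git_url, download_url, type) are exactly what GitHub's contents endpoint returns per entry. A sketch using requests, assuming repo_name is the full "owner/name" slug and ignoring recursion into subdirectories:

import requests

def get_file_info(repo_name, username, oauth_key):
    # Hypothetical sketch: list the repo's top-level contents.
    # The real helper may walk the tree recursively and may use pycurl
    # (a pycurl.error is caught in a later example).
    url = "https://api.github.com/repos/%s/contents/" % repo_name
    resp = requests.get(url, auth=(username, oauth_key))
    resp.raise_for_status()
    return resp.json()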
Example #7
def get_contents_record(file_info_record):
    repo_name = file_info_record["repo_name"]
    path = file_info_record["path"]
    git_url = file_info_record["git_url"]
    curr_time = curr_time_utc()
    contents = None
    size = file_info_record["size"]
    if size <= max_record_size - 1000:
        try:
            contents = get_file_contents(git_url, gh_username, gh_oauth_key)
        except:
            pass
    return {
        'repo_name': repo_name,
        'file_name': file_info_record["file_name"],
        'path': path,
        'sha': file_info_record["sha"],
        'git_url': git_url,
        'contents': contents,
        'time_accessed': curr_time
    }
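get_file_contents is handed the record's git_url, which for GitHub file listings points at the git blob endpoint (/repos/{owner}/{repo}/git/blobs/{sha}). A sketch of the helper, assuming requests and that callers want the decoded text; the bare except above swallows network and decoding failures alike, leaving contents as None:

import base64
import requests

def get_file_contents(git_url, username, oauth_key):
    # Hypothetical sketch: fetch the blob JSON and decode its base64 payload.
    resp = requests.get(git_url, auth=(username, oauth_key))
    resp.raise_for_status()
    blob = resp.json()
    return base64.b64decode(blob["content"]).decode("utf-8", errors="replace")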
Example #8
            'time_accessed': curr_time}

# Get list of commit records for a repo
def get_records(repo_name):
    data = get_commits(repo_name, gh_username, gh_oauth_key)
    try:
        validate_response_found(data[0])
    except ValueError:
        return None
    except KeyError:
        return None
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [get_record(dct, repo_name, curr_time, curr_commit) for dct in data]
        
print("%s\tGetting commit info from GitHub API and pushing to BigQuery table" % curr_time_utc())
num_done = 0
num_repos = len(repos)
for repo_name in repos:
    records = get_records(repo_name)
    num_done = num_done + 1
    if records is not None:
        print("%s\tPushing %s commit records for repo %s/%s: %s" 
              % (curr_time_utc(), len(records), num_done, num_repos, repo_name))
        push_bq_records(client, dataset, table, records)
    else:
        print("%s\tPushing 0 commit records for repo %s/%s: %s" 
              % (curr_time_utc(), num_done, num_repos, repo_name))
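push_bq_records is the other helper every script relies on. A sketch with the google-cloud-bigquery streaming API, assuming client is a bigquery.Client and records are plain dicts matching the table schema (the legacy SQL seen further down suggests the original code may predate this client library):

def push_bq_records(client, dataset, table, records, max_batch=100):
    # Hypothetical sketch: stream rows in batches of max_batch.
    table_ref = client.dataset(dataset).table(table)
    for start in range(0, len(records), max_batch):
        errors = client.insert_rows_json(table_ref, records[start:start + max_batch])
        if errors:
            raise RuntimeError("BigQuery insert errors: %s" % errors)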


Example #9
def get_contents_record(file_info_record):
    repo_name = file_info_record["repo_name"]
    path = file_info_record["path"]
    git_url = file_info_record["git_url"]
    curr_time = curr_time_utc()
    contents = None
    size = file_info_record["size"]
    if size <= max_record_size - 1000:
        try:
            contents = get_file_contents(git_url, gh_username, gh_oauth_key)
        except:
            pass
    return {
        'repo_name': repo_name,
        'file_name': file_info_record["file_name"],
        'path': path,
        'sha': file_info_record["sha"],
        'git_url': git_url,
        'contents': contents,
        'time_accessed': curr_time
    }


print(
    "%s\tGetting file contents from GitHub API and pushing to file contents table"
    % curr_time_utc())
num_done = 0
num_skipped_already_done = 0
num_to_do = len(file_info_records) - num_already_done
recs_to_push = []
for record in file_info_records:
    # Skip if already done
    if (record["repo_name"], record["path"],
            record["sha"]) in existing_contents:
        num_skipped_already_done = num_skipped_already_done + 1
        continue
    recs_to_push.append(get_contents_record(record))
    num_done = num_done + 1
    if num_done % 100 == 0:
        print("%s\tFinished %s/%s records. Pushing %s records to BigQuery." %
              (curr_time_utc(), num_done, num_to_do, len(recs_to_push)))
Example #10
file_info_records = run_bq_query(client, """
SELECT repo_name, file_name, path, sha FROM [%s:%s.%s] 
""" % (proj, dataset, table_info), 120)

# Get initial commit
def get_init_commit(file_info_record):
    repo_name = file_info_record["repo_name"]
    path = file_info_record["path"]
    return {'repo_name': repo_name,
            'file_name': file_info_record["file_name"],
            'path': path,
            'sha': file_info_record["sha"],
            'init_commit_timestamp': get_initial_commit(repo_name, path, gh_username, gh_oauth_key).isoformat()}
    
    
print("%s\tGetting file initial commit times from GitHub API and pushing to table" % curr_time_utc())
num_done = 0
num_skipped_already_done = 0
num_to_do = len(file_info_records) - num_already_done
recs_to_push = []
for record in file_info_records:
    # Skip if already done
    if (record["repo_name"], record["path"], record["sha"]) in existing_records:
        num_skipped_already_done = num_skipped_already_done + 1
        continue
    try:
        recs_to_push.append(get_init_commit(record))
    except ValueError as e:
        print("Caught ValueError; skipping repo %s and path %s. Error:\n%s" % (record["repo_name"], record["path"], e))
    except pycurl.error as e:
        print("Caught pycurl.error; skipping repo %s and path %s. Error:\n%s" % (record["repo_name"], record["path"], e))
Example #11
def get_record(repo_name, pr_data):
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return {'repo_name': repo_name,
            'pr_id': pr_data['id'],
            'state': pr_data['state'],
            'api_url': pr_data['url'],
            'html_url': pr_data['html_url'],
            'title': pr_data['title'],
            'body': pr_data['body'],
            'user_login': pr_data['user']['login'],
            'user_id': pr_data['user']['id'],
            'curr_commit_master': curr_commit,
            'time_accessed': curr_time}
    
print("Getting pull request info from GitHub API")
num_done = 0
num_repos = len(repos)
for repo_name in repos:
    num_done = num_done + 1
    try:
        records = [get_record(repo_name, pr) for pr in get_pull_requests(repo_name, gh_username, gh_oauth_key, "all")]
        if records is not None:
            print("%s\tPushing %s pull request records for repo %s/%s: %s" 
                  % (curr_time_utc(), len(records), num_done, num_repos, repo_name))
            push_bq_records(client = client, dataset = dataset, table = table, records = records, max_batch = 10)
        else:
            print("%s\tPushing 0 pull request records for repo %s/%s: %s" 
                  % (curr_time_utc(), num_done, num_repos, repo_name))
    except KeyError:
        print("Skipping repo %s: %s" % (repo_name, pr['message']))
    except UnicodeEncodeError as e:
        print("Skipping repo %s: %s" % (repo_name, str(e)))




Example #12
def get_file_info_records(repo_name):
    data = get_file_info(repo_name, gh_username, gh_oauth_key)
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [{
        'repo_name': repo_name,
        'file_name': record['name'],
        'path': record['path'],
        'sha': record['sha'],
        'size': record['size'],
        'api_url': record['url'],
        'html_url': record['html_url'],
        'git_url': record['git_url'],
        'download_url': record['download_url'],
        'type': record['type'],
        'curr_commit_master': curr_commit,
        'time_accessed': curr_time
    } for record in data]


print("%s\tGetting file info from GitHub API and pushing to file info table" %
      curr_time_utc())
num_done = 0
num_repos = len(repos)
for repo_name in repos:
    file_info_records = get_file_info_records(repo_name)
    num_done = num_done + 1
    print("%s\tPushing %s file info records for repo %s/%s: %s" %
          (curr_time_utc(), len(file_info_records), num_done, num_repos,
           repo_name))
    push_bq_records(client, dataset, table, file_info_records)