def get_all_repos(gh, orgs, Verbose=False):
    """Fetch the repository list for every project in *orgs*.

    Args:
        gh: authenticated github handle, passed through to cc.get_repos.
        orgs: mapping of project name -> (org_name, repo_prefix) pair;
            only the first two elements of each value are used.
        Verbose: when True, print each project followed by the names of
            its repos after all projects have been fetched.

    Returns:
        dict mapping each project name to whatever cc.get_repos returns
        for that (org, prefix) pair.
    """
    repos = {}
    for project, org in orgs.items():
        # org is a (name, prefix) sequence; unpack by position
        org_name, repo_prefix = org[0], org[1]
        repos[project] = cc.get_repos(gh=gh, org_name=org_name, prefix=repo_prefix)
    if Verbose:
        for proj, repo_list in repos.items():
            print(proj, [repo.name for repo in repo_list])
    return repos
def collect_stats(gh,org_name,update,prefix,top_only):
    """Scrape and persist community statistics for every repo in an org.

    For each repo matching *prefix* in *org_name*, builds a per-repo stats
    dict (a deep copy of a shared template), folds it into a global
    aggregate, and persists both through cortx_community.PersistentStats.

    Args:
        gh: github API handle; passed to avoid_rate_limiting and the
            various get_* helpers.
        org_name (str): organization whose repos are scraped.
        update (bool): when truthy, reuse the latest cached per-repo stats
            instead of re-scraping (development shortcut; per the comments
            below it does not work for incremented values).
        prefix: repo-name filter forwarded to cortx_community.get_repos.
        top_only (bool): when truthy, skip the slow issue/PR and commit
            scrapes and collect only top-level repo info + contributors.

    Returns:
        None; results are persisted via persistent_stats and printed.
    """
    avoid_rate_limiting(gh)
    # date key under which today's stats are stored/persisted
    today = datetime.today().strftime('%Y-%m-%d')

    # populate our persistent data structures from the pickles
    people = cortx_community.CortxCommunity(org_name)
    author_activity = cortx_community.CortxActivity(org_name)
    persistent_stats = cortx_community.PersistentStats(org_name)

    # averages are weird so handle them differently
    ave_age_str='_ave_age_in_s'

    # the shared structure that we use for collecting stats
    # counters are ints, de-duplicating collections are sets, top_* are lists
    global_stats = { 'branches'                       : 0,
                     'clones_count_14_days'           : 0,
                     'clones_unique_14_days'          : 0,
                     'comments'                       : 0,
                     'commits'                        : 0,
                     'companies_contributing'         : set(),
                     'companies'                      : set(),
                     'contributors'                   : set(),
                     'domains'                        : set(),
                     'downloads_releases'             : 0,
                     'downloads_vms'                  : 0,
                     'email_addresses'                : set(),
                     'external_comments'              : 0,
                     'external_email_addresses'       : set(),
                     'forks_external'                 : set(),
                     'forks'                          : set(),
                     'logins'                         : set(),
                     'new_external_activities'        : set(),
                     'new_logins'                     : set(),
                     'pull_requests_external_merged'  : 0,
                     'pull_requests_internal_merged'  : 0,
                     'pull_requests_merged'           : 0,
                     'seagate_blog_referrer_count'    : 0,
                     'seagate_blog_referrer_uniques'  : 0,
                     'seagate_referrer_count'         : 0,
                     'seagate_referrer_uniques'       : 0,
                     'stars_external'                 : set(),
                     'stars'                          : set(),
                     'top_paths'                      : [],
                     'top_referrers'                  : [],
                     'views_count_14_days'            : 0,
                     'views_unique_14_days'           : 0,
                     'watchers_external'              : set(),
                     'watchers'                       : set(),
                   }
    # helpers defined elsewhere in this file populate additional keys
    # (per-actor entries and issue/PR permutations) into global_stats
    load_actors(global_stats,people)
    load_items(global_stats,('issues','pull_requests'),('_external','_internal',''),('','_open','_closed','_open_ave_age_in_s','_closed_ave_age_in_s'))
    local_stats_template = copy.deepcopy(global_stats)  # save an empty copy of the stats struct to copy for each repo

    for repo in cortx_community.get_repos(org_name=org_name,prefix=prefix):
        while True:  # add a while loop since we are always failing and it would be good to run successfully more often
            try:
                local_stats = copy.deepcopy(local_stats_template)  # get an empty copy of the stats structure
                rname=repo.name  # just in case this requires a github API call, fetch it once and reuse it

                # Use this update if you just want to add some new data and don't want to wait for the very slow time
                # to scrape all activity.  Once you have finished the update, migrate the code out of the update block.
                # Typically we don't use update; only during development
                # Note that update doesn't work for values that are incremented . . .
                if update:
                    (cached_local_stats,timestamp) = persistent_stats.get_latest(rname)  # load the cached version
                    print("Fetched %s data for %s" % (timestamp, repo))
                    for k,v in cached_local_stats.items():
                        local_stats[k] = v
                else:
                    get_top_level_repo_info(local_stats,repo,people=people,author_activity=author_activity,gh=gh,org_name=org_name)
                    get_contributors(rname,repo,local_stats,people=people,gh=gh,org_name=org_name)
                    if not top_only:
                        get_issues_and_prs(rname,repo,local_stats,people=people,author_activity=author_activity,gh=gh,org_name=org_name)
                        get_commits(rname,repo,local_stats,people=people,author_activity=author_activity,gh=gh,org_name=org_name)
                        # what we need to do is query when the last time this ran and then pass 'since' to get_commits

                # summarize info for this repo and persist the data structures
                summarize_consolidate(local_stats,global_stats,people=people,author_activity=author_activity,ave_age_str=ave_age_str)
                persist_author_activity(author_activity)
                persistent_stats.add_stats(date=today,repo=rname,stats=local_stats)
                persistent_stats.print_repo(rname,local_stats,date=today,verbose=False,csv=False)
                break  # success: leave the retry loop and move to the next repo
            except Exception as e:
                # NOTE(review): deliberate catch-all retry — any failure
                # (presumably rate-limit or transient API errors) restarts
                # this repo from scratch; a persistent error loops forever.
                print("WTF: Failed while getting stats for repo %s" % repo.name, e)
                avoid_rate_limiting(gh,Verbose=True)

    # do a bit of cleaning on global stats
    # print and persist the global consolidated stats
    # treat the 'ave_age_in_s' fields differently
    # all those fields have consistent names: 'x_ave_age_in_s'
    # also, there will always be a corresponding field x which is the count
    for ave_age in [key for key in global_stats.keys() if ave_age_str in key]:
        # strip the '_ave_age_in_s' suffix to find the matching count key
        item = ave_age[0:len(ave_age)-len(ave_age_str)]
        try:
            global_stats[ave_age] /= global_stats[item]  # summed age -> average age
        except ZeroDivisionError:
            global_stats[ave_age] = 0  # no items of this kind; define the average as 0
    global_stats['top_referrers'] = consolidate_referrers(global_stats['top_referrers'])

    persistent_stats.print_repo('GLOBAL',global_stats,date=today,verbose=False,csv=False)
    persistent_stats.add_stats(date=today,repo='GLOBAL',stats=global_stats)