def contribution_percentage(result=None):
    """Summarise developer contributions per year and connectivity level.

    Parameters
    ----------
    result : dict, optional
        Mapping ``year -> {'k_components': {k: [...]}}``.  When omitted it
        is loaded with ``utils.load_result_pkl(connectivity_file)``.  The
        second element of every k-component entry is used as a node set.

    Returns
    -------
    dict
        ``contributions[year]`` holds a ``'total'`` entry plus one entry per
        level ``k`` in ``2..max_k``, where ``max_k`` is the highest level
        found in any year.  Every entry is a 5-tuple ``(developers,
        developer share, contributions, contribution share, contributions
        per developer)``.  In the ``'total'`` entry both shares are the
        constant ``1``; in the per-level entries they are percentages.
        Levels absent from a year get ``(0, 0, 0, 0, 0)``; levels that
        exist but contain no developers are reported on stdout and are
        left out of that year's dict.
    """
    if result is None:
        result = utils.load_result_pkl(connectivity_file)

    # Use one common range of levels for all years.
    max_k = max(flatten([result[year]['k_components'].keys()
                         for year in result]))
    levels = range(2, max_k + 1)

    contributions = {}
    for G in networks_by_year():
        year = G.graph['year']
        per_level = contributions[year] = {}

        devs = {node for node, attrs in G.nodes(data=True)
                if attrs['bipartite'] == 1}
        total = float(sum(G.degree(devs, weight='weight').values()))
        all_devs = float(len(devs))
        per_level['total'] = (all_devs, 1, total, 1, total / all_devs)

        kcomps = result[year]['k_components']
        for k in levels:
            if k not in kcomps:
                per_level[k] = (0, 0, 0, 0, 0)
                continue

            members = set.union(*[entry[1] for entry in kcomps[k]])
            devs_at_k = members & devs
            if not devs_at_k:
                print("No developers at level {0} in year {1}".format(k, year))
                continue

            count = len(devs_at_k)
            contrib_at_k = sum(
                G.degree(devs_at_k, weight='weight').values())
            per_level[k] = (
                count,
                (count / all_devs) * 100,
                contrib_at_k,
                (contrib_at_k / total) * 100,
                contrib_at_k / float(count),
            )
    return contributions
def get_developers_by_years(networks=None):
    """Map each network's year to the set of its developer nodes.

    Parameters
    ----------
    networks : iterable of graphs, optional
        Each graph must expose ``graph['year']`` and ``nodes(data=True)``.
        Defaults to ``networks_by_year()``.

    Returns
    -------
    dict
        ``{year: set of nodes whose 'bipartite' attribute equals 1}``.
    """
    if networks is None:
        networks = networks_by_year()
    return {
        G.graph['year']: {node for node, attrs in G.nodes(data=True)
                          if attrs['bipartite'] == 1}
        for G in networks
    }
def compute_tenure_by_year(last_year, networks=None):
    """Return, for every node, how many years it has been around.

    Networks are scanned in the order given and scanning stops at the
    first one whose ``graph['year']`` is greater than ``last_year``.  The
    year in which a node is first encountered is its starting year.

    Parameters
    ----------
    last_year : int
        Year up to which (inclusive) tenure is measured.
    networks : iterable of graphs, optional
        Defaults to ``networks_by_year()``.

    Returns
    -------
    dict
        ``{node: 1 + last_year - first_year_seen}`` for every node of the
        scanned networks (all nodes, not only developers).
    """
    if networks is None:
        networks = networks_by_year()

    first_seen = {}
    for G in networks:
        year = G.graph['year']
        if year > last_year:
            break
        for node in G:
            # Only the earliest year is kept for each node.
            first_seen.setdefault(node, year)

    return {node: 1 + last_year - start
            for node, start in first_seen.items()}
def compute_k_components(nets=None, names=None):
    """Run ``k_components`` on every network and pickle the results.

    Parameters
    ----------
    nets : iterable of graphs, optional
        Defaults to ``networks_by_year()``.
    names : iterable, optional
        Labels paired with ``nets`` via ``zip``; defaults to
        ``default_years``.

    The result maps each name to ``{'k_components': ..., 'k_num': ...}``
    and is written with ``utils.write_results_pkl`` to a timestamped file
    under ``results_dir``.  The file is tagged ``'years'`` when the last
    processed name equals 2014 and ``'branches'`` otherwise.  Nothing is
    returned.
    """
    date = datetime.datetime.today().strftime("%Y%m%d%H%M")
    if names is None:
        names = default_years
    if nets is None:
        nets = networks_by_year()

    result = {}
    for name, G in zip(names, nets):
        print("Analizing {}".format(name))
        k_comp, k_num = k_components(G)
        result[name] = {'k_components': k_comp, 'k_num': k_num}

    # `name` still holds the label of the last processed network here.
    if name == 2014:
        fn = 'years'
    else:
        fn = 'branches'
    fname = "{0}/k_components_{1}_{2}.pkl".format(results_dir, fn, date)
    utils.write_results_pkl(result, fname)
def summay_results(nets=None, years=None):
    """Collect descriptive statistics for every yearly network.

    Parameters
    ----------
    nets : iterable of graphs, optional
        Defaults to ``networks_by_year()``.
    years : iterable, optional
        Labels paired with ``nets`` via ``zip``; defaults to
        ``default_years``.

    Returns
    -------
    dict
        ``{year: stats}``.  ``stats`` holds the bipartite density, the
        share of nodes in the largest connected and biconnected component,
        node and file counts by file extension, weighted developer degree
        sums, the mean square clustering of developers, and the number of
        new, continuing and lost developers compared with the previous
        network in the sequence.
    """
    if years is None:
        years = default_years
    if nets is None:
        nets = networks_by_year()

    def largest_share(components, order):
        # Fraction of nodes in the biggest component; 0 if there is none.
        sizes = [len(component) for component in components]
        return max(sizes) / float(order) if sizes else 0

    result = {}
    previous_devs = None
    for year, G in zip(years, nets):
        stats = result[year] = {}
        devs = {node for node, attrs in G.nodes(data=True)
                if attrs['bipartite'] == 1}
        files = set(G) - devs

        def degree_sum(weight):
            return sum(nx.degree(G, devs, weight=weight).values())

        stats['guido_in'] = u'Guido van Rossum' in G
        stats['density'] = bp.density(G, devs)
        stats['cc'] = largest_share(nx.connected_components(G), G.order())
        stats['bcc'] = largest_share(nx.biconnected_components(G), G.order())
        stats['devs'] = len(devs)
        stats['files'] = len(files)
        stats['py_files'] = sum(1 for f in files if f.endswith('.py'))
        stats['c_files'] = sum(1 for f in files if f.endswith(('.c', '.h')))
        # Only the '.txt' check is case-insensitive.
        stats['doc_files'] = sum(
            1 for f in files
            if f.lower().endswith('.txt') or f.endswith(('.rst', '.tex')))
        for weight in ('weight', 'added', 'deleted', 'edits'):
            stats[weight] = degree_sum(weight)
        stats['sq_clustering'] = (
            sum(nx.square_clustering(G, devs).values()) / float(len(devs)))

        if previous_devs is None:
            # First year: everybody is new, nobody can be lost.
            new_count, kept_count, lost_count = len(devs), 0, 0
        else:
            new_count = len(devs - previous_devs)
            kept_count = len(devs & previous_devs)
            lost_count = len(previous_devs - devs)
        stats['new_devs'] = new_count
        stats['continue_devs'] = kept_count
        stats['lost_devs'] = lost_count
        previous_devs = devs
    return result
def build_mobility_network(connectivity):
    """Build a directed graph of developer flows between yearly top levels.

    Parameters
    ----------
    connectivity : dict
        Mapping ``year -> {'k_components': {k: [...]}}``; the second
        element of every k-component entry is used as a node set.

    Returns
    -------
    networkx.DiGraph
        One node per year from 1992 to 2014 carrying the developers found
        in that year's highest k-component level (``devs``,
        ``number_devs``, ``total_devs``).  An edge ``year -> later year``
        carries the developers present in both years that were not already
        assigned to an edge towards an earlier later-year.  ``"<year>-in"``
        nodes feed developers seen for the first time that year, and
        ``"<year>-out"`` nodes receive developers that appear in no later
        year (no out node is created for the last year).
    """
    years = range(1992, 2015)
    final_year = years[-1]

    networks = networks_by_year()
    next(networks)  # the first yearly network is not part of the graph

    H = nx.DiGraph()
    for year, G in zip(years, networks):
        devs = {node for node, attrs in G.nodes(data=True)
                if attrs['bipartite'] == 1}
        kcomps = connectivity[year]['k_components']
        top_level = kcomps[max(kcomps)]
        core_devs = set.union(*[entry[1] for entry in top_level]) & devs
        H.add_node(year, devs=core_devs, number_devs=len(core_devs),
                   total_devs=len(devs))

    # Year-to-year edges: each developer goes to the nearest later year.
    for year in years:
        linked = set()
        for later in range(year + 1, final_year + 1):
            shared = H.node[year]['devs'] & H.node[later]['devs']
            fresh = shared - linked
            if fresh:
                H.add_edge(year, later, devs=fresh, weight=len(fresh))
                linked.update(fresh)

    # Entry ("-in") and exit ("-out") nodes.
    introduced = set()
    for year in years:
        devs = H.node[year]['devs']
        later_devs = set().union(
            *[H.node[n]['devs'] for n in range(year + 1, final_year + 1)])
        leaving = devs - later_devs
        if leaving and year != final_year:
            out_node = "%s-out" % year
            H.add_node(out_node, devs=leaving, number_devs=len(leaving))
            H.add_edge(year, out_node, devs=leaving, weight=len(leaving))
        arriving = devs - introduced
        if arriving:
            in_node = "%s-in" % year
            H.add_node(in_node, devs=arriving, number_devs=len(arriving))
            H.add_edge(in_node, year, devs=arriving, weight=len(arriving))
        introduced.update(devs)
    return H
def compute_centrality(nets=None, names=None):
    """Compute centrality measures for every network and pickle them.

    Parameters
    ----------
    nets : iterable of graphs, optional
        Defaults to ``networks_by_year()``.
    names : iterable, optional
        Labels paired with ``nets`` via ``zip``; defaults to
        ``default_years``.

    For each network the result stores bipartite degree (``'deg'``),
    betweenness (``'bet'``) and closeness (``'clos'``) centrality computed
    with the developer node set, plus eigenvector centrality (``'ev'``) of
    the whole graph.  The result is written with
    ``utils.write_results_pkl`` to a timestamped file under
    ``results_dir``, tagged ``'years'`` when the last processed name
    equals 2014 and ``'branches'`` otherwise.  Nothing is returned.
    """
    date = datetime.datetime.today().strftime("%Y%m%d%H%M")
    if names is None:
        names = default_years
    if nets is None:
        nets = networks_by_year()

    result = {}
    for name, G in zip(names, nets):
        measures = result[name] = {}
        print("computing centrality for {}".format(name))
        devs = {node for node, attrs in G.nodes(data=True)
                if attrs['bipartite'] == 1}
        measures['deg'] = bp.degree_centrality(G, devs)
        try:
            betweenness = bp.betweenness_centrality(G, devs)
        except ZeroDivisionError:
            # Betweenness could not be computed; store an empty mapping.
            betweenness = {}
        measures['bet'] = betweenness
        measures['clos'] = bp.closeness_centrality(G, devs)
        measures['ev'] = nx.eigenvector_centrality_numpy(G)

    # `name` still holds the label of the last processed network here.
    if name == 2014:
        fn = 'years'
    else:
        fn = 'branches'
    fname = "{0}/bipartite_centrality_{1}_{2}.pkl".format(results_dir, fn, date)
    utils.write_results_pkl(result, fname)
def compute_layouts(nets=None, names=None):
    """Compute graphviz layouts for every network and pickle them.

    Parameters
    ----------
    nets : iterable of graphs, optional
        Defaults to ``networks_by_year()``.  The graphs are modified in
        place: any ``'name'`` node attribute is removed before layout.
    names : iterable, optional
        Labels paired with ``nets`` via ``zip``; defaults to
        ``default_years``.

    For each network the result stores a ``neato`` layout (``'pos_kk'``)
    and an ``fdp`` layout (``'pos_fdp'``).  If the ``fdp`` layout fails,
    the failure is reported on stdout and the ``neato`` positions are
    reused.  The result is written with ``utils.write_results_pkl`` to a
    timestamped file under ``results_dir``, tagged ``'years'`` when the
    last processed name equals 2014 and ``'branches'`` otherwise.  Nothing
    is returned.
    """
    datet = datetime.datetime.today()
    date = datet.strftime("%Y%m%d%H%M")
    if names is None:
        names = default_years
    if nets is None:
        nets = networks_by_year()
    result = {}
    for name, G in zip(names, nets):
        # This is likely a pydot bug, nodes cannot have a 'name' attribute
        for data in G.node.values():
            data.pop('name', None)
        result[name] = {}
        print("Analizing {}".format(name))
        result[name]['pos_kk'] = nx.graphviz_layout(G, prog='neato')
        try:
            result[name]['pos_fdp'] = nx.graphviz_layout(G, prog='fdp')
        except Exception as err:
            # Best-effort fallback.  Catch Exception rather than using a
            # bare except so Ctrl-C and SystemExit still stop the run.
            print("fdp layout failed for {0} ({1}); reusing neato layout"
                  .format(name, err))
            result[name]['pos_fdp'] = result[name]['pos_kk']
    # `name` still holds the label of the last processed network here.
    fn = 'years' if name == 2014 else 'branches'
    fname = "{0}/layouts_{1}_{2}.pkl".format(results_dir, fn, date)
    utils.write_results_pkl(result, fname)
def main():
    """Parse the command line and run every analysis whose flag is set.

    All options are boolean switches that default to ``False``.  The
    selected analyses run in a fixed order: connectivity, centrality and
    layouts (each by year, then by branch), followed by the survival data
    frame and the developer contribution data frame.
    """
    # (short flag, long flag, destination attribute, help text)
    switches = (
        ('-c', '--conn_years', 'connectivity_years',
         'Connectivity analysis by year'),
        ('-d', '--conn_branches', 'connectivity_branches',
         'Connectivity analysis by branch'),
        ('-e', '--cent_years', 'centrality_years',
         'Centrality analysis by year'),
        ('-f', '--cent_branches', 'centrality_branches',
         'Centrality analysis by branch'),
        ('-l', '--layouts_years', 'layouts_years',
         'Compute layouts by year'),
        ('-m', '--layouts_branches', 'layouts_branches',
         'Compute layouts by branch'),
        ('-s', '--survival', 'survival',
         'Build and save survival Data Frame'),
        ('-z', '--contrib', 'contrib',
         'Build and save contribution and PEP authoring DF'),
    )
    parser = OptionParser()
    for short_flag, long_flag, dest, help_text in switches:
        parser.add_option(short_flag, long_flag, action='store_true',
                          dest=dest, help=help_text, default=False)
    options, _ = parser.parse_args()

    # Wrapped in lambdas so nothing is loaded unless its flag was given.
    actions = (
        ('connectivity_years',
         lambda: compute_k_components(nets=networks_by_year(),
                                      names=default_years)),
        ('connectivity_branches',
         lambda: compute_k_components(nets=networks_by_branches(),
                                      names=default_branches)),
        ('centrality_years',
         lambda: compute_centrality(nets=networks_by_year(),
                                    names=default_years)),
        ('centrality_branches',
         lambda: compute_centrality(nets=networks_by_branches(),
                                    names=default_branches)),
        ('layouts_years',
         lambda: compute_layouts(nets=networks_by_year(),
                                 names=default_years)),
        ('layouts_branches',
         lambda: compute_layouts(nets=networks_by_branches(),
                                 names=default_branches)),
        ('survival',
         lambda: build_survival_data_frame(fname=survival_file)),
        ('contrib',
         lambda: write_developer_contrib_df(
             fname='data/developer_contributions_df.csv')),
    )
    for dest, run in actions:
        if getattr(options, dest):
            run()
def write_developer_contrib_df(fname='data/developer_contributions_df.csv'):
    """Write one CSV row per developer and year with contribution metrics.

    Parameters
    ----------
    fname : str
        Path of the CSV file to create.

    The yearly networks are loaded with ``networks_by_year()``, the first
    one is discarded and the rest are paired with the years 1992-2014.
    Connectivity and centrality results are read from
    ``connectivity_file`` and ``centrality_file``.  Each row combines PEP
    authorship counts, weighted degrees, k-component numbers, tenure,
    normalized centralities and clustering values for one developer.

    NOTE(review): tenure is computed from the network list *after* the
    first network has been discarded -- confirm this is intended.
    """
    def flag(dev, members):
        # 1/0 indicator of membership.
        return 1 if dev in members else 0

    def count(dev, counts):
        # Value stored for `dev`, or 0 when the developer is absent.
        return counts[dev] if dev in counts else 0

    ids = utils.UniqueIdGenerator()
    peps = [pep for pep in get_peps() if pep.created is not None]
    connectivity = utils.load_result_pkl(connectivity_file)
    centrality = utils.load_result_pkl(centrality_file)

    networks_gen = networks_by_year()
    next(networks_gen)  # discard the first yearly network
    networks = list(networks_gen)
    years = range(1992, 2015)
    devs_by_year = get_developers_by_years(networks=networks)

    header = [
        'id', 'year', 'dev', 'has_written_peps', 'has_written_acc_peps',
        'is_delegate', 'peps_this_year', 'total_peps',
        'accepted_peps_year', 'total_accepted_peps', 'degree',
        'contributions_sc', 'contributions_edits', 'contributions_added',
        'contributions_deleted', 'collaborators', 'knum', 'aknum', 'top',
        'top2', 'tenure', 'betweenness', 'closeness', 'degree_cent',
        'file_mean_degree', 'clus_sq', 'clus_dot', 'clus_red',
    ]

    with open(fname, 'wb') as f:
        out = csv.writer(f)
        out.writerow(header)
        for year, G in zip(years, networks):
            print("Analyzing {}".format(G.name))
            bdfl_delegates = get_delegates_by_year(year, peps=peps)
            peps_this_year = peps_by_developer_that_year(year, peps=peps)
            peps_until_year = peps_by_developer_until_year(year, peps=peps)
            acc_peps_this_year = accepted_peps_by_developer_that_year(
                year, peps=peps)
            acc_peps_until_year = accepted_peps_by_developer_until_year(
                year, peps=peps)
            top = get_developers_top_connectivity_by_year(
                G, year, connectivity=connectivity)
            top2 = get_developers_top_connectivity_by_year_new(
                G, year, connectivity=connectivity)
            tenure = compute_tenure_by_year(year, networks=networks)
            k_num = connectivity[year]['k_num']
            year_centrality = centrality[year]
            bet = normalize(year_centrality['bet'])
            clos = normalize(year_centrality['clos'])
            deg = normalize(year_centrality['deg'])
            clus_sq = nx.square_clustering(G)
            clus_dot = bp.clustering(G)
            clus_red = bp.node_redundancy(G)

            for dev in devs_by_year[year]:
                nbrs = G[dev]
                row = [
                    ids[dev],
                    year,
                    dev.encode('utf8'),
                    flag(dev, peps_until_year),       # wrote at least one pep
                    flag(dev, acc_peps_until_year),   # wrote an accepted pep
                    flag(dev, bdfl_delegates),        # has been BDFL delegate
                    count(dev, peps_this_year),       # peps written this year
                    count(dev, peps_until_year),      # peps written so far
                    count(dev, acc_peps_this_year),   # peps accepted this year
                    count(dev, acc_peps_until_year),  # peps accepted so far
                    len(nbrs),                        # unweighted degree
                    G.degree(dev, weight='weight'),   # lines added plus deleted
                    G.degree(dev, weight='edits'),    # number of file edits
                    G.degree(dev, weight='added'),    # lines of code added
                    G.degree(dev, weight='deleted'),  # lines of code removed
                    second_order_nbrs(G, dev),        # second order neighbors
                    k_num[dev][0],                    # k-component number
                    k_num[dev][1],                    # average k-component number
                    flag(dev, top),                   # top connectivity level
                    flag(dev, top2),                  # top 2 connectivity level
                    tenure[dev],
                    bet[dev],
                    clos[dev],
                    deg[dev],
                    # mean degree of the developer's neighbors
                    sum(len(G[n]) for n in nbrs) / float(len(nbrs)),
                    clus_sq[dev],
                    clus_dot[dev],
                    clus_red[dev],
                ]
                out.writerow(row)
def build_survival_data_frame(fname=survival_file):
    """Write the per-period developer survival table as CSV.

    Parameters
    ----------
    fname : str
        Path of the CSV file to create; defaults to ``survival_file``.

    The first yearly network (1991) is skipped and 1992 is the starting
    year.  For every following year ``y`` in 1993-2014, one row is written
    for each developer of the *previous* year, describing that developer
    with the previous year's network, connectivity and centrality.
    ``status`` is 0 when the developer is among
    ``get_all_remaining_devs(devs, years[i:])`` and 1 otherwise.

    Notes
    -----
    * ``closeness`` is read from the ``'clos'`` centrality entry; the
      ``betweenness`` column uses ``'bet'``.
    * The ``biconnected`` column indexes ``k_num`` directly, so a developer
      missing from ``k_num`` raises ``KeyError``.
    * NOTE(review): opening with ``'wb'`` and writing ``dev.encode('utf8')``
      matches the Python 2 ``csv`` idiom -- confirm the target interpreter.
    """
    nan = float('nan')
    ids = utils.UniqueIdGenerator()
    connectivity = utils.load_result_pkl(connectivity_file)
    centrality = utils.load_result_pkl(centrality_file)
    peps = [pep for pep in get_peps() if pep.created is not None]

    # Load the networks once; the full list is reused for tenure below so
    # they are not reloaded for every year.
    all_networks = list(networks_by_year())
    devs = get_developers_by_years(networks=all_networks)
    G_start = all_networks[1]    # skip 1991, start with 1992
    networks = all_networks[2:]
    devs_start = set(n for n, d in G_start.nodes(data=True)
                     if d['bipartite'] == 1)
    years = range(1993, 2015)

    with open(fname, 'wb') as f:
        out = csv.writer(f)
        out.writerow([
            'id', 'dev', 'period', 'rstart', 'rstop', 'status',
            'has_written_peps', 'has_written_acc_peps', 'peps_this_year',
            'total_peps', 'accepted_peps_year', 'total_accepted_peps',
            'biconnected', 'top', 'tenure', 'colaborators', 'knum',
            'aknum', 'clus_sq', 'clus_dot', 'clus_red', 'degree',
            'contributions', 'dcentrality', 'betweenness', 'closeness',
        ])
        previous_devs = devs_start
        previous_year = 1992
        previous_G = G_start
        for i, (year, G) in enumerate(zip(years, networks)):
            print("processing year {}".format(previous_year))
            these_devs = devs[year]
            remaining_devs = get_all_remaining_devs(devs, years[i:])
            k_num = connectivity[previous_year]['k_num']
            top_devs = get_developers_top_connectivity(
                connectivity[previous_year]['k_components'], previous_devs)
            tenure = compute_tenure_by_year(previous_year,
                                            networks=all_networks)
            year_centrality = centrality[previous_year]
            bet = normalize(year_centrality['bet'])
            clos = normalize(year_centrality['clos'])
            deg = normalize(year_centrality['deg'])
            clus_sq = nx.square_clustering(previous_G)
            clus_dot = bp.clustering(previous_G)
            clus_red = bp.node_redundancy(previous_G)
            peps_this_year = peps_by_developer_that_year(
                previous_year, peps=peps)
            peps_until_year = peps_by_developer_until_year(
                previous_year, peps=peps)
            acc_peps_this_year = accepted_peps_by_developer_that_year(
                previous_year, peps=peps)
            acc_peps_until_year = accepted_peps_by_developer_until_year(
                previous_year, peps=peps)
            for dev in previous_devs:
                out.writerow([
                    ids[dev],  # developer numerical ID
                    dev.encode('utf8'),  # developer name
                    i + 1,  # period
                    i,  # start
                    i + 1,  # stop
                    0 if dev in remaining_devs else 1,  # status (censored)
                    # developer has written at least a pep
                    1 if dev in peps_until_year else 0,
                    # developer has written at least an acc. pep
                    1 if dev in acc_peps_until_year else 0,
                    # peps written this year
                    peps_this_year[dev] if dev in peps_this_year else 0,
                    # peps written until this year
                    peps_until_year[dev] if dev in peps_until_year else 0,
                    # peps acc. this year
                    acc_peps_this_year[dev]
                    if dev in acc_peps_this_year else 0,
                    # total peps acc.
                    acc_peps_until_year[dev]
                    if dev in acc_peps_until_year else 0,
                    0 if k_num[dev][0] < 2 else 1,  # biconnected
                    # member of the top connectivity level
                    0 if dev not in top_devs else 1,
                    tenure[dev],  # tenure in years
                    second_order_nbrs(previous_G, dev),  # collaborators
                    k_num.get(dev, (nan, nan))[0],  # knum
                    k_num.get(dev, (nan, nan))[1],  # aknum
                    clus_sq.get(dev, nan),
                    clus_dot.get(dev, nan),
                    clus_red.get(dev, nan),
                    previous_G.degree(dev),  # degree
                    previous_G.degree(dev, weight='weight'),  # contributions
                    deg.get(dev, nan),
                    bet.get(dev, nan),
                    clos.get(dev, nan),
                ])
            previous_devs = these_devs
            previous_year = year
            previous_G = G