def centrality_leaders(budgetYears):
    """Rank the top-10 centrality-leader tiers of the network's largest
    component, annotate the vertices, and draw the result to a PNG figure.
    """
    network = load_network_for(budgetYears)
    g = network.g.copy()
    g = ResearchCollaborationNetwork.largest_component(g)

    topK = 10
    candidates, rankings = cl.centrality_leaders(g)

    ordered_list = []
    for tier in range(min(topK, len(rankings))):
        for member in list(rankings[tier]):
            vertex = g.vs[candidates[member]]
            ordered_list.append(vertex['name'])
            # higher tiers receive a larger centrality_leader value
            vertex['centrality_leader'] = topK + 1 - tier

    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]
    filename = '%s/figures/%s-%s-centrality-leaders.png' % (root_folder(),
                                                            startBudgetYear,
                                                            endBudgetYear)
    draw(g, filename)
    logger.info(ordered_list)
def centrality_leaders(budgetYears):
    """Mark the ten leading centrality tiers on the largest connected
    component and render the annotated graph as a PNG under figures/.
    """
    topK = 10
    network = load_network_for(budgetYears)
    graph = ResearchCollaborationNetwork.largest_component(network.g.copy())
    candidates, rankings = cl.centrality_leaders(graph)

    ordered_list = []
    tiers = min(topK, len(rankings))
    for tier in range(tiers):
        members = list(rankings[tier])
        for idx in members:
            node = graph.vs[candidates[idx]]
            ordered_list.append(node['name'])
            # the better the tier, the higher the attribute value
            node['centrality_leader'] = topK + 1 - tier

    first_year = budgetYears[0]
    last_year = budgetYears[-1]
    filename = '%s/figures/%s-%s-centrality-leaders.png' % (
        root_folder(), first_year, last_year)
    draw(graph, filename)
    logger.info(ordered_list)
def update_graphml(budgetYears):
    """Recompute centrality leaders on the largest component and persist the
    annotated network back to its graphml file.
    """
    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]

    network = load_network_for(budgetYears)
    # reset the attribute on every vertex before re-ranking
    network.g.vs['centrality_leader'] = 0

    g = ResearchCollaborationNetwork.largest_component(network.g.copy())
    topK = 50
    candidates, rankings = cl.centrality_leaders(g)

    for r in range(min(topK, len(rankings))):
        for i in list(rankings[r]):
            node_name = g.vs[candidates[i]]['name']
            # write the rank back onto the full network; higher is better
            node = network.g.vs.select(name_eq=node_name)
            node['centrality_leader'] = topK - r

    filename = '%s/data/networks/%d-%d.graphml' % (root_folder(),
                                                   startBudgetYear,
                                                   endBudgetYear)
    network.write(filename)
def load_network_for(budgetYears):
    """Read the ResearchCollaborationNetwork stored for the given span of
    budget years from its graphml file.
    """
    first, last = budgetYears[0], budgetYears[-1]
    filename = '%s/data/networks/%d-%d.graphml' % (root_folder(), first, last)
    return ResearchCollaborationNetwork.read(filename)
def load_network_for(budgetYears):
    """Load the collaboration network covering budgetYears[0]..budgetYears[-1]
    from the graphml file under data/networks/.
    """
    path = '%s/data/networks/%d-%d.graphml' % (
        root_folder(), budgetYears[0], budgetYears[-1])
    network = ResearchCollaborationNetwork.read(path)
    return network
def network_to_d3(budgetYears):
    """Export the network as d3-compatible JSON in three flavours: complete,
    with isolated vertices removed, and largest component only.
    """
    network = load_network_for(budgetYears)
    first, last = budgetYears[0], budgetYears[-1]

    # complete network
    out = '%s/data/networks/%s-%s-complete.json' % (root_folder(), first, last)
    ResearchCollaborationNetwork.d3(network.g, out)

    # isolated vertices removed
    simplified = ResearchCollaborationNetwork.simplify(network.g.copy())
    out = '%s/data/networks/%s-%s.json' % (root_folder(), first, last)
    ResearchCollaborationNetwork.d3(simplified, out)

    # largest connected component only
    largest = ResearchCollaborationNetwork.largest_component(network.g.copy())
    out = '%s/data/networks/%s-%s-largest-component.json' % (root_folder(),
                                                             first, last)
    ResearchCollaborationNetwork.d3(largest, out)
def draw_g(budgetYears):
    """Draw the simplified (undirected, de-isolated) network and its largest
    component as PNG figures whose names encode the vertex counts.
    """
    network = load_network_for(budgetYears)
    g = ResearchCollaborationNetwork.simplify(network.g.copy())

    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]
    filename = '%s/figures/%s-%s-%d.png' % (root_folder(), startBudgetYear,
                                            endBudgetYear, len(g.vs))
    draw(g, filename)

    gl = ResearchCollaborationNetwork.largest_component(g)
    filename = '%s/figures/%s-%s-%d-largest-component.png' % (
        root_folder(), startBudgetYear, endBudgetYear, len(gl.vs))
    draw(gl, filename)
def network_to_d3(budgetYears):
    """Write three d3 JSON exports of the stored network: the complete graph,
    a simplified graph, and the largest connected component.
    """
    network = load_network_for(budgetYears)
    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]

    filename = '%s/data/networks/%s-%s-complete.json' % (
        root_folder(), startBudgetYear, endBudgetYear)
    ResearchCollaborationNetwork.d3(network.g, filename)

    # drop isolated vertices before the second export
    g = ResearchCollaborationNetwork.simplify(network.g.copy())
    filename = '%s/data/networks/%s-%s.json' % (
        root_folder(), startBudgetYear, endBudgetYear)
    ResearchCollaborationNetwork.d3(g, filename)

    # restrict to the largest component for the third export
    g = ResearchCollaborationNetwork.largest_component(network.g.copy())
    filename = '%s/data/networks/%s-%s-largest-component.json' % (
        root_folder(), startBudgetYear, endBudgetYear)
    ResearchCollaborationNetwork.d3(g, filename)
def test():
    """Render per-user and per-network ROC curves side by side and save the
    combined figure as an EPS file.
    """
    from matplotlib import rc
    rc('text', usetex=False)
    rc('font', family='serif')

    figure = plt.figure(figsize=(8, 16))

    # left panel: per-user model
    task = 'per_user'
    axis = figure.add_subplot(1, 2, 1)
    roc.roc_curve_init(axis)
    area, [axis, line_b] = plot_auc(2006, 2009, task, axis, 'b')
    area, [axis, line_g] = plot_auc(2010, 2012, task, axis, 'g')
    area, [axis, line_r] = plot_auc(2006, 2012, task, axis, 'r')
    axis.set_title('Per-user Model')

    # right panel: per-network model
    task = 'per_network'
    axis = figure.add_subplot(1, 2, 2)
    roc.roc_curve_init(axis)
    area, [axis, line_b] = plot_auc(2006, 2009, task, axis, 'b')
    area, [axis, line_g] = plot_auc(2010, 2012, task, axis, 'g')
    area, [axis, line_r] = plot_auc(2006, 2012, task, axis, 'r')
    axis.set_title('Per-network Model')

    figure.subplots_adjust(left=None, bottom=None, right=None, top=None,
                           wspace=.3, hspace=.2)
    plt.legend((line_b, line_g, line_r),
               ('RCN (2006 - 2009)', 'RCN (2010 - 2012)', 'RCN (2006 - 2012)'),
               bbox_to_anchor=(-0.15, -0.4), loc='center', prop={'size': 12})
    plt.savefig('%s/figures/roc-curve.eps' % (root_folder()),
                bbox_inches='tight', dpi=600)
def plot_auc(startBudgetYear, endBudgetYear, task, ax, color):
    """Plot the ROC curve for one saved samples file on the given axis.

    Loads '<root>/data/<start>-<end>.<task>.roc.samples.npy' (rows of
    (key, label, score)), then delegates to roc.roc_curve.

    Returns (area, [ax, lines]) as produced by roc.roc_curve.
    """
    filename = '%s/data/%s-%s.%s.roc.samples.npy' % (
        root_folder(), startBudgetYear, endBudgetYear, task)
    roc_samples = np.load(filename)
    labels = []
    scores = []
    for k, label, score in roc_samples:
        # np.float was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin float is the documented replacement.
        labels.append(float(label))
        scores.append(float(score))
    area, [ax, lines] = roc.roc_curve(labels=np.array(labels),
                                      scores=np.array(scores),
                                      ax=ax, linewidth=1.5, color=color)
    return area, [ax, lines]
def plot_all_combined():
    """Plot combined power-law strength distributions for the three RCN
    periods (2006-2009, 2010-2012, 2006-2012) into one EPS figure.
    """
    import matplotlib.pyplot as plt

    separator = "================================================================"

    budgetYears = range(2006, 2010)
    logger.info(separator)
    logger.info(budgetYears)
    wg_2006_2009, degree_2006_2009, strength_2006_2009 = get_data(budgetYears)

    budgetYears = range(2010, 2013)
    logger.info(separator)
    logger.info(budgetYears)
    wg_2010_2012, degree_2010_2012, strength_2010_2012 = get_data(budgetYears)

    budgetYears = range(2006, 2013)
    logger.info(separator)
    logger.info(budgetYears)
    wg_2006_2012, degree_2006_2012, strength_2006_2012 = get_data(budgetYears)

    figure = plt.figure(figsize=(12, 6))
    panels = ((1, strength_2006_2009, 'RCN 2006 - 2009'),
              (2, strength_2010_2012, 'RCN 2010 - 2012'),
              (3, strength_2006_2012, 'RCN 2006 - 2012'))
    for data_inst, data, units in panels:
        plot_powerlaw_combined(data, data_inst, figure, units)

    figure.subplots_adjust(left=None, bottom=None, right=None, top=None,
                           wspace=.3, hspace=.2)
    figure.savefig('%s/figures/powerlaw_degree_distribution.eps' % (root_folder()),
                   bbox_inches='tight')
def rwr_scores(budgetYears):
    """Compute random-walk-with-restart scores for every vertex pair of the
    simplified network and dump the significant ones (> 0.001) to JSON.

    Output file: '<root>/data/networks/<start>-<end>-rwr.json', mapping
    'name1,name2' -> score.
    """
    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]
    logger.info('---------------- %s-%s -------------------' % (startBudgetYear,
                                                                endBudgetYear))
    network = load_network_for(budgetYears)
    g = network.g.copy()
    ResearchCollaborationNetwork.simplify(g)
    logger.info(g.summary())

    # Enumerate every unordered vertex pair once as an 'i,j' key (i < j).
    # NOTE: the original also built the full adjacency matrix here, but it
    # was never used — an O(n^2) allocation removed as dead code.
    links = []
    m = len(g.vs)
    for i in range(m):
        for j in range(i + 1, m):
            links.append('%d,%d' % (i, j))

    # avoid shadowing this function's own name with the local result
    scores = pgrank.rwr_score(g, links)
    rwrs = {}
    for link, score in scores.items():
        v = link.split(',')
        v1 = int(v[0])
        v2 = int(v[1])
        key = '%s,%s' % (g.vs[v1]['name'], g.vs[v2]['name'])
        # keep only scores above a small significance threshold
        if float(score) > 0.001:
            rwrs[key] = score

    filename = '%s/data/networks/%d-%d-rwr.json' % (root_folder(),
                                                    startBudgetYear,
                                                    endBudgetYear)
    with open(filename, 'w') as out:
        json.dump(rwrs, out)
def rwr_scores(budgetYears):
    """Score all vertex pairs of the simplified network with random walk with
    restart and write scores above 0.001 to a JSON file keyed by vertex names.
    """
    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]
    logger.info('---------------- %s-%s -------------------' % (startBudgetYear,
                                                                endBudgetYear))
    network = load_network_for(budgetYears)
    g = network.g.copy()
    ResearchCollaborationNetwork.simplify(g)
    logger.info(g.summary())

    # Build 'i,j' keys for every unordered vertex pair. (The unused O(n^2)
    # adjacency-matrix conversion from the original was removed — nothing
    # below read it.)
    m = len(g.vs)
    links = ['%d,%d' % (i, j) for i in range(m) for j in range(i + 1, m)]

    scores = pgrank.rwr_score(g, links)  # renamed: don't shadow the function
    rwrs = {}
    for link, score in scores.items():
        v1_str, v2_str = link.split(',')
        key = '%s,%s' % (g.vs[int(v1_str)]['name'], g.vs[int(v2_str)]['name'])
        if float(score) > 0.001:  # drop negligible scores
            rwrs[key] = score

    filename = '%s/data/networks/%d-%d-rwr.json' % (
        root_folder(), startBudgetYear, endBudgetYear)
    with open(filename, 'w') as out:
        json.dump(rwrs, out)
def plot_auc(startBudgetYear, endBudgetYear, task, ax, color):
    """Load saved ROC samples for one (period, task) pair and draw the curve.

    Reads '<root>/data/<start>-<end>.<task>.roc.samples.npy' containing
    (key, label, score) rows and plots them via roc.roc_curve on *ax*.

    Returns (area, [ax, lines]) from roc.roc_curve.
    """
    filename = '%s/data/%s-%s.%s.roc.samples.npy' % (
        root_folder(), startBudgetYear, endBudgetYear, task)
    roc_samples = np.load(filename)
    labels = []
    scores = []
    for k, label, score in roc_samples:
        # np.float is gone in NumPy >= 1.24; builtin float is equivalent here.
        labels.append(float(label))
        scores.append(float(score))
    area, [ax, lines] = roc.roc_curve(labels=np.array(labels),
                                      scores=np.array(scores),
                                      ax=ax, linewidth=1.5, color=color)
    return area, [ax, lines]
def test():
    """Draw the per-user and per-network ROC panels for the three RCN periods
    and save the two-panel figure to figures/roc-curve.eps.
    """
    from matplotlib import rc
    rc('text', usetex=False)
    rc('font', family='serif')

    fig = plt.figure(figsize=(8, 16))

    task = 'per_user'
    panel = fig.add_subplot(1, 2, 1)
    roc.roc_curve_init(panel)
    area, [panel, b] = plot_auc(2006, 2009, task, panel, 'b')
    area, [panel, g] = plot_auc(2010, 2012, task, panel, 'g')
    area, [panel, r] = plot_auc(2006, 2012, task, panel, 'r')
    panel.set_title('Per-user Model')

    task = 'per_network'
    panel = fig.add_subplot(1, 2, 2)
    roc.roc_curve_init(panel)
    area, [panel, b] = plot_auc(2006, 2009, task, panel, 'b')
    area, [panel, g] = plot_auc(2010, 2012, task, panel, 'g')
    area, [panel, r] = plot_auc(2006, 2012, task, panel, 'r')
    panel.set_title('Per-network Model')

    fig.subplots_adjust(left=None, bottom=None, right=None, top=None,
                        wspace=.3, hspace=.2)
    # shared legend below the panels
    plt.legend((b, g, r),
               ('RCN (2006 - 2009)', 'RCN (2010 - 2012)', 'RCN (2006 - 2012)'),
               bbox_to_anchor=(-0.15, -0.4), loc='center', prop={'size': 12})
    plt.savefig('%s/figures/roc-curve.eps' % (root_folder()),
                bbox_inches='tight', dpi=600)
def update_graphml(budgetYears):
    """Annotate each vertex of the stored network with its centrality-leader
    tier (1 = best) and rewrite the graphml file on disk.
    """
    first_year = budgetYears[0]
    last_year = budgetYears[-1]

    network = load_network_for(budgetYears)
    network.g.vs['centrality_leader'] = 0  # clear any previous annotation

    component = ResearchCollaborationNetwork.largest_component(network.g.copy())
    topK = 50
    candidates, rankings = cl.centrality_leaders(component)

    for tier in range(min(topK, len(rankings))):
        logger.info('tier: %d' % tier)
        for idx in list(rankings[tier]):
            name = component.vs[candidates[idx]]['name']
            # look the vertex up by name in the full (unpruned) network
            match = network.g.vs.select(name_eq=name)
            match['centrality_leader'] = tier + 1

    filename = '%s/data/networks/%d-%d.graphml' % (root_folder(),
                                                   first_year, last_year)
    network.write(filename)
def per_network(budgetYears):
    """Evaluate RWR link prediction over the whole network with k-fold CV.

    Samples 20 random vertices, treats their (non-)edges as the prediction
    universe, and for each of 10 folds removes the validation edges, scores
    the held-out pairs with random walk with restart, and accumulates AUC and
    ap@{3,5,10}. ROC samples are saved to
    '<root>/data/<start>-<end>.per_network.roc.samples.npy'.
    """
    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]
    logger.info('---------------- %s-%s -------------------' % (startBudgetYear,
                                                                endBudgetYear))
    network = load_network_for(budgetYears)
    g = network.g.copy()
    ResearchCollaborationNetwork.simplify(g)
    logger.info(g.summary())

    # randomly pick 20 users; list() so random.shuffle also works on
    # Python 3, where range() is not a mutable sequence
    candidates = list(range(len(g.vs)))
    shuffle(candidates)
    candidates = candidates[:20]

    adj = np.array(g.get_adjacency(igraph.GET_ADJACENCY_BOTH).data)
    m, _ = adj.shape
    nonobservedlinks = {}
    nonobserved_actual_edges = []
    nonobserved_nonexist_edges = []
    for i in range(m):
        # undirected graph, so only keep pairs whose source vertex is sampled
        if i not in candidates:
            continue
        for j in range(i + 1, m):
            key = '%d,%d' % (i, j)
            nonobservedlinks[key] = adj[i, j]
            if adj[i, j] > 0:
                nonobserved_actual_edges.append(key)
            else:
                nonobserved_nonexist_edges.append(key)

    auc = 0.0
    apk = {3: 0.0, 5: 0.0, 10: 0.0}
    kfold = 10
    cnt = 0
    roc_samples = []
    # positive and negative pairs are folded independently so every fold has
    # both classes
    for ((es_p_training, es_p_validation),
         (es_m_training, es_m_validation)) in zip(
            utils.k_fold_cross_validation(list(nonobserved_actual_edges), kfold),
            utils.k_fold_cross_validation(list(nonobserved_nonexist_edges), kfold)):
        logger.info('--------iteration %d-------------' % cnt)
        logger.info('xxxxxxxxxxxxxxxxxxxxxxxx')
        logger.info('positive training: %d' % len(es_p_training))
        logger.info('positive validation: %d' % len(es_p_validation))
        logger.info('------------------------')
        logger.info('negative training: %d' % len(es_m_training))
        logger.info('negative validation: %d' % len(es_m_validation))

        validation = es_p_validation + es_m_validation

        # Build the training graph by deleting every validation edge that
        # actually exists in the full graph. (The unused 'training' list from
        # the original was removed — RWR only needs the pruned graph.)
        trainingG = g.copy()
        edges_2_delete = []
        for link in validation:
            v = link.split(',')
            v1 = int(v[0])
            v2 = int(v[1])
            eId = trainingG.get_eid(v1, v2, directed=False, error=False)
            if eId != -1:
                edges_2_delete.append(eId)
        trainingG.delete_edges(edges_2_delete)

        rwr = pgrank.rwr_score(trainingG, validation)
        actual = []
        posterior = []
        actual_edges = []
        for k in validation:
            actual.append(nonobservedlinks[k])
            if nonobservedlinks[k] > 0:
                actual_edges.append(k)
            posterior.append(rwr[k])
            roc_samples.append((k, nonobservedlinks[k], rwr[k]))

        auc_ = benchmarks.auc(actual, posterior)
        auc += auc_
        # iteritems() is Python-2-only; iterating the keys works on both and
        # we only mutate values, never the key set
        for topK in apk:
            predictedIndexes = sorted(range(len(posterior)), reverse=True,
                                      key=lambda k: posterior[k])[:topK]
            predicted = np.array(validation)[predictedIndexes]
            apk_ = benchmarks.apk(actual_edges, predicted, topK)
            apk[topK] += apk_
        cnt += 1

    # see http://www.machinedlearnings.com/2012/06/thought-on-link-prediction.html
    logger.info('auc: %f' % (auc / kfold))
    for topK in apk:
        logger.info('ap@%d: %f' % (topK, (apk[topK] / kfold)))
    np.save('%s/data/%s-%s.per_network.roc.samples.npy' % (root_folder(),
                                                           startBudgetYear,
                                                           endBudgetYear),
            np.array(roc_samples))
def per_candidate(budgetYears):
    """Evaluate RWR link prediction per candidate vertex with k-fold CV.

    Picks up to 10 random vertices of degree > 15; for each, its row of
    (non-)edges is split into 5 folds, validation edges are removed, held-out
    pairs are scored with random walk with restart, and AUC plus ap@{3,5,10}
    are accumulated per candidate and globally (map@k). ROC samples go to
    '<root>/data/<start>-<end>.per_user.roc.samples.npy'.
    """
    startBudgetYear = budgetYears[0]
    endBudgetYear = budgetYears[-1]
    logger.info('---------------- %s-%s -------------------' % (startBudgetYear,
                                                                endBudgetYear))
    network = load_network_for(budgetYears)
    g = network.g.copy()
    ResearchCollaborationNetwork.simplify(g)
    logger.info(g.summary())

    adj = np.array(g.get_adjacency(igraph.GET_ADJACENCY_BOTH).data)
    m, _ = adj.shape

    # candidates: up to 10 random vertices with degree above 15
    cNodes = g.vs.select(_degree_gt=15)
    candidates = [cNode.index for cNode in cNodes]
    shuffle(candidates)
    candidates = candidates[:10]

    total_auc = 0.0
    topKs = (3, 5, 10)
    # BUG FIX: the original bound precision_at_k, mapk AND the per-candidate
    # apk to the SAME dict object, so every update was applied twice to one
    # shared accumulator and per-candidate ap@k never reset. Keep the global
    # (mapk) and per-candidate (apk) accumulators as separate dicts.
    mapk = {k: 0.0 for k in topKs}
    kfold = 5
    roc_samples = []
    progress = len(candidates)

    # for each candidate we do training and testing...
    for c in candidates:
        logger.info('%d-----------------------' % progress)
        nonobservedlinks = {}
        nonobserved_actual_edges = []
        nonobserved_nonexist_edges = []
        # undirected graph: only pairs with this candidate as the source
        for j in range(m):
            key = '%d,%d' % (c, j)
            nonobservedlinks[key] = adj[c, j]
            if adj[c, j] > 0:
                nonobserved_actual_edges.append(key)
            else:
                nonobserved_nonexist_edges.append(key)

        cnt = 0
        auc = 0.0
        apk = {k: 0.0 for k in topKs}  # fresh accumulator per candidate
        for ((es_p_training, es_p_validation),
             (es_m_training, es_m_validation)) in zip(
                utils.k_fold_cross_validation(list(nonobserved_actual_edges), kfold),
                utils.k_fold_cross_validation(list(nonobserved_nonexist_edges), kfold)):
            validation = es_p_validation + es_m_validation

            # training graph = full graph minus the validation edges
            trainingG = g.copy()
            edges_2_delete = []
            for link in validation:
                v = link.split(',')
                v1 = int(v[0])
                v2 = int(v[1])
                eId = trainingG.get_eid(v1, v2, directed=False, error=False)
                if eId != -1:
                    edges_2_delete.append(eId)
            trainingG.delete_edges(edges_2_delete)

            rwr = pgrank.rwr_score(trainingG, validation)
            # items() instead of the Python-2-only iteritems()
            for k, rwr_score in rwr.items():
                if rwr_score > 1:
                    logger.info('overflow? rwr_score: %0.2f' % (rwr_score))

            actual = []
            posterior = []
            actual_edges = []
            for k in validation:
                actual.append(nonobservedlinks[k])
                if nonobservedlinks[k] > 0:
                    actual_edges.append(k)
                posterior.append(rwr[k])
                roc_samples.append((k, nonobservedlinks[k], rwr[k]))

            auc_ = benchmarks.auc(actual, posterior)
            auc += auc_
            total_auc += auc_
            for topK in topKs:
                predictedIndexes = sorted(range(len(posterior)), reverse=True,
                                          key=lambda k: posterior[k])[:topK]
                predicted = np.array(validation)[predictedIndexes]
                apk_ = benchmarks.apk(actual_edges, predicted, topK)
                apk[topK] += apk_
                mapk[topK] += apk_
            cnt += 1

        logger.info('%d: auc: %f' % (c, float(auc) / kfold))
        for topK in topKs:
            logger.info('%d: ap@%d: %f' % (c, topK, (apk[topK] / kfold)))
        progress -= 1

    logger.info('auc: %f' % (float(total_auc) / (kfold * len(candidates))))
    for topK in topKs:
        logger.info('map@%d: %f' % (topK, (mapk[topK] / (kfold * len(candidates)))))
    np.save('%s/data/%s-%s.per_user.roc.samples.npy' % (root_folder(),
                                                        startBudgetYear,
                                                        endBudgetYear),
            np.array(roc_samples))