def extract_backbone(flavor_network, alpha):
    """
    Build a pruned copy of the flavor network that keeps only the edges whose
    weight passes the significance test below.

    :param flavor_network: full flavor ingredient network; its edges are read
        through their 'weight' field and its vertices through their 'deg' field
    :param alpha: multiplier applied to a node's sigma when forming that
        node's weight threshold
    :return: the pruned SGraph (same vertices, significant edges only)
    """

    def degree_count_fn(src, edge, dst):
        """
        triple_apply callback that adds one to 'deg' on both endpoints
        :param src: source vertex of the edge
        :param edge: the edge itself (returned untouched)
        :param dst: destination vertex of the edge
        :return: the (src, edge, dst) triple
        """
        src['deg'] += 1
        dst['deg'] += 1
        return src, edge, dst

    def compute_node_moments(node_k):
        # Mean and sigma used for the threshold of a node of degree node_k.
        mean = 2 * node_k / (node_k + 1)
        first_term = (20 + 4 * node_k) / ((node_k + 1) * (node_k + 2) *
                                          (node_k + 3))
        second_term = 4 / (node_k + 1)**2
        sigma = sqrt(node_k**2 * (first_term - second_term))
        return mean, sigma

    def test_for_significance(edge, weights_lookup, alpha):
        # weights_lookup maps vertex id -> degree; an edge is kept when its
        # weight reaches the threshold of at least one of its endpoints.
        observed = edge['weight']
        dst_degree = float(weights_lookup[edge['__dst_id']])
        src_degree = float(weights_lookup[edge['__src_id']])
        dst_mean, dst_sigma = compute_node_moments(dst_degree)
        src_mean, src_sigma = compute_node_moments(src_degree)
        if observed >= abs(dst_mean + alpha * dst_sigma):
            return True
        return observed >= abs(src_mean + alpha * src_sigma)

    # Vertices with a missing 'deg' start from 0 before degrees are counted.
    new_node_list = flavor_network.vertices.fillna('deg', 0)
    all_edges = flavor_network.get_edges()

    graph_with_degrees = SGraph().add_vertices(new_node_list)
    graph_with_degrees = graph_with_degrees.add_edges(all_edges)
    graph_with_degrees = graph_with_degrees.triple_apply(
        degree_count_fn, mutated_fields=['deg'])

    vertex_table = graph_with_degrees.vertices.to_dataframe()
    degree_by_id = vertex_table.set_index('__id').to_dict()['deg']

    # Each significant edge is re-fetched as an Edge object (format='list')
    # so that it can be handed to add_edges below.
    significant_edges = [
        graph_with_degrees.get_edges(src_ids=edge['__src_id'],
                                     dst_ids=edge['__dst_id'],
                                     format='list')[0]
        for edge in graph_with_degrees.get_edges()
        if test_for_significance(edge, degree_by_id, alpha)
    ]

    # The pruned graph reuses the zero-filled vertex table, not the counted
    # degrees.
    return SGraph().add_vertices(new_node_list).add_edges(significant_edges)
def showPath(self, highlight=None):
    """Load the graph from the instance's CSV files and display it.

    Edges are read from ``self.edgesFn`` and vertices from
    ``self.verticesFn``; the graph is shown with vertex label 'attributes'
    and edge label 'relation', then the call sleeps for 20 seconds.

    :param highlight: forwarded unchanged to ``SGraph.show``
    """
    edges = SFrame.read_csv(self.edgesFn)
    vertices = SFrame.read_csv(self.verticesFn)
    graph = SGraph(vertices=vertices,
                   edges=edges,
                   vid_field='name',
                   src_field='src',
                   dst_field='dst')

    show_options = dict(vlabel='attributes',
                        vlabel_hover=False,
                        elabel='relation',
                        highlight=highlight,
                        arrows=True)
    graph.show(**show_options)

    # NOTE(review): presumably keeps the process alive long enough for the
    # visualisation to be viewed -- confirm.
    sleep(20)
def get_subgraph(self, ids, radius=1, full_subgraph=True):
    """Return the neighbourhood of ``ids`` as a new GlGraph.

    Vertices are collected by following outgoing edges ``radius`` times,
    starting from ``ids``. When ``full_subgraph`` is True, the result also
    contains every edge (among those leaving a collected vertex) that has an
    endpoint in ``ids``.

    :param ids: iterable of vertex ids to start from
    :param radius: number of outgoing-edge expansion steps
    :param full_subgraph: when True, add the edges described above; otherwise
        the returned graph contains vertices only
    :return: GlGraph wrapping the extracted SGraph
    """
    verts = ids

    # find the vertices within radius by following outgoing edges
    for _ in range(radius):
        edges_out = self._graph.get_edges(src_ids=verts)
        reached = list(edges_out['__src_id']) + list(edges_out['__dst_id'])
        verts = list(set(reached))

    # make a new graph to return and add the vertices
    g = SGraph()
    g = g.add_vertices(self._graph.get_vertices(verts), vid_field='__id')

    # add the requested edge set
    if full_subgraph is True:
        df_induced = self._graph.get_edges(src_ids=verts)
        # group by every column to drop duplicate edge rows
        df_induced = df_induced.groupby(df_induced.column_names(), {})

        verts_sa = SArray(list(ids))
        edges = df_induced.filter_by(verts_sa, "__src_id")
        # SFrame.append returns a new SFrame instead of mutating in place, so
        # the result must be kept; otherwise the edges arriving at `ids` are
        # silently lost.
        edges = edges.append(df_induced.filter_by(verts_sa, "__dst_id"))
        # An edge with both endpoints in `ids` matched both filters; collapse
        # it back to a single row.
        edges = edges.groupby(edges.column_names(), {})

        # `edges` only exists in this branch, so the edges are added here.
        g = g.add_edges(edges, src_field='__src_id', dst_field='__dst_id')

    return GlGraph(is_directed=self.is_directed, graph_obj=g)
def create_initial_bayesian_network():
    '''
    Load the saved data graph, build a Bayesian-network graph whose vertices
    are the distinct (destination id, relation) pairs of the data graph, add
    20 edges from pseudo-randomly chosen features (fixed seed) to the feature
    'E8498', and score the result with get_bic_score.

    NOTE(review): the function returns the data graph `g`, not the network
    `bn` it builds, and the BIC score is discarded -- confirm this is
    intended.
    '''
    g = load_sgraph('data_graph')

    features = g.get_edges()[['__dst_id', 'relation']].unique()
    # NOTE(review): the result of rename is not reassigned, so this relies on
    # SFrame.rename mutating `features` in place -- confirm.
    features.rename({'__dst_id': 'feature_id', 'relation': 'feature_type'})

    bn = SGraph().add_vertices(features, vid_field='feature_id')
    n_features = features.num_rows()
    n_patients = g.get_edges()['__src_id'].unique().size()

    random.seed(1234)
    dst = 'E8498'
    for _ in range(20):
        pick = random.randint(0, n_features - 1)
        src = features['feature_id'][pick]
        bn = bn.add_edges(Edge(src, dst))
        print("Added edge between " + src + " and " + dst)

    get_bic_score(g, bn, n_patients)
    return g
def create_network_features(
        Returns,
        Network,
        name='Sales',
        Start=9,
        End=12,
        path_template='H:\\Ashwin\\dta\\bogusdealers\\NetworkFeatures{}{}.csv'):
    """Compute per-quarter graph features and export them as CSV files.

    For every quarter in ``range(Start, End)`` (quarter 12 is always skipped)
    the rows of ``Returns`` and ``Network`` for that quarter are turned into
    an SGraph, and pagerank, triangle count, in-degree and out-degree columns
    are attached to the vertex table, which is then exported.

    :param Returns: vertex table; needs 'TaxQuarter' and 'Mtin' columns
    :param Network: edge table; needs 'TaxQuarter', 'Mtin' and
        'SellerBuyerTin' columns
    :param name: label inserted into the output file name
    :param Start: first quarter processed (inclusive)
    :param End: quarter at which processing stops (exclusive)
    :param path_template: output path with two ``{}`` placeholders, filled
        with ``name`` and the quarter; the default reproduces the previously
        hard-coded location
    """
    for quarter in range(Start, End):
        if quarter == 12:
            continue

        ReturnsX = Returns[Returns['TaxQuarter'] == quarter]
        NetworkX = Network[Network['TaxQuarter'] == quarter]
        g = SGraph(vertices=ReturnsX,
                   edges=NetworkX,
                   vid_field='Mtin',
                   src_field='Mtin',
                   dst_field='SellerBuyerTin')

        # NOTE(review): the column assignments below assume each result
        # graph lists its vertices in the same order as `g` -- confirm.
        pr = graphlab.pagerank.create(g)
        g.vertices['pagerank'] = pr['graph'].vertices['pagerank']

        tc = graphlab.triangle_counting.create(g)
        g.vertices['triangle_count'] = tc['graph'].vertices['triangle_count']

        deg_graph = degree_counting.create(g)['graph']
        g.vertices['in_degree'] = deg_graph.vertices['in_degree']
        g.vertices['out_degree'] = deg_graph.vertices['out_degree']

        g.vertices.export_csv(path_template.format(name, quarter))
def SSSP():
    """Run single-source shortest path on the BerkStan edge list.

    The tab-separated, header-less file is read as three integer columns
    (X1 = source, X2 = destination, X3 = weight); shortest paths are computed
    from vertex 0 and the model's summary() is called.
    """
    url = '/home/gengl/Datasets/SSSP/BerkStan/edge.txt'
    edge_frame = SFrame.read_csv(url,
                                 delimiter='\t',
                                 header=False,
                                 column_type_hints=[int, int, int])
    empty_graph = SGraph()
    graph = empty_graph.add_edges(edge_frame, src_field='X1', dst_field='X2')
    model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    model.summary()
def CC():
    """Run connected components on the BerkStan edge list.

    The tab-separated, header-less file is read as two integer columns
    (X1 = source, X2 = destination); the model is created with verbose output
    and its summary() is called.
    """
    url = '/home/gengl/Datasets/CC/BerkStan/edge.txt'
    edge_frame = SFrame.read_csv(url,
                                 delimiter='\t',
                                 header=False,
                                 column_type_hints=[int, int])
    empty_graph = SGraph()
    graph = empty_graph.add_edges(edge_frame, src_field='X1', dst_field='X2')
    model = connected_components.create(graph, verbose=True)
    model.summary()
def get_graph(X_train, k):
    """Build the user/movie rating graph and print how long it took.

    Adds 'uid' and 'mid' columns to ``X_train`` (the input frame is modified)
    by applying ``prefix('u')`` / ``prefix('m')`` to 'userId' / 'movieId',
    and uses them as edge endpoints.

    :param X_train: ratings table with 'rating', 'userId', 'movieId' columns
    :param k: passed to get_vertices and used to scale the initial factor
    :return: SGraph holding the vertices from get_vertices plus one edge per
        row of ``X_train``
    """
    started = datetime.now()

    factor0 = (X_train['rating'].mean() / k / 0.25)**0.5
    vertices = get_vertices(k, factor0)

    X_train['uid'] = X_train['userId'].apply(prefix('u'))
    X_train['mid'] = X_train['movieId'].apply(prefix('m'))

    sg = SGraph().add_vertices(vertices, vid_field='__id')
    sg = sg.add_edges(X_train, src_field='uid', dst_field='mid')

    print('get_graph %s' % (datetime.now() - started))
    return sg
def PageRank():
    """Run PageRank on the BerkStan edge list.

    The tab-separated, header-less file is read as two integer columns
    (X1 = source, X2 = destination); the model is created with the options
    listed below and its summary() is called.
    """
    url = '/home/gengl/Datasets/PageRank/BerkStan/edge.txt'
    edge_frame = SFrame.read_csv(url,
                                 delimiter='\t',
                                 header=False,
                                 column_type_hints=[int, int])
    empty_graph = SGraph()
    graph = empty_graph.add_edges(edge_frame, src_field='X1', dst_field='X2')

    options = dict(reset_probability=0.2,
                   threshold=0.0001,
                   max_iterations=1000,
                   _distributed=True)
    model = pagerank.create(graph, **options)
    model.summary()
def PageRank():
    """Run PageRank on the clueweb_20M edge-pair file.

    The tab-separated, header-less file is read as two integer columns
    (X1 = source, X2 = destination); the model is created with the options
    listed below and its summary() is called.
    """
    url = '/clueweb/PageRank/clueweb_20M/edge_pair.txt'
    edge_frame = SFrame.read_csv(url,
                                 delimiter='\t',
                                 header=False,
                                 column_type_hints=[int, int])
    empty_graph = SGraph()
    graph = empty_graph.add_edges(edge_frame, src_field='X1', dst_field='X2')

    options = dict(reset_probability=0.2,
                   threshold=0.000000000001,
                   max_iterations=42,
                   _distributed=True)
    model = pagerank.create(graph, **options)
    model.summary()
def showPath(self, highlight=None):
    """Load the graph from the instance's CSV files and display it.

    Edges are read from ``self.edgesFn`` and vertices from
    ``self.verticesFn``; the graph is shown with vertex label 'id' and edge
    label 'relation', then the call sleeps for 10 seconds.

    :param highlight: forwarded unchanged to ``SGraph.show``
    """
    edges = SFrame.read_csv(self.edgesFn)
    vertices = SFrame.read_csv(self.verticesFn)
    graph = SGraph(vertices=vertices,
                   edges=edges,
                   vid_field='name',
                   src_field='src',
                   dst_field='dst')

    show_options = dict(vlabel='id',
                        elabel='relation',
                        highlight=highlight,
                        arrows=True)
    graph.show(**show_options)

    # NOTE(review): presumably keeps the process alive long enough for the
    # visualisation to be viewed -- confirm.
    sleep(10)
def MP_graph(D, x):
    """Iterative pursuit over a bipartite SGraph built from D and x.

    Builds N "x" vertices and M "z" vertices (ids N..N+M-1), connects every
    x vertex to every z vertex, runs ``num_iter`` rounds of triple_apply and
    returns an (M, 1) array filled from the z vertices' "value" entries.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source -- confirm against the original file.
    NOTE(review): several statements below look ineffective (see inline
    notes); the function may not compute what its name suggests -- confirm
    before relying on it.

    :param D: 2-D array; only ``D.shape`` and ``D[i][j]`` are used
    :param x: indexable with N entries, used as initial "value" of x vertices
    :return: numpy array of shape (M, 1)
    """
    N, M = D.shape
    z = np.zeros((M, 1))
    z_temp = np.zeros(M)
    # NOTE(review): r is assigned here but never read afterwards.
    r = np.copy(x)
    num_iter = 30
    # Create bipartite graph
    G = SGraph()
    x_vertices = [Vertex(i) for i in xrange(N)]
    z_vertices = [Vertex(j + N) for j in xrange(M)]
    D_edges = [Edge(i, j) for i in xrange(N) for j in xrange(N, N + M)]
    # NOTE(review): the return values of add_vertices/add_edges are not
    # assigned; if these calls return a new graph rather than mutating G,
    # G stays empty. z_vertices is also passed as the second positional
    # argument rather than as additional vertices -- confirm.
    G.add_vertices(x_vertices, z_vertices)
    G.add_edges(D_edges)
    # Attributes are set on the local Vertex objects after the add calls.
    for i in xrange(N):
        x_vertices[i]["value"] = x[i]
    for j in xrange(M):
        z_vertices[j]["value"] = 0.0
        z_vertices[j]["dummy"] = 0.0
        z_vertices[j]["max"] = 0.0
    for i in xrange(N):
        for j in xrange(M):
            # NOTE(review): this builds a fresh Edge from two Vertex objects
            # and sets "value" on that temporary; it is not stored anywhere.
            Edge(x_vertices[i], z_vertices[j])["value"] = D[i][j]

    # NOTE(review): none of the three callbacks returns the (s, e, t)
    # triple -- confirm that triple_apply accepts that.
    def inner_prod(s, e, t):
        # Accumulate edge value times source value into the target's "dummy".
        t["dummy"] += e["value"] * s["value"]

    def update_z(s, e, t):
        # Only targets with a non-zero "max" are updated.
        if not t["max"] == 0.0:
            t["value"] += e["value"] * s["value"]

    # NOTE(review): compute_residual is defined but never called here.
    def compute_residual(s, e, t):
        if not t["max"] == 0.0:
            s["value"] -= t["value"] * e["value"]

    for itr in xrange(num_iter):
        # Compute inner products with r
        print "NUM ITR = ", itr
        G = G.triple_apply(inner_prod, mutated_fields=["value", "dummy"])
        # Reads "dummy" from the local Vertex objects, not from G.
        for i in xrange(M):
            z_vertices[i]["max"] = 0.0
            z_temp[i] = z_vertices[i]["dummy"]
        # Mark the z vertex with the largest "dummy" entry.
        max_pos = np.argmax(z_temp)
        z_vertices[max_pos]["max"] = z_temp[max_pos]
        G = G.triple_apply(update_z, mutated_fields=["max", "value"])
    for i in xrange(M):
        z[i] = z_vertices[i]["value"]
    return z
def showPath(self, highlight=None):
    """Load the graph from the instance's CSV files and print the load time.

    Edges are read from ``self.edgesFn`` and vertices from
    ``self.verticesFn``. The graph is built but not displayed in this
    variant; only the elapsed time is printed.

    :param highlight: accepted but not used by this variant
    """
    started_at = datetime.datetime.now()

    edges = SFrame.read_csv(self.edgesFn)
    vertices = SFrame.read_csv(self.verticesFn)
    graph = SGraph(vertices=vertices,
                   edges=edges,
                   vid_field='name',
                   src_field='src',
                   dst_field='dst')

    finished_at = datetime.datetime.now()
    print(finished_at - started_at)
def SSSP():
    """Run single-source shortest path on the Google edge list and dump it.

    The tab-separated, header-less file is read as three integer columns
    (X1 = source, X2 = destination, X3 = weight); shortest paths are computed
    from vertex 0 and the model's summary() is called. For every vertex id in
    ``range(0, 875713)`` the last element of ``get_path(vid)`` is written as
    one line to '/home/gengl/sssp_graphlab'. Vertices whose path lookup fails
    are skipped.
    """
    url = '/home/gengl/Datasets/SSSP/Google/edge.txt'
    data = SFrame.read_csv(url,
                           delimiter='\t',
                           header=False,
                           column_type_hints=[int, int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    sp_model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    sp_model.summary()

    with open('/home/gengl/sssp_graphlab', 'w') as fo:
        for vid in range(0, 875713):
            # Best-effort lookup: a failing vertex is skipped. Catching
            # Exception (not a bare except) keeps Ctrl-C and SystemExit
            # working during this long loop.
            try:
                result_pair = sp_model.get_path(vid)
                line = str(result_pair[-1]) + '\n'
            except Exception:
                continue
            # Written outside the try so that real I/O errors are not hidden.
            fo.write(line)
def showPath(self, highlight=None):
    """Load the graph from the instance's CSV files and display it.

    Edges are read from ``self.edgesFn`` and vertices from
    ``self.verticesFn``; the graph is shown with vertex label 'id' (hover
    enabled) and edge label 'relation', then the call sleeps for 30 seconds.

    :param highlight: forwarded unchanged to ``SGraph.show``
    """
    edges = SFrame.read_csv(self.edgesFn)
    vertices = SFrame.read_csv(self.verticesFn)
    graph = SGraph(vertices=vertices,
                   edges=edges,
                   vid_field='name',
                   src_field='src',
                   dst_field='dst')

    show_options = dict(vlabel='id',
                        elabel='relation',
                        vlabel_hover=True,
                        highlight=highlight,
                        arrows=True)
    graph.show(**show_options)

    # NOTE(review): presumably keeps the process alive long enough for the
    # visualisation to be viewed -- confirm.
    sleep(30)
def build_weighted_graph(ing_comp_dict):
    """
    builds the weighted undirected graph that is the flavor network

    Every ingredient becomes exactly one vertex with attribute 'deg' = 0.
    Every pair of ingredients sharing at least one compound gets a single
    edge whose 'weight' is the number of shared compounds. Pairs are visited
    in the dictionary's iteration order, with the earlier ingredient as the
    edge source.

    :param ing_comp_dict: ingredient:compound dictionary (each value is an
        iterable of compounds)
    :return: SGraph that represents the flavor network
    """
    # Local import so the block does not depend on the file's import list.
    from itertools import combinations

    ingredients = list(ing_comp_dict)
    # Build each compound set once instead of once per pair.
    compound_sets = {ingr: set(ing_comp_dict[ingr]) for ingr in ingredients}

    # One vertex per ingredient; no attribute-less duplicates, so 'deg' is
    # never left missing.
    vert_list = [Vertex(ingr, attr={'deg': 0}) for ingr in ingredients]

    edge_list = []
    for node_1_ingr, node_2_ingr in combinations(ingredients, 2):
        shared = compound_sets[node_1_ingr] & compound_sets[node_2_ingr]
        weight = len(shared)
        if weight > 0:
            edge_list.append(
                Edge(node_1_ingr, node_2_ingr, attr={'weight': weight}))

    flav_network = SGraph()
    flav_network = flav_network.add_vertices(vert_list)
    flav_network = flav_network.add_edges(edge_list)
    return flav_network
# In[33]: edges = gl.SFrame.read_csv(os.path.join(csvDataFolder, '001008_0.csv'), header=False, delimiter=',', column_type_hints=int) edegs_n # In[42]: from graphlab import SGraph, Vertex g = SGraph().add_vertices([ Vertex('cat', {'fluffy': 1}), Vertex('dog', { 'fluffy': 1, 'woof': 1 }), Vertex('hippo', {}) ]) g.vertices.save(os.path.join(resultFolder, 'test_graph_vertices'), format='csv') # In[59]: #sframe reading start_r = time.time() # csvFiles = os.listdir(csvDataFolder) csvFiles = [ '000000_0.csv', '000001_0.csv', '000002_0.csv',
path = sys.argv[1] else: path = "./data/" verbose = False vertexFiles = [ "City", "Country", "Region", "Advisor", "Category", "Founder", "FundingRound", "HQ", "keywords", "Member", "Office", "organizations", "PrimaryImage", "TeamMember", "Website", "companies_acquired_by_sap" ] edgesFiles = [ "GeoInformation", "acquisitions", "categories_keywords_edges", "investments", "keywords_descriptions_edges", "keywords_webpages_edges", "relationships", "companies_acquired_by_sap_edges" ] g = SGraph() for f in vertexFiles: content = SFrame.read_csv(path + f + '.csv', na_values='null', verbose=verbose) if 'path' in content.column_names(): g = g.add_vertices(content, vid_field='path') elif 'url' in content.column_names(): g = g.add_vertices(content, vid_field='url') else: print "Unknown vid field: ", content.column_names() sys.exit() for f in edgesFiles: content = SFrame.read_csv(path + f + '.csv',
#targets = ['James Bond', 'Moneypenny'] #subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True) #subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True) #from graphlab import SGraph, Vertex, Edge #g = SGraph() #verts = [Vertex(0, attr={'breed': 'labrador'}), # Vertex(1, attr={'breed': 'labrador'}), # Vertex(2, attr={'breed': 'vizsla'})] #g = g.add_vertices(verts) #g = g.add_edges(Edge(1, 2)) #print g from graphlab import SFrame, SGraph edge_data = SFrame.read_csv( 'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv') vertex_data = SFrame.read_csv( 'http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv') g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name', src_field='src', dst_field='dst') #print g g.show()
def main():
    """Run label propagation on the 34-vertex Karate graph and show it.

    Vertices 0..33 start with their own id (as a string) as 'label'. Edges
    are read from ./karate.txt, one "src dst" pair per line. Each round
    visits the vertices in shuffled order, asks LPA(g, index) for a label and
    overwrites the vertex's label when it differs; rounds repeat until one
    completes without any change.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source -- confirm against the original file.
    """
    g = SGraph()
    verts = []
    #initialize the Karate graph with unique label fields for each node
    for i in range(0, 34):
        verts.append(Vertex(i, attr={'label': str(i)}))
    g = g.add_vertices(verts)
    #prepare the path for the Karate network data
    fname = "./karate.txt"
    #read the edges from Karate.txt and add them to the SGraph object
    with open(fname) as f:
        for l in f:
            #print(l)
            #parse the src and dst ids for the next edge
            ids = l.split()
            src = int(ids[0])
            dst = int(ids[1])
            #add the edge as a graphlab.Edge object to the graph
            # (one add_edges call, and one new graph value, per input line)
            g = g.add_edges(Edge(src, dst))
    #visualize the graph
    #print(g.summary())
    #randId=rn.sample(range(0,34),1)[0]
    #print(randId)
    #test = g.get_vertices(fields={'label':'1'})[randId]
    #test.show()
    #print(test)
    # `ids` is reused here: from now on it is the list of vertex ids.
    ids = range(0, 34)
    #label propagation loop
    # flag becomes True when a full round changes no label.
    flag = False
    iteration = 0
    #rounds=5
    #initialize neigh dict for performance
    # NOTE(review): gns is never read in this function.
    gns = {}
    cur_max = 0
    start = time.time()
    #start=time.time()
    while flag == False:
        #pick vertex iteration order randomly
        # NOTE(review): shuffling in place needs `ids` to be a list, which
        # range() only returns on Python 2 -- confirm the target version.
        rn.shuffle(ids)
        flag = True
        #print(ids)
        # The timer restarts every round, so the time printed at the end
        # covers only the last round.
        start = time.time()
        for index in ids:
            #print(index)
            cur_max = LPA(g, index)
            if str(cur_max) != g.get_vertices(ids=[index])['label'][0]:
                flag = False
                # Rewrite the whole label column, replacing only this
                # vertex's entry.
                g.vertices['label'] = g.vertices.apply(lambda x: str(
                    cur_max) if x['__id'] == index else x['label'])
        #print(end-start)
        iteration += 1
        print(iteration)
        end = time.time()
        #end=time.time()
    print(end - start)
    print iteration
    g.show(vlabel='label')
# Script: for every scale in [START_SCALE, realEndScale] read the adjacency
# CSV under <OUTPUT_PATH>/tmp/AdjacentRelationships/<scale>, compute connected
# components, and export the component ids under
# <OUTPUT_PATH>/tmp/ConnectedComponents/<scale>.
outputPath = os.environ.get("OUTPUT_PATH")
startScale = int(os.environ.get("START_SCALE"))

# The tag file's first line is comma-separated; field 1 is the maximum scale
# and field 2 the last scale to process.
tagFile = './tmp'
with open(tagFile, 'r') as f:
    infor = f.readline().strip().split(",")
maxScale = int(infor[1])
realEndScale = int(infor[2])

scaleRange = range(startScale, realEndScale + 1)
for scale in scaleRange:
    inputPath = os.path.join(outputPath, 'tmp', 'AdjacentRelationships',
                             str(scale))
    url = inputPath
    data = SFrame.read_csv(url, header=False)
    if data.num_rows() == 0:
        # No adjacency rows: emit an empty result with the same two columns.
        cc_ids = SFrame({"__id": [], "component_id": []})
    else:
        # The first two columns are the edge endpoints.
        g = SGraph().add_edges(data,
                               src_field=data.column_names()[0],
                               dst_field=data.column_names()[1])
        cc = connected_components.create(g)
        cc_ids = cc.get('component_id')

    path = os.path.join(outputPath, 'tmp', 'ConnectedComponents', str(scale))
    # `~os.path.exists(path)` was always truthy (~True == -2, ~False == -1),
    # so makedirs ran, and raised, even when the directory already existed.
    if not os.path.exists(path):
        os.makedirs(path)
    # NOTE(review): the export target is the directory created above, not a
    # file inside it -- confirm this is what export_csv expects.
    SFrame.export_csv(cc_ids, os.path.join(path))
# Script: read the web-Google edge list, build an SGraph and save it as
# 'page_graph'.
import graphlab as gl
from graphlab import SGraph, Vertex, Edge

vertices = set()
edges = []
# `with` closes the file once it has been read.
with open('web-Google.txt', 'r') as f:
    for line in f:
        fields = line.split()
        # Skip blank lines and '#' comment lines, which would otherwise
        # crash the integer parse below.
        if not fields or fields[0].startswith('#'):
            continue
        v1, v2 = [int(x) for x in fields]
        vertices.add(v1)
        vertices.add(v2)
        edges.append(Edge(v1, v2))

print('In total {0} vertices and {1} edges'.format(len(vertices), len(edges)))
g = SGraph().add_vertices([Vertex(x) for x in vertices]).add_edges(edges)
g.save('page_graph')
def build_data_graph():
    """Build the patient data graph from the CSV extracts under file_path.

    Every edge starts at a patient ('desynpuf_id') and carries a 'relation'
    attribute:

    * 'had_chronic'    -> chronic condition flagged with value 1
    * 'diagnosed_with' -> diagnosis code ('dgns_cd')
    * 'underwent'      -> procedure code ('prcdr_cd')
    * 'had_drug'       -> drug ('substancename')

    The last three edge sets are restricted to patients that have at least
    one chronic condition. The graph summary is printed after each edge set
    is added.

    :return: the resulting SGraph
    """
    file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
    # Claim-level columns that are dropped before taking distinct rows.
    claim_columns = [
        'clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year'
    ]

    # --- chronic conditions -------------------------------------------------
    beneficiaries = SFrame.read_csv(file_path +
                                    "beneficiary_summary_2008_2009.csv")
    # Pack all 'chron_*' columns into one dict column per patient.
    bene_packed = beneficiaries.pack_columns(
        column_prefix='chron_',
        dtype=dict,
        new_column_name='chronic_conditions',
        remove_prefix=False)

    def explode_conditions(row):
        # One [name, value, desynpuf_id] list per packed (name, value) pair.
        patient = (row['desynpuf_id'], )
        return [
            list(pair + patient)
            for pair in row['chronic_conditions'].iteritems()
        ]

    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        explode_conditions)
    has_condition = bene_chrons['chronic_condition_value'] == 1
    bene_chrons = bene_chrons[has_condition]
    del bene_chrons['chronic_condition_value']
    # NOTE(review): the result of rename is not reassigned, so this relies on
    # SFrame.rename mutating the frame in place -- confirm.
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})
    bene_chrons['relation'] = 'had_chronic'

    g = SGraph().add_edges(bene_chrons,
                           src_field='desynpuf_id',
                           dst_field='chronic_condition')
    print(g.summary())

    # Distinct ids of patients with chronic conditions, used as the join key
    # for the remaining tables.
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(),
                                'desynpuf_id')

    # --- diagnosed conditions -----------------------------------------------
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    for column in claim_columns:
        del tcdc[column]
    # The same patient can get the same diagnosis several times a year.
    tcdc = tcdc.unique()
    bene_chrons_tcdc = bene_with_chrons.join(tcdc)
    bene_chrons_tcdc['relation'] = 'diagnosed_with'
    g = g.add_edges(bene_chrons_tcdc,
                    src_field='desynpuf_id',
                    dst_field='dgns_cd')
    print(g.summary())

    # --- procedures ---------------------------------------------------------
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    for column in claim_columns:
        del tcpc[column]
    tcpc = tcpc.unique()
    bene_chrons_tcpc = bene_with_chrons.join(tcpc)
    bene_chrons_tcpc['relation'] = 'underwent'
    g = g.add_edges(bene_chrons_tcpc,
                    src_field='desynpuf_id',
                    dst_field='prcdr_cd')
    print(g.summary())

    # --- prescribed drugs ---------------------------------------------------
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    pde = pde.unique()
    bene_chrons_pde = bene_with_chrons.join(pde)
    bene_chrons_pde['relation'] = 'had_drug'
    g = g.add_edges(bene_chrons_pde,
                    src_field='desynpuf_id',
                    dst_field='substancename')
    print(g.summary())

    return g