def from_previous_reduction(cls, input_dir):
    parent = gl.load_sgraph(input_dir + 'parent')
    verticy_descriptions = gl.load_sframe(input_dir + 'verticy_descriptions')
    child = gl.load_sgraph(input_dir + 'child')
    gw = cls()
    gw.g = parent
    gw.verticy_descriptions = verticy_descriptions
    gw.child = cls()
    gw.child.g = child
    return gw
def create_initial_bayesian_network():
    '''
    Start from a randomly generated Bayesian network where there is no edge
    between the variables of the same type. First, create a blacklist.
    '''
    g = load_sgraph('data_graph')
    edges = g.get_edges()
    features = edges[['__dst_id', 'relation']].unique()
    features.rename({'__dst_id': 'feature_id', 'relation': 'feature_type'})
    bn = SGraph()
    bn = bn.add_vertices(features, vid_field='feature_id')
    n_features = features.num_rows()
    edges_data_graph = g.get_edges()
    n_patients = edges_data_graph['__src_id'].unique().size()
    random.seed(1234)
    for i in range(20):
        src = features['feature_id'][random.randint(0, n_features - 1)]
        dst = 'E8498'
        #dst = features['feature_id'][random.randint(0, n_features-1)]
        bn = bn.add_edges(Edge(src, dst))
        print "Added edge between " + src + " and " + dst
    bic = get_bic_score(g, bn, n_patients)
    return g
def __load_data_structure__(self, filepath):
    """Return the data structure if it can be loaded, otherwise return None and log a warning."""
    # Try each supported type in turn, since the type on disk is unknown; swallow exceptions.
    obj = None
    try:
        obj = _gl.load_sframe(filepath)
        return obj
    except:
        pass
    try:
        obj = _gl.load_sgraph(filepath)
        return obj
    except:
        pass
    try:
        obj = _gl.load_model(filepath)
        return obj
    except:
        pass
    try:
        obj = _gl.SArray(data=filepath)
        return obj
    except:
        pass
    __LOGGER__.debug("Unable to load dependency, unsupported type at path: %s" % filepath)
    return None
def main():
    args = parse_args('Test the result of the CommunityDetection algorithm')
    result_graph = gl.load_sgraph(args.graph_name)
    expected = gl.SFrame.read_csv(args.expected_output, delimiter=' ',
                                  header=False, column_type_hints=long)
    for node in result_graph.vertices:
        test = expected.apply(
            lambda x: node['label'] != x['X2'] and node['__id'] == x['X1'])
        if test.sum() > 0:
            print('Not all values match, invalid algorithm')
            exit(1)
def load_nxgraph_from_sgraph(graph_path):
    sg = gl.load_sgraph(graph_path)
    import networkx as nx
    g = nx.Graph()
    # Put the nodes and edges from the SGraph into a NetworkX graph
    g.add_nodes_from(list(sg.vertices['__id']))
    g.add_edges_from([(e['__src_id'], e['__dst_id'], e['attr'])
                      for e in sg.edges])
    return g
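# Minimal usage sketch for load_nxgraph_from_sgraph above. The path
# 'my_graph.sgraph' is a hypothetical example of a previously saved SGraph,
# and this assumes every edge in that graph carries a dict-valued 'attr'
# field (which nx.Graph.add_edges_from requires for 3-tuples).
nxg = load_nxgraph_from_sgraph('my_graph.sgraph')
print nxg.number_of_nodes(), nxg.number_of_edges()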
def get_pr_result_from_input(input_file):
    g = gl.load_sgraph(input_file, format='csv')
    pr = gl.pagerank.create(g)
    pr_out = pr['pagerank']
    csvfile_pr_movie_start_result = open("output/pr_movie_start_result.csv", "w")
    writer_pr_movie_start_result = csv.writer(csvfile_pr_movie_start_result)
    for pr_out_item in pr_out:
        writer_pr_movie_start_result.writerow(
            [pr_out_item['__id'], pr_out_item['pagerank'], pr_out_item['delta']])
    csvfile_pr_movie_start_result.close()
def _load_graphlab_object(cls, obj_type, obj_path):
    if obj_type == 'model':
        return graphlab.load_model(obj_path)
    elif obj_type == 'sarray':
        return graphlab.SArray(obj_path)
    elif obj_type == 'sframe':
        return graphlab.load_sframe(obj_path)
    elif obj_type == 'sgraph':
        return graphlab.load_sgraph(obj_path)
    else:
        raise RuntimeError(str(obj_type) + ' is not supported')
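# Hedged usage sketch: _load_graphlab_object above takes cls first, so it is
# presumably a @classmethod dispatcher. The owning class (PersistenceHelper)
# and both paths below are hypothetical placeholders, not names from this code.
# model = PersistenceHelper._load_graphlab_object('model', 'saved/my_model')
# sf = PersistenceHelper._load_graphlab_object('sframe', 'saved/my_frame.sframe')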
def __init__(self, sf_path=None, g_path=None, cache_max=0.75):
    self.sf = None
    self.label = None
    self.bin_sfs = None
    self.reps = gl.SArray(dtype=str)
    self.hier_graph = None
    self.num_bins = 0
    self.features = None
    self.distance = None
    self.cache_max = cache_max
    if g_path:
        self.g = gl.load_sgraph(g_path)
        self.sf = self.g.vertices
    elif sf_path:
        self.sf = gl.load_sframe(sf_path)
def main():
    args = parse_args('Test the result of the CommunityDetection algorithm')
    result_graph = gl.load_sgraph(args.graph_name)
    expected = gl.SFrame.read_csv(args.expected_output, delimiter=' ',
                                  header=False, column_type_hints=long)
    for node in result_graph.vertices.sort('__id'):
        test = expected.apply(
            lambda x: float_not_equals(x['X2'], node['local_clustering_coefficient'])
            and node['__id'] == x['X1'])
        if test.sum() > 0:
            print('Not all values match, invalid algorithm')
            exit(1)
    expected_average_cc = expected.filter_by([0], 'X1')['X2'][0]
    actual_average_cc = result_graph.vertices['average_clustering_coefficient'][0]
    if float_not_equals(expected_average_cc, actual_average_cc):
        print('Average Clustering Coefficient is wrong: expected: "%s", but got: "%s"' % (
            expected_average_cc, actual_average_cc))
def load_gl_object(filename):
    """
    Load a GLC datastructure from a filename.

    Parameters
    ----------
    filename : Filename for the archive

    Returns
    ----------
    The GLC object.
    """
    obj = None
    if not os.path.exists(filename):
        raise IOError("Loading error: %s is not a valid filename." % filename)
    try:
        obj = _gl.load_sframe(filename)
        return obj
    except:
        pass
    try:
        obj = _gl.load_sgraph(filename)
        return obj
    except:
        pass
    try:
        obj = _gl.load_model(filename)
        return obj
    except:
        pass
    try:
        obj = _gl.SArray(data=filename)
        return obj
    except:
        pass
    return obj
def get_nodes_and_edges(self):
    directory_names = os.listdir(self._input_directory_path)
    graph_tuples = []
    for directory_name in directory_names:
        file_names = os.listdir(self._input_directory_path + directory_name)
        for file_name in file_names:
            if ".sgraph" in file_name:
                print("File name is: {0}".format(file_name))
                pattern = "^([^\.]+)__([^\.]+).[^\.]+.([^\.]+).sgraph$"
                match = re.match(pattern, file_name)
                group_tuple = match.groups()
                category = group_tuple[0]
                sub_category = group_tuple[1]
                timestamp = group_tuple[2]
                sub_graph = gl.load_sgraph(
                    self._input_directory_path + directory_name + "/" + file_name)
                sub_graph.save(self._output_directory_path + file_name + ".csv",
                               format='csv')
                summary_dict = sub_graph.summary()
                num_vertices = summary_dict['num_vertices']
                num_edges = summary_dict['num_edges']
                graph_tuple = (category, sub_category, timestamp,
                               num_vertices, num_edges)
                graph_tuples.append(graph_tuple)
    df = pd.DataFrame(
        graph_tuples,
        columns=['category', 'sub_category', 'date', 'nodes', 'edges'])
    df.to_csv(self._output_directory_path + "graph_summary.csv")
    if numpy.linalg.norm(numpy.array(dst['memVector']), ord=2) > 0.0:
        if numpy.linalg.norm(
                numpy.array(dst['prev']) - numpy.array(dst['memVector']),
                ord=2) < CONVERGENCE_VALUE:
            dst['isSuperNode'] = 1
    return (src, edge, dst)


def updatePrev(src, edge, dst):
    if src['isSuperNode'] == 0:
        src['prev'] = src['memVector']
    if dst['isSuperNode'] == 0:
        dst['prev'] = dst['memVector']
    return (src, edge, dst)


if __name__ == '__main__':
    graph = gl.load_sgraph("s3://sdurgam/GraphLab/Graph")
    graph = graph.triple_apply(initialise, mutated_fields=['prev'])
    convergence = graph.vertices['isSuperNode'].sum()
    while (convergence < NUM_NON_SUPERNODES):
        graph = graph.triple_apply(propagate, mutated_fields=['memVector'])
        graph = graph.triple_apply(l2Norm, mutated_fields=['isSuperNode'])
        graph = graph.triple_apply(updatePrev, mutated_fields=['prev'])
        graph.vertices['memVector'] = graph.vertices['memVector'].apply(
            lambda x: [0.0] * 92000)
        convergence = graph.vertices['isSuperNode'].sum()
    graph = graph.save("s3://sdurgam/GraphLab/Graph")
import graphlab as gl
from graphlab.data_structures.sgraph import SGraph as _SGraph
import graphlab.aggregate as _Aggregate
from graphlab import SArray
from graphlab import SFrame
from graphlab import Vertex
from graphlab import SGraph
from graphlab import Edge

g = gl.load_sgraph('/home/tweninge/wiki.graph')


def initVertex(g):
    g.vertices['dist'] = 8888
    g.vertices['sent'] = 0
    #g.vertices['from_last_art'] = 0
    #g.vertices['count'] = 0
    g.vertices['isDead'] = 0
    #g.vertices['vid_set'] = SArray.from_const({}, g.summary()['num_vertices'])
    # 'seen' serves two purposes: for a cat it remembers the articles,
    # for an art it is used as the vid_set.
    # It is a dict of the form {'id': [dist, from_last_art]}
    g.vertices['seen'] = SArray.from_const({}, g.summary()['num_vertices'])
    #g.vertices['msg_q'] = SArray.from_const([], g.summary()['num_vertices'])  # no longer needed


#g = gl.load_graph('/Users/liuzuozhu/Downloads/web-Google.txt', format='snap')
initVertex(g)
#print g.get_vertices()

# def initEdge(g):
#     #g.edges.head(5)
import os
import graphlab as gl

data_file = 'US_business_links'
if os.path.exists(data_file):
    sg = gl.load_sgraph(data_file)
    # sg.save('1', format='csv')
else:
    url = 'https://static.turi.com/datasets/' + data_file
    sg = gl.load_sgraph(url)
    sg.save(data_file)

print sg.summary()

pr = gl.pagerank.create(sg, max_iterations=10)
# print pr['pagerank']
print pr.summary()
pr_out = pr['pagerank']
print pr_out.topk('pagerank', k=10)

## Triangle counting
## The number of triangles in a vertex's immediate neighborhood is a measure
## of the "density" of the vertex's neighborhood.
tri = gl.triangle_counting.create(sg)
print tri.summary()
tri_out = tri['triangle_count']
print tri_out.topk('triangle_count', k=10)

## Because GraphLab Create SGraphs use directed edges, the shortest path
## toolkit also finds the shortest directed paths to a source vertex.
sssp = gl.shortest_path.create(sg, source_vid='Microsoft')
sssp.get_path(vid='Weyerhaeuser',
import graphlab as gl
from graphlab import SFrame, SGraph, Vertex, Edge

edge_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')

g = SGraph()
g = g.add_edges(edge_data, src_field='src', dst_field='dst')
print g

g.save('james_bond')
new_graph = gl.load_sgraph('james_bond')

g.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)
results.save('buckets/' + b)

print 'Nearest Neighbors Complete!\n'

# Create Graphs
# reload the SFrame
sf = gl.load_sframe('sydney_sf')

print 'Creating Graphs...'
for i in range(1, 671):
    print 'Creating Graph for B' + str(i)
    cur_edges = gl.load_sframe('buckets/B' + str(i))
    edge_verts = cur_edges['query_label'].append(cur_edges['reference_label'])
    edge_verts = edge_verts.unique()
    cur_verts = sf.filter_by(edge_verts, 'mongo_id')
    g = gl.SGraph(cur_verts, cur_edges, vid_field='mongo_id',
                  src_field='query_label', dst_field='reference_label')
    g.save('graphs/B' + str(i))
print 'Graph Creation Complete!\n'

# Calculate Components
print 'Calculating Components...'
for i in range(1, 671):
    print 'Calculating Components for B' + str(i)
    g = gl.load_sgraph('graphs/B' + str(i))
    cc = gl.connected_components.create(g)
    cc.save('components/B' + str(i))
print 'Success!'
exit()
def sgraph_to_csv(sgraph_path, output_path):
    sg = gl.load_sgraph(sgraph_path)
    sg.save(output_path, 'csv')
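# Example invocation of sgraph_to_csv above (both paths are hypothetical):
# writes the saved SGraph's vertex and edge tables as CSV files.
# sgraph_to_csv('graphs/B1', 'output_csv/B1')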
import graphlab as gl

g = gl.load_sgraph('input/test.txt', format='snap')
pr = gl.pagerank.create(g)
pr_out = pr['pagerank']

print "#########"
for pr_out_index in pr_out:
    print pr_out_index
import graphlab as gl

g = gl.load_sgraph('page_graph')
N = len(g.vertices)
beta = 0.02
epsilon = 150
f = open('topk_weight', 'w')

g.vertices['weight'] = 1.0
g.vertices['degree'] = 0


def increment_degree(src, edge, dst):
    src['degree'] += 1
    return (src, edge, dst)


def increment_weight(src, edge, dst):
    dst['weight_new'] += src['weight'] / src['degree']
    return (src, edge, dst)


g = g.triple_apply(increment_degree, mutated_fields=['degree'])

while True:
    g.vertices['weight_new'] = 0
    g.triple_apply(increment_weight, mutated_fields=['weight_new'])
    g.vertices['weight_new'] = beta / N + (
        1 - beta) * (g.vertices['weight_new'] +
    itr = 0
    for i in range(0, 10):
        vector.append(array.index(max(array)) + NUM_FIRST + itr)
        array.remove(max(array))
        itr += 1
    return vector


def getNumRecos(src, edge, dst):
    src['rightRecos'] = len(set(src['recos']).intersection(src['groundTruth']))
    dst['rightRecos'] = len(set(dst['recos']).intersection(dst['groundTruth']))
    return (src, edge, dst)


if __name__ == "__main__":
    graph = gl.load_sgraph("s3://sank/GraphLab/Graph")
    groundTruth = gl.SArray("GroundTruth")
    graph.vertices['recos'] = graph.vertices['prev'].apply(
        lambda x: getRecos(x))
    graph.vertices['groundTruth'] = groundTruth
    graph.vertices['rightRecos'] = 0
    graph = graph.triple_apply(getNumRecos, mutated_fields=['rightRecos'])
    r1 = list(
        graph.vertices.sort('__id')['rightRecos'][NUM_SUPERNODES:]).count(1)
    r2 = list(
        graph.vertices.sort('__id')['rightRecos'][NUM_SUPERNODES:]).count(2)
    r3 = list(
        graph.vertices.sort('__id')['rightRecos'][NUM_SUPERNODES:]).count(3)