def showPath(self, highlight=None):
    """Load the edge and vertex CSV files and display them as a graph.

    Reads ``self.edgesFn`` and ``self.verticesFn`` with ``SFrame.read_csv``,
    builds an ``SGraph`` using the ``name`` column as vertex id and the
    ``src`` / ``dst`` columns as edge endpoints, then shows it with vertices
    labelled by ``attributes`` and edges labelled by ``relation``.

    :param highlight: forwarded unchanged to ``SGraph.show``.
    """
    edges = SFrame.read_csv(self.edgesFn)
    vertices = SFrame.read_csv(self.verticesFn)
    graph = SGraph(vertices=vertices, edges=edges, vid_field='name',
                   src_field='src', dst_field='dst')
    graph.show(vlabel='attributes', vlabel_hover=False, elabel='relation',
               highlight=highlight, arrows=True)
    # Block for 20 seconds after requesting the view -- presumably so the
    # process stays alive while the rendering is inspected (confirm).
    sleep(20)
def showPath(self, highlight=None):
    """Build the graph from the CSV files and print how long that took.

    Reads ``self.edgesFn`` and ``self.verticesFn``, constructs the
    ``SGraph`` and prints the elapsed wall-clock time as a ``timedelta``.
    The graph is not displayed in this variant, so ``highlight`` is
    accepted but not used.

    :param highlight: unused here; kept for signature compatibility.
    """
    began = datetime.datetime.now()
    edges = SFrame.read_csv(self.edgesFn)
    vertices = SFrame.read_csv(self.verticesFn)
    graph = SGraph(vertices=vertices, edges=edges, vid_field='name',
                   src_field='src', dst_field='dst')
    finished = datetime.datetime.now()
    print(finished - began)
def showPath(self, highlight=None):
    """Load the edge and vertex CSV files and display them as a graph.

    The graph is built from ``self.edgesFn`` and ``self.verticesFn`` (vertex
    id column ``name``, edge endpoint columns ``src`` / ``dst``) and shown
    with vertices labelled by ``id`` and edges labelled by ``relation``.

    :param highlight: forwarded unchanged to ``SGraph.show``.
    """
    edges = SFrame.read_csv(self.edgesFn)
    vertices = SFrame.read_csv(self.verticesFn)
    graph = SGraph(vertices=vertices, edges=edges, vid_field='name',
                   src_field='src', dst_field='dst')
    graph.show(vlabel='id', elabel='relation', highlight=highlight,
               arrows=True)
    # Block for 10 seconds after requesting the view.
    sleep(10)
def load_graph(self, graph_path, direction=1, start_line=0, limit=None, blacklist=set(), delimiter=','):
    """Add the edges described by ``graph_path`` to ``self._graph``.

    ``graph_path`` is first handed to ``utils.is_json``. When that returns
    anything other than ``False`` the result is unpacked into an SFrame and
    its ``X.0`` / ``X.1`` / ``X.2`` columns are renamed to ``X1`` / ``X2`` /
    ``Weight``. Otherwise ``graph_path`` is read as a header-less CSV file.

    :param graph_path: value accepted by ``utils.is_json`` or a CSV path.
    :param direction: not used in this method body.
    :param start_line: number of leading CSV rows to skip.
    :param limit: maximum number of CSV rows to read (``None`` = no limit).
    :param blacklist: not used in this method body.
    :param delimiter: CSV field delimiter.
    """
    # NOTE(review): ``blacklist=set()`` is a mutable default argument; it is
    # harmless only as long as the parameter stays unused.
    parsed = utils.is_json(graph_path)
    if parsed is False:
        edge_frame = SFrame.read_csv(
            graph_path,
            delimiter=delimiter,
            header=False,
            column_type_hints={'X1': str, 'X2': str},
            nrows=limit,
            skiprows=start_line)
        # NOTE(review): assumed to belong to the CSV branch, since only a
        # CSV load produces an ``X3`` column -- confirm against the original
        # indentation.
        if self._weight_field != "":
            edge_frame.rename({'X3': 'Weight'})
    else:
        edge_frame = SFrame(SArray(parsed).unpack())
        edge_frame.rename({'X.0': 'X1', 'X.1': 'X2', 'X.2': 'Weight'})

    self._graph = self._graph.add_edges(edge_frame, src_field='X1',
                                        dst_field='X2')
    if not self.is_directed:
        self.to_undirected()
def SSSP():
    """Run shortest paths from vertex 0 on the BerkStan edge list.

    The tab-separated, header-less edge file is read with three integer
    columns; ``X1`` / ``X2`` are the edge endpoints and ``X3`` is used as
    the edge weight. The model summary is requested at the end.
    """
    edge_file = '/home/gengl/Datasets/SSSP/BerkStan/edge.txt'
    edges = SFrame.read_csv(edge_file, delimiter='\t', header=False,
                            column_type_hints=[int, int, int])
    network = SGraph().add_edges(edges, src_field='X1', dst_field='X2')
    model = shortest_path.create(network, source_vid=0, weight_field='X3')
    model.summary()
def CC():
    """Compute connected components of the BerkStan edge list.

    The tab-separated, header-less edge file is read with two integer
    columns (``X1`` -> ``X2``), loaded into an ``SGraph`` and passed to the
    connected-components toolkit with verbose output enabled.
    """
    edge_file = '/home/gengl/Datasets/CC/BerkStan/edge.txt'
    edges = SFrame.read_csv(edge_file, delimiter='\t', header=False,
                            column_type_hints=[int, int])
    network = SGraph().add_edges(edges, src_field='X1', dst_field='X2')
    model = connected_components.create(network, verbose=True)
    model.summary()
def showPath(self, highlight=None):
    """Load the edge and vertex CSV files and display them as a graph.

    The graph is built from ``self.edgesFn`` and ``self.verticesFn`` (vertex
    id column ``name``, edge endpoint columns ``src`` / ``dst``) and shown
    with vertices labelled by ``id`` (``vlabel_hover=True``) and edges
    labelled by ``relation``.

    :param highlight: forwarded unchanged to ``SGraph.show``.
    """
    edges = SFrame.read_csv(self.edgesFn)
    vertices = SFrame.read_csv(self.verticesFn)
    graph = SGraph(vertices=vertices, edges=edges, vid_field='name',
                   src_field='src', dst_field='dst')
    graph.show(vlabel='id', elabel='relation', vlabel_hover=True,
               highlight=highlight, arrows=True)
    # Block for 30 seconds after requesting the view.
    sleep(30)
def PageRank():
    """Run PageRank on the clueweb_20M edge-pair file.

    The tab-separated, header-less file is read with two integer columns
    (``X1`` -> ``X2``). PageRank is created with reset probability 0.2, a
    convergence threshold of 1e-12 and at most 42 iterations; the model
    summary is requested at the end.
    """
    edge_file = '/clueweb/PageRank/clueweb_20M/edge_pair.txt'
    edges = SFrame.read_csv(edge_file, delimiter='\t', header=False,
                            column_type_hints=[int, int])
    network = SGraph().add_edges(edges, src_field='X1', dst_field='X2')
    model = pagerank.create(
        network,
        reset_probability=0.2,
        threshold=0.000000000001,
        max_iterations=42,
        _distributed=True)
    model.summary()
def PageRank():
    """Run PageRank on the BerkStan edge list.

    The tab-separated, header-less file is read with two integer columns
    (``X1`` -> ``X2``). PageRank is created with reset probability 0.2, a
    convergence threshold of 0.0001 and at most 1000 iterations; the model
    summary is requested at the end.
    """
    edge_file = '/home/gengl/Datasets/PageRank/BerkStan/edge.txt'
    edges = SFrame.read_csv(edge_file, delimiter='\t', header=False,
                            column_type_hints=[int, int])
    network = SGraph().add_edges(edges, src_field='X1', dst_field='X2')
    model = pagerank.create(
        network,
        reset_probability=0.2,
        threshold=0.0001,
        max_iterations=1000,
        _distributed=True)
    model.summary()
def SSSP():
    """Run shortest paths from vertex 0 on the Google edge list and dump them.

    The tab-separated, header-less edge file is read with three integer
    columns; ``X1`` / ``X2`` are the edge endpoints and ``X3`` the edge
    weight. After the model summary is requested, the last element of
    ``get_path(vid)`` is written, one per line, to
    ``/home/gengl/sssp_graphlab`` for every vertex id in ``[0, 875713)``.

    Vertices whose path lookup fails are skipped (best effort). Only
    ``Exception`` subclasses are suppressed, so Ctrl-C / ``SystemExit`` still
    stop the run, and errors while writing the output file are no longer
    hidden.
    """
    url = '/home/gengl/Datasets/SSSP/Google/edge.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    sp_model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    sp_model.summary()

    with open('/home/gengl/sssp_graphlab', 'w') as fo:
        # 875713 is the hard-coded upper bound on vertex ids for this dataset.
        for vid in range(0, 875713):
            try:
                last_hop = str(sp_model.get_path(vid)[-1])
            except Exception:
                # No usable path for this vertex id; skip it.
                continue
            fo.write(last_hop + '\n')
# Compute connected components for every scale listed in the tag file and
# export the component ids under <OUTPUT_PATH>/tmp/ConnectedComponents/<scale>.
outputPath = os.environ.get("OUTPUT_PATH")
startScale = int(os.environ.get("START_SCALE"))
tagFile = './tmp'
# The first line of the tag file is comma-separated; fields 1 and 2 hold the
# maximum scale and the real end scale.
with open(tagFile, 'r') as f:
    infor = f.readline().strip().split(",")
maxScale = int(infor[1])
realEndScale = int(infor[2])
scaleRange = range(startScale, realEndScale + 1)
for scale in scaleRange:
    inputPath = os.path.join(outputPath, 'tmp', 'AdjacentRelationships', str(scale))
    url = inputPath
    data = SFrame.read_csv(url, header=False)
    if data.num_rows() == 0:
        # No adjacency rows for this scale: export an empty result with the
        # same column names the toolkit output uses.
        cc_ids = SFrame({"__id": [], "component_id": []})
    else:
        g = SGraph().add_edges(data,
                               src_field=data.column_names()[0],
                               dst_field=data.column_names()[1])
        cc = connected_components.create(g)
        cc_ids = cc.get('component_id')
    path = os.path.join(outputPath, 'tmp', 'ConnectedComponents', str(scale))
    # Fixed: the original tested ``~os.path.exists(path)``. Bitwise NOT of a
    # bool is -1 or -2, both truthy, so makedirs was always called and raised
    # when the directory already existed.
    if not os.path.exists(path):
        os.makedirs(path)
    SFrame.export_csv(cc_ids, path)
import datetime

import graphlab as gl
from graphlab import SFrame, SGraph
from graphlab import connected_components, pagerank, triangle_counting

# Create cluster
c = gl.deploy.hadoop_cluster.create(
    name='test-cluster',
    dato_dist_path='hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/dato/tmp',
    hadoop_conf_dir='/usr/local/hadoop/etc/hadoop',
    num_containers=3)
print(c)

# Load the pokec edge list from HDFS. Column X2 is used as the edge source
# and X1 as the destination.
url = 'hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/data/pokec.txt'
data = SFrame.read_csv(url, delimiter='\t', header=False)
g = SGraph().add_edges(data, src_field='X2', dst_field='X1')

# Triangle counting
tc = triangle_counting.create(g)
tc_out = tc['triangle_count']

# PageRank
# NOTE(review): the timestamps around each toolkit call are evaluated but
# discarded, so nothing is timed when this runs as a script; they only echo
# in an interactive session.
datetime.datetime.now()
pr = pagerank.create(g, threshold=0.001)
datetime.datetime.now()

# Connected components
datetime.datetime.now()
cc = connected_components.create(g)
datetime.datetime.now()
# Disabled experiments, kept for reference.
#
# Neighbourhood of two characters:
#   g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
#              src_field='src', dst_field='dst')
#   targets = ['James Bond', 'Moneypenny']
#   subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
#   subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)
#
# Hand-built graph:
#   from graphlab import SGraph, Vertex, Edge
#   g = SGraph()
#   verts = [Vertex(0, attr={'breed': 'labrador'}),
#            Vertex(1, attr={'breed': 'labrador'}),
#            Vertex(2, attr={'breed': 'vizsla'})]
#   g = g.add_vertices(verts)
#   g = g.add_edges(Edge(1, 2))
#   print g

from graphlab import SFrame, SGraph

# Download the bond edge and vertex tables and show the resulting graph.
BOND_EDGES_URL = 'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv'
BOND_VERTICES_URL = 'http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv'

edge_data = SFrame.read_csv(BOND_EDGES_URL)
vertex_data = SFrame.read_csv(BOND_VERTICES_URL)
g = SGraph(
    vertices=vertex_data,
    edges=edge_data,
    vid_field='name',
    src_field='src',
    dst_field='dst')
# print g
g.show()
# Build one SGraph from a fixed set of vertex and edge CSV files located
# under ``path`` (defined elsewhere in this file).
verbose = False
vertexFiles = [
    "City", "Country", "Region", "Advisor", "Category", "Founder",
    "FundingRound", "HQ", "keywords", "Member", "Office", "organizations",
    "PrimaryImage", "TeamMember", "Website", "companies_acquired_by_sap",
]
edgesFiles = [
    "GeoInformation", "acquisitions", "categories_keywords_edges",
    "investments", "keywords_descriptions_edges", "keywords_webpages_edges",
    "relationships", "companies_acquired_by_sap_edges",
]

g = SGraph()

# Vertices: the id column is 'path' when present, otherwise 'url'.
for f in vertexFiles:
    content = SFrame.read_csv(path + f + '.csv', na_values='null', verbose=verbose)
    columns = content.column_names()
    if 'path' in columns:
        g = g.add_vertices(content, vid_field='path')
    elif 'url' in columns:
        g = g.add_vertices(content, vid_field='url')
    else:
        # Fixed: the original printed the message and called sys.exit() with
        # no argument, i.e. exit status 0, so the failure looked like success.
        # Passing the message makes the interpreter write it to stderr and
        # exit with status 1.
        sys.exit("Unknown vid field: %s" % (columns,))

# Edges: only files that carry both 'src' and 'dst' columns are added; files
# without them are skipped silently.
for f in edgesFiles:
    content = SFrame.read_csv(path + f + '.csv', na_values='null', verbose=verbose)
    columns = content.column_names()
    if 'src' in columns and 'dst' in columns:
        g = g.add_edges(content, src_field='src', dst_field='dst')
# Disabled experiments, kept for reference.
#
# Neighbourhood of two characters:
#   targets = ['James Bond', 'Moneypenny']
#   subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
#   subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)
#
# Hand-built graph:
#   from graphlab import SGraph, Vertex, Edge
#   g = SGraph()
#   verts = [Vertex(0, attr={'breed': 'labrador'}),
#            Vertex(1, attr={'breed': 'labrador'}),
#            Vertex(2, attr={'breed': 'vizsla'})]
#   g = g.add_vertices(verts)
#   g = g.add_edges(Edge(1, 2))
#   print g

from graphlab import SFrame, SGraph

# Download the bond edge and vertex tables and show the resulting graph.
BOND_EDGES_URL = 'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv'
BOND_VERTICES_URL = 'http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv'

edge_data = SFrame.read_csv(BOND_EDGES_URL)
vertex_data = SFrame.read_csv(BOND_VERTICES_URL)
g = SGraph(
    vertices=vertex_data,
    edges=edge_data,
    vid_field='name',
    src_field='src',
    dst_field='dst')
# print g
g.show()
def build_data_graph():
    """Build a patient graph of chronic conditions, diagnoses, procedures and drugs.

    Every edge starts at a ``desynpuf_id`` and carries a ``relation``
    attribute: ``had_chronic``, ``diagnosed_with``, ``underwent`` or
    ``had_drug``. Only patients with at least one chronic condition flagged
    ``1`` contribute diagnosis, procedure and drug edges. The graph summary
    is printed after each group of edges is added.

    :return: the assembled ``SGraph``.
    """
    data_dir = "/Users/blahiri/healthcare/documents/recommendation_system/"
    claim_bookkeeping = [
        'clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year'
    ]

    def explode_conditions(row):
        # ``row`` is one row of the packed frame. Each (name, value) item of
        # its chronic-conditions dict gets the patient id appended and is
        # turned into a list, so the result is a list of
        # [name, value, desynpuf_id] rows.
        return [
            list(item + (row['desynpuf_id'], ))
            for item in row['chronic_conditions'].iteritems()
        ]

    # --- chronic conditions -------------------------------------------------
    summary_frame = SFrame.read_csv(data_dir + "beneficiary_summary_2008_2009.csv")
    packed = summary_frame.pack_columns(
        column_prefix='chron_',
        dtype=dict,
        new_column_name='chronic_conditions',
        remove_prefix=False)
    chronic = packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        explode_conditions)
    chronic = chronic[chronic['chronic_condition_value'] == 1]
    del chronic['chronic_condition_value']
    chronic.rename({'chronic_condition_name': 'chronic_condition'})

    graph = SGraph()
    chronic['relation'] = 'had_chronic'
    graph = graph.add_edges(chronic, src_field='desynpuf_id',
                            dst_field='chronic_condition')
    print(graph.summary())

    # Distinct ids of patients with a chronic condition, so the joins below
    # do not repeat a patient.
    chronic_patients = SFrame(None)
    chronic_patients.add_column(chronic['desynpuf_id'].unique(), 'desynpuf_id')

    # --- diagnosed conditions -----------------------------------------------
    diagnoses = SFrame.read_csv(data_dir + "transformed_claim_diagnosis_codes.csv")
    for column in claim_bookkeeping:
        del diagnoses[column]
    # The same patient can receive the same diagnosis several times a year.
    diagnoses = diagnoses.unique()
    # Join against the chronic-patient ids; a patient may have no diagnosis.
    patient_diagnoses = chronic_patients.join(diagnoses)
    patient_diagnoses['relation'] = 'diagnosed_with'
    graph = graph.add_edges(patient_diagnoses, src_field='desynpuf_id',
                            dst_field='dgns_cd')
    print(graph.summary())

    # --- procedures ---------------------------------------------------------
    procedures = SFrame.read_csv(data_dir + "transformed_claim_prcdr_codes.csv",
                                 column_type_hints={'prcdr_cd': str})
    for column in claim_bookkeeping:
        del procedures[column]
    procedures = procedures.unique()
    # Join against the chronic-patient ids; a patient may have no procedure.
    patient_procedures = chronic_patients.join(procedures)
    patient_procedures['relation'] = 'underwent'
    graph = graph.add_edges(patient_procedures, src_field='desynpuf_id',
                            dst_field='prcdr_cd')
    print(graph.summary())

    # --- prescribed drugs ---------------------------------------------------
    drugs = SFrame.read_csv(data_dir + "prescribed_drugs.csv")
    drugs = drugs.unique()
    # Join against the chronic-patient ids; a patient may have no medicine.
    patient_drugs = chronic_patients.join(drugs)
    patient_drugs['relation'] = 'had_drug'
    graph = graph.add_edges(patient_drugs, src_field='desynpuf_id',
                            dst_field='substancename')
    print(graph.summary())
    return graph
import graphlab
from graphlab import SFrame


def _resize_28x28x1(images):
    """Return ``images`` resized with ``image_analysis.resize(images, 28, 28, 1)``."""
    return graphlab.image_analysis.resize(images, 28, 28, 1)


# Load the training images and their labels. The label file has a header row
# and two integer columns.
train_input = graphlab.image_analysis.load_images(
    'train_images/', "auto", with_path=False, random_order=False)
train_output = SFrame.read_csv(
    'train_outputs.csv', delimiter=',', header=True,
    column_type_hints=[int, int])

# Keep only the label (renamed from 'Prediction') and attach the image column.
train_output.rename({'Prediction': 'label'})
train_output.remove_column('Id')
train_output.add_column(train_input.select_column('image'), name='image')

# 80/20 random split; no seed is given, so the split differs between runs.
training_data, validation_data = train_output.random_split(0.8)
training_data['image'] = _resize_28x28x1(training_data['image'])
validation_data['image'] = _resize_28x28x1(validation_data['image'])

mnist_net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
# net = graphlab.deeplearning.create(sf, target='Prediction')
m = graphlab.neuralnet_classifier.create(
    training_data,
    target='label',
    network=mnist_net,
    validation_set=validation_data,
    max_iterations=200)
# test_data = graphlab.image_analysis.load_images('test_images/', "auto", with_path=False, random_order=False)
# pred = m.classify(test_data)
# Claim bookkeeping columns removed before claim rows are de-duplicated.
_CLAIM_BOOKKEEPING_COLUMNS = ('clm_id', 'clm_from_dt', 'clm_thru_dt',
                              'claim_type', 'clm_thru_year')


def _distinct_claim_rows(claims):
    """Drop the claim bookkeeping columns from ``claims`` and return its distinct rows."""
    for column in _CLAIM_BOOKKEEPING_COLUMNS:
        del claims[column]
    return claims.unique()


def _add_patient_edges(g, patients, records, relation, dst_field):
    """Join ``records`` to ``patients``, tag them with ``relation`` and add them as edges.

    Prints the graph summary after the edges are added and returns the new graph.
    """
    linked = patients.join(records)
    linked['relation'] = relation
    g = g.add_edges(linked, src_field='desynpuf_id', dst_field=dst_field)
    print(g.summary())
    return g


def build_data_graph():
    """Build a patient graph of chronic conditions, diagnoses, procedures and drugs.

    Every edge starts at a ``desynpuf_id`` and carries a ``relation``
    attribute: ``had_chronic``, ``diagnosed_with``, ``underwent`` or
    ``had_drug``. Only patients with at least one chronic condition flagged
    ``1`` contribute diagnosis, procedure and drug edges. The graph summary
    is printed after each group of edges is added.

    :return: the assembled ``SGraph``.
    """
    file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"

    beneficiaries = SFrame.read_csv(file_path + "beneficiary_summary_2008_2009.csv")
    bene_packed = beneficiaries.pack_columns(column_prefix='chron_', dtype=dict,
                                             new_column_name='chronic_conditions',
                                             remove_prefix=False)
    # x is a row of bene_packed in the lambda below. The desynpuf_id is
    # appended to each (key, value) tuple, the tuple is converted to a list,
    # and the outer [] makes sure a list of lists is emitted.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [list(k + (x['desynpuf_id'], ))
                   for k in x['chronic_conditions'].iteritems()])
    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

    g = SGraph()
    bene_chrons['relation'] = 'had_chronic'
    g = g.add_edges(bene_chrons, src_field='desynpuf_id',
                    dst_field='chronic_condition')
    print(g.summary())

    # Distinct ids of patients with chronic conditions, to avoid repetition
    # in the joins below.
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(), 'desynpuf_id')

    # Which patient had which diagnosed condition. The same patient can be
    # diagnosed with the same condition several times a year, hence distinct
    # rows. A chronic patient may have no diagnosed condition at all.
    tcdc = _distinct_claim_rows(
        SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv"))
    g = _add_patient_edges(g, bene_with_chrons, tcdc, 'diagnosed_with', 'dgns_cd')

    # Which patient underwent which procedure. A chronic patient may have no
    # procedure at all.
    tcpc = _distinct_claim_rows(
        SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                        column_type_hints={'prcdr_cd': str}))
    g = _add_patient_edges(g, bene_with_chrons, tcpc, 'underwent', 'prcdr_cd')

    # Which patient had which medicine. A chronic patient may have no
    # medicine at all.
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv").unique()
    g = _add_patient_edges(g, bene_with_chrons, pde, 'had_drug', 'substancename')
    return g
[0]][0], artist_2b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1] [1]][0], artist_3a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2] [0]][0], artist_3b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2] [1]][0]) # Pass song names/titles to choose from to HTML template if __name__ == '__main__': play_count_path = 'train_triplets.txt' feat_path = 'trimmed_tempos.csv' model_path = 'full_two_hour_mod_directory' feat_mat = SFrame.read_csv(feat_path) model = graphlab.load_model(model_path) song_pairs = generate_pairs(model, np.array(feat_mat['song_id']), k=3) pref_songs = get_user_prefs(feat_mat, song_pairs) playlist = get_playlist(model, pref_songs, feat_mat, desired_tempo=160, tempo_margin=10, playlist_length=10) songs = playlist['song_name'].values artists = playlist['artist'].values tempoxs = playlist['tempo_multiplier'].values cadences = playlist['effective_tempo'].values pl_length = 10
def read_columns_to_sframe(self, column_map):
    """Read this data source's CSV file into an SFrame with renamed columns.

    The file location comes from ``csv_datasource_path(self.name)``. The
    columns are renamed using ``invert_dict(column_map)`` -- presumably the
    key/value-swapped mapping; confirm against ``invert_dict``.

    :param column_map: mapping handed to ``invert_dict`` to build the rename table.
    :return: whatever ``SFrame.rename`` returns for the loaded frame.
    """
    # Imported locally, as in the original, so graphlab is only needed when
    # this method is actually called.
    from graphlab import SFrame

    frame = SFrame.read_csv(csv_datasource_path(self.name))
    rename_table = invert_dict(column_map)
    return frame.rename(rename_table)
# import pandas as pd
import numpy as np
import os
import graphlab as gl
from graphlab import SFrame
import pandas as pd


def merge_data(df):
    """Return ``<store_nbr>_<item_nbr>_<date>`` for one row.

    ``store_nbr`` and ``item_nbr`` are converted with ``str``; ``date`` is
    used as-is and therefore must already be a string.
    """
    return "_".join([str(df["store_nbr"]), str(df["item_nbr"]), df["date"]])


def _read_data_csv(filename):
    """Read ``../data/<filename>`` into an SFrame."""
    return SFrame.read_csv(os.path.join('..', "data", filename))


# When True, the test set is loaded and joined as well.
ind = True

weather = _read_data_csv("weather_modified_3.csv")
if ind:
    test = _read_data_csv("test.csv")
train = _read_data_csv("train.csv")
key = _read_data_csv("key.csv")
zero_items = _read_data_csv('zero_items_solid_new.csv')

# Default SFrame joins of the sales tables with the zero-items table.
train_new = train.join(zero_items)
if ind:
    test_new = test.join(zero_items)
song_3a=feat_mat['title'][feat_mat['song_id'] == song_pairs[2][0]][0], song_3b=feat_mat['title'][feat_mat['song_id'] == song_pairs[2][1]][0], artist_1a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[0][0]][0], artist_1b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[0][1]][0], artist_2a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1][0]][0], artist_2b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1][1]][0], artist_3a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2][0]][0], artist_3b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2][1]][0]) # Pass song names/titles to choose from to HTML template if __name__ == '__main__': play_count_path = 'train_triplets.txt' feat_path = 'trimmed_tempos.csv' model_path = 'full_two_hour_mod_directory' feat_mat = SFrame.read_csv(feat_path) model = graphlab.load_model(model_path) song_pairs = generate_pairs(model, np.array(feat_mat['song_id']), k=3) pref_songs = get_user_prefs(feat_mat, song_pairs) playlist = get_playlist(model, pref_songs, feat_mat, desired_tempo=160, tempo_margin=10, playlist_length=10) songs = playlist['song_name'].values artists = playlist['artist'].values tempoxs = playlist['tempo_multiplier'].values cadences = playlist['effective_tempo'].values pl_length = 10 timeline_obj = ""