def showPath(self, highlight=None):
        """Build an SGraph from the edge and vertex CSV files and display it.

        The edge file is read from ``self.edgesFn`` and the vertex file from
        ``self.verticesFn``. Vertices are identified by their 'name' column and
        edges by their 'src' / 'dst' columns.

        Args:
            highlight: forwarded unchanged to ``SGraph.show``.
        """
        edges = SFrame.read_csv(self.edgesFn)
        vertices = SFrame.read_csv(self.verticesFn)
        graph = SGraph(vertices=vertices,
                       edges=edges,
                       vid_field='name',
                       src_field='src',
                       dst_field='dst')

        # Label vertices with 'attributes' and edges with 'relation'.
        show_options = {
            'vlabel': 'attributes',
            'vlabel_hover': False,
            'elabel': 'relation',
            'highlight': highlight,
            'arrows': True,
        }
        graph.show(**show_options)

        # NOTE(review): presumably this pause keeps the process alive while the
        # visualization is being served -- confirm.
        sleep(20)
Пример #2
0
 def showPath(self, highlight=None):
     """Load the edge/vertex CSVs into an SGraph and print how long it took.

     ``highlight`` is accepted but not used here: this variant only builds
     the graph and prints the elapsed time, it does not display anything.
     """
     began = datetime.datetime.now()
     edges = SFrame.read_csv(self.edgesFn)
     vertices = SFrame.read_csv(self.verticesFn)
     graph = SGraph(vertices=vertices,
                    edges=edges,
                    vid_field='name',
                    src_field='src',
                    dst_field='dst')
     # Elapsed time covers both CSV reads and the graph construction.
     elapsed = datetime.datetime.now() - began
     print(elapsed)
 def showPath(self, highlight=None):
     """Build an SGraph from the edge and vertex CSV files and display it.

     Args:
         highlight: forwarded unchanged to ``SGraph.show``.
     """
     # Keyword arguments are evaluated left to right, so the edge file is
     # still read before the vertex file.
     graph = SGraph(edges=SFrame.read_csv(self.edgesFn),
                    vertices=SFrame.read_csv(self.verticesFn),
                    vid_field='name',
                    src_field='src',
                    dst_field='dst')
     graph.show(vlabel='id', elabel='relation', highlight=highlight,
                arrows=True)
     # NOTE(review): presumably this pause keeps the process alive while the
     # visualization is being served -- confirm.
     sleep(10)
Пример #4
0
 def load_graph(self,
                graph_path,
                direction=1,
                start_line=0,
                limit=None,
                blacklist=set(),
                delimiter=','):
     """Load an edge list and add it to ``self._graph``.

     ``graph_path`` is first passed to ``utils.is_json``. If that returns
     anything other than ``False``, the result is unpacked into an SFrame
     whose columns are renamed to 'X1', 'X2' and 'Weight'. Otherwise
     ``graph_path`` is read as a header-less CSV file.

     Args:
         graph_path: JSON input accepted by ``utils.is_json`` or a CSV path.
         direction: not used by this method.
         start_line: number of leading CSV rows to skip.
         limit: maximum number of CSV rows to read (``None`` for no limit).
         blacklist: not used by this method.
         delimiter: CSV field delimiter.
     """
     parsed = utils.is_json(graph_path)
     if parsed is False:
         edges = SFrame.read_csv(graph_path,
                                 delimiter=delimiter,
                                 header=False,
                                 column_type_hints={'X1': str, 'X2': str},
                                 nrows=limit,
                                 skiprows=start_line)
         # The third CSV column is only treated as a weight when a weight
         # field has been configured on this object.
         if self._weight_field != "":
             edges.rename({'X3': 'Weight'})
     else:
         edges = SFrame(SArray(parsed).unpack())
         edges.rename({'X.0': 'X1', 'X.1': 'X2', 'X.2': 'Weight'})

     self._graph = self._graph.add_edges(edges,
                                         src_field='X1',
                                         dst_field='X2')
     if not self.is_directed:
         self.to_undirected()
Пример #5
0
def SSSP():
    """Run single-source shortest path from vertex 0 on the BerkStan edges.

    The edge file is tab-separated without a header; its three integer
    columns are used as source, destination and edge weight.
    """
    edge_file = '/home/gengl/Datasets/SSSP/BerkStan/edge.txt'
    edges = SFrame.read_csv(edge_file,
                            delimiter='\t',
                            header=False,
                            column_type_hints=[int, int, int])
    empty_graph = SGraph()
    graph = empty_graph.add_edges(edges, src_field='X1', dst_field='X2')
    model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    model.summary()
Пример #6
0
def CC():
    """Compute connected components on the BerkStan edge list.

    The edge file is tab-separated without a header; its two integer columns
    are used as source and destination vertex ids.
    """
    edge_file = '/home/gengl/Datasets/CC/BerkStan/edge.txt'
    edges = SFrame.read_csv(edge_file,
                            delimiter='\t',
                            header=False,
                            column_type_hints=[int, int])
    empty_graph = SGraph()
    graph = empty_graph.add_edges(edges, src_field='X1', dst_field='X2')
    model = connected_components.create(graph, verbose=True)
    model.summary()
Пример #7
0
 def showPath(self, highlight=None):
     """Build an SGraph from the edge and vertex CSV files and display it.

     Args:
         highlight: forwarded unchanged to ``SGraph.show``.
     """
     edges = SFrame.read_csv(self.edgesFn)
     vertices = SFrame.read_csv(self.verticesFn)
     graph = SGraph(vertices=vertices,
                    edges=edges,
                    vid_field='name',
                    src_field='src',
                    dst_field='dst')

     # Vertex labels come from 'id' (with vlabel_hover enabled) and edge
     # labels from 'relation'.
     show_options = {
         'vlabel': 'id',
         'elabel': 'relation',
         'vlabel_hover': True,
         'highlight': highlight,
         'arrows': True,
     }
     graph.show(**show_options)

     # NOTE(review): presumably this pause keeps the process alive while the
     # visualization is being served -- confirm.
     sleep(30)
Пример #8
0
def PageRank():
    """Run PageRank on the clueweb_20M edge pairs and call summary().

    The edge file is tab-separated without a header; its two integer columns
    are used as source and destination vertex ids.
    """
    edge_file = '/clueweb/PageRank/clueweb_20M/edge_pair.txt'
    edges = SFrame.read_csv(edge_file,
                            delimiter='\t',
                            header=False,
                            column_type_hints=[int, int])
    empty_graph = SGraph()
    graph = empty_graph.add_edges(edges, src_field='X1', dst_field='X2')

    # Threshold 1e-12 with at most 42 iterations.
    settings = {
        'reset_probability': 0.2,
        'threshold': 1e-12,
        'max_iterations': 42,
        '_distributed': True,
    }
    model = pagerank.create(graph, **settings)
    model.summary()
Пример #9
0
def PageRank():
    """Run PageRank on the BerkStan edge list and call summary().

    The edge file is tab-separated without a header; its two integer columns
    are used as source and destination vertex ids.
    """
    edge_file = '/home/gengl/Datasets/PageRank/BerkStan/edge.txt'
    edges = SFrame.read_csv(edge_file,
                            delimiter='\t',
                            header=False,
                            column_type_hints=[int, int])
    empty_graph = SGraph()
    graph = empty_graph.add_edges(edges, src_field='X1', dst_field='X2')

    # Threshold 0.0001 with at most 1000 iterations.
    settings = {
        'reset_probability': 0.2,
        'threshold': 0.0001,
        'max_iterations': 1000,
        '_distributed': True,
    }
    model = pagerank.create(graph, **settings)
    model.summary()
Пример #10
0
def SSSP():
    """Run shortest path from vertex 0 on the Google edges and dump results.

    The edge file is tab-separated without a header; its three integer
    columns are used as source, destination and edge weight. After the model
    summary, one line is written to '/home/gengl/sssp_graphlab' for every
    vertex id in ``range(875713)`` for which ``get_path`` succeeds: the
    ``str()`` of the last element of the returned path.
    """
    url = '/home/gengl/Datasets/SSSP/Google/edge.txt'
    data = SFrame.read_csv(url,
                           delimiter='\t',
                           header=False,
                           column_type_hints=[int, int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    sp_model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    sp_model.summary()
    with open('/home/gengl/sssp_graphlab', 'w') as fo:
        for vid in range(0, 875713):
            try:
                result_pair = sp_model.get_path(vid)
                fo.write(str(result_pair[-1]) + '\n')
            except Exception:
                # Best effort: vertices whose path lookup fails are skipped.
                # Catching Exception rather than using a bare ``except``
                # keeps KeyboardInterrupt and SystemExit able to stop this
                # long loop.
                continue
Пример #11
0
# Output root and first scale come from the environment. int() raises if
# START_SCALE is not set.
outputPath = os.environ.get("OUTPUT_PATH")
startScale = int(os.environ.get("START_SCALE"))

# The tag file holds one comma-separated line; fields 1 and 2 are read as
# the maximum scale and the last scale to process.
tagFile = './tmp'
with open(tagFile, 'r') as f:
    infor = f.readline().strip().split(",")
    maxScale = int(infor[1])
    realEndScale = int(infor[2])

scaleRange = range(startScale, realEndScale + 1)

for scale in scaleRange:

    # Adjacency pairs for this scale are read from
    # <OUTPUT_PATH>/tmp/AdjacentRelationships/<scale>.
    inputPath = os.path.join(outputPath, 'tmp', 'AdjacentRelationships',
                             str(scale))
    url = inputPath
    data = SFrame.read_csv(url, header=False)
    if data.num_rows() == 0:
        # No edges: export an empty table with the same two column names.
        cc_ids = SFrame({"__id": [], "component_id": []})
    else:
        # The first two columns are used as source and destination ids.
        g = SGraph().add_edges(data,
                               src_field=data.column_names()[0],
                               dst_field=data.column_names()[1])
        cc = connected_components.create(g)
        cc_ids = cc.get('component_id')
    path = os.path.join(outputPath, 'tmp', 'ConnectedComponents', str(scale))
    # Use logical ``not`` here. Bitwise ``~`` on a bool yields -2 or -1, both
    # truthy, so makedirs would also run (and raise) when the directory
    # already exists.
    if not os.path.exists(path):
        os.makedirs(path)

    SFrame.export_csv(cc_ids, path)
Пример #12
0
# Script: create a Hadoop-backed GraphLab cluster, load an edge list from
# HDFS, then run triangle counting, PageRank and connected components on it.
import graphlab as gl
import datetime

# Create cluster
c = gl.deploy.hadoop_cluster.create(name='test-cluster',dato_dist_path='hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/dato/tmp',hadoop_conf_dir='/usr/local/hadoop/etc/hadoop',num_containers=3)
print c

from graphlab import SFrame, SGraph
# Tab-separated edge list without a header row.
url = 'hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/data/pokec.txt'
data = SFrame.read_csv(url, delimiter='\t',header=False)
# Note the column order: the second column is the source and the first column
# is the destination of each edge.
g = SGraph().add_edges(data, src_field='X2', dst_field='X1')


# triangle counting
from graphlab import triangle_counting
tc = triangle_counting.create(g)
tc_out = tc['triangle_count']


#pagerank
from graphlab import pagerank
# The datetime.now() results around each algorithm are neither stored nor
# printed.
# NOTE(review): presumably these were inspected in an interactive session --
# confirm.
datetime.datetime.now()
pr = pagerank.create(g,threshold=0.001)
datetime.datetime.now()


# Connected Components
from graphlab import connected_components
datetime.datetime.now()
cc = connected_components.create(g)
datetime.datetime.now()
Пример #13
0
#g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
#           src_field='src', dst_field='dst')

#targets = ['James Bond', 'Moneypenny']
#subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
#subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)

#from graphlab import SGraph, Vertex, Edge

#g = SGraph()
#verts = [Vertex(0, attr={'breed': 'labrador'}),
#         Vertex(1, attr={'breed': 'labrador'}),
#         Vertex(2, attr={'breed': 'vizsla'})]

#g = g.add_vertices(verts)
#g = g.add_edges(Edge(1, 2))

#print g

from graphlab import SFrame, SGraph

# Fetch the edge list and the vertex table of the "bond" sample dataset.
edge_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')
vertex_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv')

# Vertex ids come from the 'name' column; edges run from 'src' to 'dst'.
g = SGraph(
    vertices=vertex_data,
    edges=edge_data,
    vid_field='name',
    src_field='src',
    dst_field='dst',
)

# Display the graph with default options.
g.show()


Пример #14
0
verbose = False
vertexFiles = [
    "City", "Country", "Region", "Advisor", "Category", "Founder",
    "FundingRound", "HQ", "keywords", "Member", "Office", "organizations",
    "PrimaryImage", "TeamMember", "Website", "companies_acquired_by_sap"
]
edgesFiles = [
    "GeoInformation", "acquisitions", "categories_keywords_edges",
    "investments", "keywords_descriptions_edges", "keywords_webpages_edges",
    "relationships", "companies_acquired_by_sap_edges"
]
g = SGraph()

for f in vertexFiles:
    content = SFrame.read_csv(path + f + '.csv',
                              na_values='null',
                              verbose=verbose)
    if 'path' in content.column_names():
        g = g.add_vertices(content, vid_field='path')
    elif 'url' in content.column_names():
        g = g.add_vertices(content, vid_field='url')
    else:
        print "Unknown vid field: ", content.column_names()
        sys.exit()

for f in edgesFiles:
    content = SFrame.read_csv(path + f + '.csv',
                              na_values='null',
                              verbose=verbose)
    if 'src' in content.column_names() and 'dst' in content.column_names():
        g = g.add_edges(content, src_field='src', dst_field='dst')
Пример #15
0
#targets = ['James Bond', 'Moneypenny']
#subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
#subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)

#from graphlab import SGraph, Vertex, Edge

#g = SGraph()
#verts = [Vertex(0, attr={'breed': 'labrador'}),
#         Vertex(1, attr={'breed': 'labrador'}),
#         Vertex(2, attr={'breed': 'vizsla'})]

#g = g.add_vertices(verts)
#g = g.add_edges(Edge(1, 2))

#print g

from graphlab import SFrame, SGraph

# Edge list of the "bond" sample dataset.
edge_data = SFrame.read_csv('http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')
# Vertex table of the same dataset.
vertex_data = SFrame.read_csv('http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv')

# Vertex ids come from the 'name' column; edges run from 'src' to 'dst'.
g = SGraph(vertices=vertex_data, edges=edge_data,
           vid_field='name', src_field='src', dst_field='dst')

# Display the graph with default options.
g.show()
Пример #16
0
def build_data_graph():
    file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
    beneficiaries = SFrame.read_csv(file_path +
                                    "beneficiary_summary_2008_2009.csv")
    bene_packed = beneficiaries.pack_columns(
        column_prefix='chron_',
        dtype=dict,
        new_column_name='chronic_conditions',
        remove_prefix=False)

    #x is a row of bene_packed in the following lambda. We insert the desynpuf_id into the (key, value) tuple, convert the tuple to a list by calling list(),
    #and the outer [] makes sure we emit a list of lists.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [
            list(k + (x['desynpuf_id'], ))
            for k in x['chronic_conditions'].iteritems()
        ])

    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

    g = SGraph()
    bene_chrons['relation'] = 'had_chronic'
    g = g.add_edges(bene_chrons,
                    src_field='desynpuf_id',
                    dst_field='chronic_condition')
    print g.summary()

    #Take out the distinct IDs of patients with chronic conditions to avoid repetition in query
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(),
                                'desynpuf_id')

    #Add edges to the graph indicating which patient had which diagnosed condition
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    cols_to_drop = [
        'clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year'
    ]
    for column in cols_to_drop:
        del tcdc[column]
    #Same patient can be diagnosed with same condition multiple times a year, so take distinct
    tcdc = tcdc.unique()
    #Take diagnosed conditions for only those patients who had some chronic condition in 2008 or 2009. It is possible that
    #such a patient had no diagnosed condition, however.
    bene_chrons_tcdc = bene_with_chrons.join(tcdc)

    bene_chrons_tcdc['relation'] = 'diagnosed_with'
    g = g.add_edges(bene_chrons_tcdc,
                    src_field='desynpuf_id',
                    dst_field='dgns_cd')
    print g.summary()

    #Add edges to the graph indicating which patient had which procedure
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    cols_to_drop = [
        'clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year'
    ]
    for column in cols_to_drop:
        del tcpc[column]
    tcpc = tcpc.unique()
    #Take procedures for only those patients who had some chronic condition in 2008 or 2009. It is possible that
    #such a patient had no procedure, however.
    bene_chrons_tcpc = bene_with_chrons.join(tcpc)
    bene_chrons_tcpc['relation'] = 'underwent'
    g = g.add_edges(bene_chrons_tcpc,
                    src_field='desynpuf_id',
                    dst_field='prcdr_cd')
    print g.summary()

    #Add edges to the graph indicating which patient had which medicine
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    pde = pde.unique()
    #Take medicines for only those patients who had some chronic condition in 2008 or 2009. It is possible that
    #such a patient had no medicine, however.
    bene_chrons_pde = bene_with_chrons.join(pde)
    bene_chrons_pde['relation'] = 'had_drug'
    g = g.add_edges(bene_chrons_pde,
                    src_field='desynpuf_id',
                    dst_field='substancename')
    print g.summary()

    return g
Пример #17
0
import graphlab
from graphlab import SFrame

# Load the training images without their paths and without shuffling.
train_input = graphlab.image_analysis.load_images(
    'train_images/',
    "auto",
    with_path=False,
    random_order=False)

# Load the labels, rename 'Prediction' to 'label', drop 'Id', and attach the
# image column.
train_output = SFrame.read_csv(
    'train_outputs.csv',
    delimiter=',',
    header=True,
    column_type_hints=[int, int])
train_output.rename({'Prediction': 'label'})
train_output.remove_column('Id')
train_output.add_column(train_input.select_column('image'), name='image')

# Split with fraction 0.8 into training and validation sets.
training_data, validation_data = train_output.random_split(0.8)

# Resize every image with arguments (28, 28, 1) before training.
training_data['image'] = graphlab.image_analysis.resize(
    training_data['image'], 28, 28, 1)
validation_data['image'] = graphlab.image_analysis.resize(
    validation_data['image'], 28, 28, 1)

mnist_net = graphlab.deeplearning.get_builtin_neuralnet('mnist')

#net = graphlab.deeplearning.create(sf, target='Prediction')

# Train a classifier on 'label' using the built-in 'mnist' network.
m = graphlab.neuralnet_classifier.create(
    training_data,
    target='label',
    network=mnist_net,
    validation_set=validation_data,
    max_iterations=200)

#test_data = graphlab.image_analysis.load_images('test_images/', "auto", with_path=False, random_order=False)

#pred = m.classify(test_data)
Пример #18
0
def build_data_graph():
  file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
  beneficiaries = SFrame.read_csv(file_path + "beneficiary_summary_2008_2009.csv")
  bene_packed = beneficiaries.pack_columns(column_prefix = 'chron_', dtype = dict, new_column_name = 'chronic_conditions', remove_prefix = False)
  
  #x is a row of bene_packed in the following lambda. We insert the desynpuf_id into the (key, value) tuple, convert the tuple to a list by calling list(), 
  #and the outer [] makes sure we emit a list of lists.
  bene_chrons = bene_packed.flat_map(["chronic_condition_name", "chronic_condition_value", "desynpuf_id"], 
                                     lambda x:[list(k + (x['desynpuf_id'], )) for k in x['chronic_conditions'].iteritems()])
 

  bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
  del bene_chrons['chronic_condition_value']
  bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

  g = SGraph()
  bene_chrons['relation'] = 'had_chronic'
  g = g.add_edges(bene_chrons, src_field = 'desynpuf_id', dst_field = 'chronic_condition')
  print g.summary()
 
  #Take out the distinct IDs of patients with chronic conditions to avoid repetition in query
  bene_with_chrons = SFrame(None)
  bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(), 'desynpuf_id')
  
  #Add edges to the graph indicating which patient had which diagnosed condition
  tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
  cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year']
  for column in cols_to_drop:
     del tcdc[column]
  #Same patient can be diagnosed with same condition multiple times a year, so take distinct
  tcdc = tcdc.unique()
  #Take diagnosed conditions for only those patients who had some chronic condition in 2008 or 2009. It is possible that 
  #such a patient had no diagnosed condition, however.
  bene_chrons_tcdc = bene_with_chrons.join(tcdc)
  
  bene_chrons_tcdc['relation'] = 'diagnosed_with'
  g = g.add_edges(bene_chrons_tcdc, src_field = 'desynpuf_id', dst_field = 'dgns_cd')
  print g.summary()

  
  #Add edges to the graph indicating which patient had which procedure
  tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv", column_type_hints = {'prcdr_cd' : str})
  cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year']
  for column in cols_to_drop:
     del tcpc[column]
  tcpc = tcpc.unique()
  #Take procedures for only those patients who had some chronic condition in 2008 or 2009. It is possible that 
  #such a patient had no procedure, however.
  bene_chrons_tcpc = bene_with_chrons.join(tcpc)
  bene_chrons_tcpc['relation'] = 'underwent'
  g = g.add_edges(bene_chrons_tcpc, src_field = 'desynpuf_id', dst_field = 'prcdr_cd')
  print g.summary()

  #Add edges to the graph indicating which patient had which medicine
  pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
  pde = pde.unique()
  #Take medicines for only those patients who had some chronic condition in 2008 or 2009. It is possible that 
  #such a patient had no medicine, however.
  bene_chrons_pde = bene_with_chrons.join(pde)
  bene_chrons_pde['relation'] = 'had_drug'
  g = g.add_edges(bene_chrons_pde, src_field = 'desynpuf_id', dst_field = 'substancename')
  print g.summary()
   
  return g
Пример #19
0
                                          [0]][0],
        artist_2b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1]
                                          [1]][0],
        artist_3a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2]
                                          [0]][0],
        artist_3b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2]
                                          [1]][0])
    # Pass song names/titles to choose from to HTML template


# Script entry point: load the song feature table and a saved model, build
# three candidate song pairs, collect the user's preferences and generate a
# 10-song playlist around a target tempo.
if __name__ == '__main__':
    # play_count_path is assigned but not used in the visible part of this
    # block.
    play_count_path = 'train_triplets.txt'
    feat_path = 'trimmed_tempos.csv'
    model_path = 'full_two_hour_mod_directory'

    feat_mat = SFrame.read_csv(feat_path)
    model = graphlab.load_model(model_path)
    # k=3 pairs are drawn from the full list of song ids.
    song_pairs = generate_pairs(model, np.array(feat_mat['song_id']), k=3)
    pref_songs = get_user_prefs(feat_mat, song_pairs)
    playlist = get_playlist(model,
                            pref_songs,
                            feat_mat,
                            desired_tempo=160,
                            tempo_margin=10,
                            playlist_length=10)

    # NOTE(review): the use of `.values` suggests playlist is a pandas
    # DataFrame -- confirm against get_playlist.
    songs = playlist['song_name'].values
    artists = playlist['artist'].values
    tempoxs = playlist['tempo_multiplier'].values
    cadences = playlist['effective_tempo'].values
    pl_length = 10
Пример #20
0
 def read_columns_to_sframe(self, column_map):
     """Read this datasource's CSV into an SFrame and rename its columns.

     Args:
         column_map: mapping that is inverted with ``invert_dict`` and then
             passed to ``SFrame.rename``.

     Returns:
         Whatever ``SFrame.rename`` returns for the loaded frame.
     """
     from graphlab import SFrame
     frame = SFrame.read_csv(csv_datasource_path(self.name))
     renames = invert_dict(column_map)
     return frame.rename(renames)
Пример #21
0
# import pandas as pd
import numpy as np
import os
import graphlab as gl
from graphlab import SFrame
import pandas as pd

def merge_data(df):
    """Return '<store_nbr>_<item_nbr>_<date>' built from one record.

    ``store_nbr`` and ``item_nbr`` are converted with ``str()``; ``date`` is
    used as-is and must already be a string.
    """
    store = str(df["store_nbr"])
    item = str(df["item_nbr"])
    return "_".join((store, item, df["date"]))



# Switch: when true, the test set is loaded and joined as well as the
# training set.
ind = True

weather = SFrame.read_csv(os.path.join('..', "data", "weather_modified_3.csv"))

if ind:
  test = SFrame.read_csv(os.path.join('..', "data", "test.csv"))


train = SFrame.read_csv(os.path.join('..', "data", "train.csv"))
key = SFrame.read_csv(os.path.join('..', "data", "key.csv"))

zero_items = SFrame.read_csv(os.path.join('..', 'data', 'zero_items_solid_new.csv'))


# Join with no explicit key arguments.
# NOTE(review): presumably this matches on the column names the two frames
# share -- confirm.
train_new = train.join(zero_items)

if ind:
  test_new = test.join(zero_items)
Пример #22
0
                song_3a=feat_mat['title'][feat_mat['song_id'] == song_pairs[2][0]][0],
                song_3b=feat_mat['title'][feat_mat['song_id'] == song_pairs[2][1]][0],
                artist_1a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[0][0]][0],
                artist_1b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[0][1]][0],
                artist_2a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1][0]][0],
                artist_2b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1][1]][0],
                artist_3a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2][0]][0],
                artist_3b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2][1]][0])
    # Pass song names/titles to choose from to HTML template

if __name__ == '__main__':
    play_count_path = 'train_triplets.txt'
    feat_path = 'trimmed_tempos.csv'
    model_path = 'full_two_hour_mod_directory'

    feat_mat = SFrame.read_csv(feat_path)
    model = graphlab.load_model(model_path)
    song_pairs = generate_pairs(model, np.array(feat_mat['song_id']), k=3)
    pref_songs = get_user_prefs(feat_mat, song_pairs)
    playlist = get_playlist(model, pref_songs, feat_mat,
                            desired_tempo=160,
                            tempo_margin=10,
                            playlist_length=10)

    songs = playlist['song_name'].values
    artists = playlist['artist'].values
    tempoxs = playlist['tempo_multiplier'].values
    cadences = playlist['effective_tempo'].values
    pl_length = 10
    timeline_obj = ""