def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    Load movie_data_random.csv into a frame, build a user/movie graph and
    run the GraphX page rank algorithm.

    Parameters
    ----------
    path : str
        HDFS path to the dataset. May be fully qualified (hdfs://some/path)
        or relative to the ATK rest server's HDFS home directory. The rpm
        install saves the sample datasets under hdfs://user/atkuser/datasets;
        otherwise copy the datasets to HDFS manually and adjust the path.
    ta : trusted analytics python import
        Can be safely ignored when running examples. It is only used during
        integration testing to pass a pre-configured python client reference.

    Returns
    -------
    A dictionary with the frame, graph, and page rank result
    """
    NAME = "PR"

    if ta is None:
        ta = examples.connect()

    # csv schema definition
    schema = [("user_id", ta.int32),
              ("movie_id", ta.int32),
              ("rating", ta.int32),
              ("splits", str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    # drop any frame left over from a previous run so the name is free
    frames = ta.get_frame_names()
    if NAME in frames:
        print("Deleting old '{0}' frame.".format(NAME))
        ta.drop_frames(NAME)

    print("Building frame '{0}'.".format(NAME))
    frame = ta.Frame(csv, NAME)

    print("Inspecting frame '{0}'.".format(NAME))
    print(frame.inspect())

    print("Creating graph '{0}'".format(NAME))

    # drop any graph left over from a previous run
    graphs = ta.get_graph_names()
    if NAME in graphs:
        print("Deleting old '{0}' graph".format(NAME))
        ta.drop_graphs(NAME)

    # Create the graph and declare its vertex/edge schema
    graph = ta.Graph()
    graph.name = NAME
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    # populate the graph from the frame columns
    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print(graph.vertex_count)
    print(graph.edge_count)
    print(graph.vertices["user_id"].inspect(20))
    print(graph.vertices["movie_id"].inspect(20))
    print(graph.edges["rating"].inspect(20))

    result = graph.graphx_pagerank(output_property="PageRank",
                                   max_iterations=2,
                                   convergence_tolerance=0.001)

    # Fix: the inspect() results were computed and silently discarded;
    # print them like every other inspection in this example.
    for frame_name in result["vertex_dictionary"]:
        print(result["vertex_dictionary"][frame_name].inspect(20))
    for frame_name in result["edge_dictionary"]:
        print(result["edge_dictionary"][frame_name].inspect(20))

    return {"frame": frame, "graph": graph, "result": result}
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    Loads movie_data_random.csv into a frame, creates a graph and runs
    the page rank algorithm.

    We are not required to use movie_data_random.csv but rather its
    schema. Any other csv file with the correct schema and delimiter
    will work.

    Parameters
    ----------
    path : str
        The HDFS path to the movie_data_random.csv dataset. If a path is
        not given the default is datasets/movie_data_random.csv. The
        dataset is available in the examples/datasets directory and in
        `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.
        Must be a valid HDFS path, either fully qualified
        (hdfs://some/path) or relative to the ATK rest server's HDFS
        home directory.
    ta : trusted analytics python import
        Can be safely ignored when running examples. It is only used
        during integration testing to pass a pre-configured python
        client reference.

    Returns
    -------
    A dictionary with the frame, graph, and algorithm result

    Dataset
    -------
    Name: movie_data_random.csv

    schema: user_id(int32), movie_id(int32), rating(int32), splits(str)

    delimiter: ,

    sample

    .. code::

        58,-3,4,tr
        59,-3,5,tr
        60,-3,4,va

    Example
    -------
    First import the example, then execute run with the path to the
    dataset.

    .. code::

        >>> import trustedanalytics.examples.movie_graph_small as movie
        >>> movie.run("hdfs://FULL_HDFS_PATH")
    """
    NAME = "PR"

    if ta is None:
        ta = examples.connect()

    # csv schema definition
    schema = [("user_id", ta.int32),
              ("movie_id", ta.int32),
              ("rating", ta.int32),
              ("splits", str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    # drop any frame left over from a previous run so the name is free
    frames = ta.get_frame_names()
    if NAME in frames:
        print("Deleting old '{0}' frame.".format(NAME))
        ta.drop_frames(NAME)

    print("Building frame '{0}'.".format(NAME))
    frame = ta.Frame(csv, NAME)

    print("Inspecting frame '{0}'.".format(NAME))
    print(frame.inspect())

    print("Creating graph '{0}'".format(NAME))

    # drop any graph left over from a previous run
    graphs = ta.get_graph_names()
    if NAME in graphs:
        print("Deleting old '{0}' graph".format(NAME))
        ta.drop_graphs(NAME)

    # Create the graph and declare its vertex/edge schema
    graph = ta.Graph()
    graph.name = NAME
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    # populate the graph from the frame columns
    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print(graph.vertex_count)
    print(graph.edge_count)
    print(graph.vertices["user_id"].inspect(20))
    print(graph.vertices["movie_id"].inspect(20))
    print(graph.edges["rating"].inspect(20))

    result = graph.graphx_pagerank(output_property="PageRank",
                                   max_iterations=2,
                                   convergence_tolerance=0.001)

    # Fix: the inspect() results were computed and silently discarded;
    # print them like every other inspection in this example.
    for frame_name in result["vertex_dictionary"]:
        print(result["vertex_dictionary"][frame_name].inspect(20))
    for frame_name in result["edge_dictionary"]:
        print(result["edge_dictionary"][frame_name].inspect(20))

    return {"frame": frame, "graph": graph, "result": result}
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    Loads movie_data_random.csv into a frame, filters it by movie rating
    and creates a graph.

    We are not required to use movie_data_random.csv but rather its
    schema. Any other csv file with the correct schema and delimiter
    will work.

    Parameters
    ----------
    path : str
        The HDFS path to the movie_data_random.csv dataset. If a path is
        not given the default is datasets/movie_data_random.csv. The
        dataset is available in the examples/datasets directory and in
        `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.
        Must be a valid HDFS path, either fully qualified
        (hdfs://some/path) or relative to the ATK rest server's HDFS
        home directory.
    ta : trusted analytics python import
        Can be safely ignored when running examples. It is only used
        during integration testing to pass a pre-configured python
        client reference.

    Returns
    -------
    A dictionary with the frame and graph

    Dataset
    -------
    Name: movie_data_random.csv

    schema: user_id(int32), movie_id(int32), rating(int32), splits(str)

    delimiter: ,

    sample

    .. code::

        1,-1,4,tr
        2,-1,1,tr
        3,-1,2,tr

    Example
    -------
    First import the example, then execute run with the path to the
    dataset.

    .. code::

        >>> import trustedanalytics.examples.movie_graph_small as movie
        >>> movie.run("hdfs://FULL_HDFS_PATH")
    """
    FRAME_NAME = "MGS_frame"
    GRAPH_NAME = "MGS_graph"

    if ta is None:
        ta = examples.connect()

    # schema of the comma-delimited csv file
    schema = [("user_id", ta.int32),
              ("movie_id", ta.int32),
              ("rating", ta.int32),
              ("splits", str)]
    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    # remove any frame left behind by a previous run
    if FRAME_NAME in ta.get_frame_names():
        print("Deleting old '{0}' frame.".format(FRAME_NAME))
        ta.drop_frames(FRAME_NAME)

    print("Building frame '{0}'.".format(FRAME_NAME))
    frame = ta.Frame(csv, FRAME_NAME)

    print("Inspecting frame '{0}'.".format(FRAME_NAME))
    print(frame.inspect())

    # keep only the top-rated rows before building the graph
    print("Filter frame by rating.")
    frame.filter(lambda row: row.rating >= 5)
    print(frame.inspect())

    print("Creating graph '{0}'.".format(GRAPH_NAME))

    # remove any graph left behind by a previous run
    if GRAPH_NAME in ta.get_graph_names():
        print("Deleting old '{0}' graph.".format(GRAPH_NAME))
        ta.drop_graphs(GRAPH_NAME)

    graph = ta.Graph()
    graph.name = GRAPH_NAME

    # declare the vertex and edge schema
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    # load the filtered frame into the graph
    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print(graph.vertex_count)
    print(graph.edge_count)
    print(graph.vertices["user_id"].inspect(20))
    print(graph.vertices["movie_id"].inspect(20))
    print(graph.edges["rating"].inspect(20))

    return {"frame": frame, "graph": graph}
def run(path=r"datasets/cities.csv", ta=None):
    """
    Load cities.csv into a frame and demonstrate simple column
    operations: dropping a column and re-deriving it with python lambdas.

    Parameters
    ----------
    path : str
        HDFS path to the cities.csv dataset. May be fully qualified
        (hdfs://some/path) or relative to the ATK rest server's HDFS
        home directory. The rpm install saves the sample datasets under
        hdfs://user/atkuser/datasets; otherwise copy them to HDFS
        manually and adjust the path.
    ta : trusted analytics python import
        Can be safely ignored when running examples; it is only used
        during integration testing to pass a pre-configured python
        client reference.

    Returns
    -------
    A dictionary with the final frame
    """
    NAME = "TEST"

    if ta is None:
        ta = examples.connect()

    # schema of the '|' delimited csv file
    schema = [('rank', ta.int32),
              ('city', str),
              ('population_2013', ta.int32),
              ('population_2010', ta.int32),
              ('change', str),
              ('county', str)]
    csv = ta.CsvFile(path, schema, skip_header_lines=1, delimiter='|')

    # remove any frame left behind by a previous run
    if NAME in ta.get_frame_names():
        print("Deleting old '{0}' frame.".format(NAME))
        ta.drop_frames(NAME)

    print("Building frame '{0}'.".format(NAME))
    frame = ta.Frame(csv, NAME)

    print("Inspecting frame '{0}'.".format(NAME))
    print(frame.inspect())

    print("Drop Change column.")
    frame.drop_columns("change")
    print(frame.inspect())

    # re-derive the change column as a percentage
    print("Add Change column.")
    frame.add_columns(
        lambda row: ((row.population_2013 - row.population_2010) /
                     float(row.population_2010)) * 100,
        ("change", ta.float32))
    print(frame.inspect())

    print("Drop Change column.")
    frame.drop_columns("change")
    print(frame.inspect())

    # add both the absolute difference and the percentage change at once
    print("Add Change columns.")
    frame.add_columns(
        lambda row: [row.population_2013 - row.population_2010,
                     ((row.population_2013 - row.population_2010) /
                      float(row.population_2010)) * 100],
        [("difference", ta.int32), ("change", ta.float32)])

    print("Format inspection.")
    print(frame.inspect(10,
                        wrap=10,
                        columns=["city", "population_2013",
                                 "population_2010", "change",
                                 "difference"],
                        round=2))

    return {"frame": frame}
def run(path=r"datasets/cities.csv", ta=None):
    """
    Loads cities.csv into a frame and runs some simple frame operations:
    dropping columns and adding new ones with python lambdas.

    We are not required to use cities.csv but rather its schema. Any
    other csv file with the correct schema and delimiter will work.

    Parameters
    ----------
    path : str
        The HDFS path to the cities.csv dataset. If a path is not given
        the default is datasets/cities.csv. The dataset is available in
        the examples/datasets directory and in
        `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.
        Must be a valid HDFS path, either fully qualified
        (hdfs://some/path) or relative to the ATK rest server's HDFS
        home directory.
    ta : trusted analytics python import
        Can be safely ignored when running examples. It is only used
        during integration testing to pass a pre-configured python
        client reference.

    Returns
    -------
    A dictionary with the final built frame

    Dataset
    -------
    Name: cities.csv

    schema: rank(int32) | city(str) | population_2013(int32) |
    population_2010(int32) | change(str) | county(str)

    delimiter: |

    sample

    .. code::

        1|Portland|609456|583776|4.40%|Multnomah
        2|Salem|160614|154637|3.87%|Marion
        3|Eugene|159190|156185|1.92%|Lane

    Example
    -------
    First import the example, then execute run with the path to the
    dataset.

    .. code::

        >>> import trustedanalytics.examples.frame as frame
        >>> frame.run("hdfs://FULL_HDFS_PATH")
    """
    NAME = "TEST"

    if ta is None:
        ta = examples.connect()

    # schema of the '|' delimited csv file
    schema = [('rank', ta.int32),
              ('city', str),
              ('population_2013', ta.int32),
              ('population_2010', ta.int32),
              ('change', str),
              ('county', str)]
    csv = ta.CsvFile(path, schema, skip_header_lines=1, delimiter='|')

    # remove any frame left behind by a previous run
    if NAME in ta.get_frame_names():
        print("Deleting old '{0}' frame.".format(NAME))
        ta.drop_frames(NAME)

    print("Building frame '{0}'.".format(NAME))
    frame = ta.Frame(csv, NAME)

    print("Inspecting frame '{0}'.".format(NAME))
    print(frame.inspect())

    print("Drop Change column.")
    frame.drop_columns("change")
    print(frame.inspect())

    # re-derive the change column as a percentage
    print("Add Change column.")
    frame.add_columns(
        lambda row: ((row.population_2013 - row.population_2010) /
                     float(row.population_2010)) * 100,
        ("change", ta.float32))
    print(frame.inspect())

    print("Drop Change column.")
    frame.drop_columns("change")
    print(frame.inspect())

    # add both the absolute difference and the percentage change at once
    print("Add Change columns.")
    frame.add_columns(
        lambda row: [row.population_2013 - row.population_2010,
                     ((row.population_2013 - row.population_2010) /
                      float(row.population_2010)) * 100],
        [("difference", ta.int32), ("change", ta.float32)])

    print("Format inspection.")
    print(frame.inspect(10,
                        wrap=10,
                        columns=["city", "population_2013",
                                 "population_2010", "change",
                                 "difference"],
                        round=2))

    return {"frame": frame}
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    Load movie_data_random.csv into a frame, filter it by movie rating
    and build a user/movie graph from the surviving rows.

    Parameters
    ----------
    path : str
        HDFS path to the dataset. May be fully qualified
        (hdfs://some/path) or relative to the ATK rest server's HDFS
        home directory. The rpm install saves the sample datasets under
        hdfs://user/atkuser/datasets; otherwise copy them to HDFS
        manually and adjust the path.
    ta : trusted analytics python import
        Can be safely ignored when running examples; it is only used
        during integration testing to pass a pre-configured python
        client reference.

    Returns
    -------
    A dictionary with the frame and graph
    """
    NAME = "MGS"

    if ta is None:
        ta = examples.connect()

    # schema of the comma-delimited csv file
    schema = [("user_id", ta.int32),
              ("movie_id", ta.int32),
              ("rating", ta.int32),
              ("splits", str)]
    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    # remove any frame left behind by a previous run
    if NAME in ta.get_frame_names():
        print("Deleting old '{0}' frame.".format(NAME))
        ta.drop_frames(NAME)

    print("Building frame '{0}'.".format(NAME))
    frame = ta.Frame(csv, NAME)

    print("Inspecting frame '{0}'.".format(NAME))
    print(frame.inspect())

    # keep only the top-rated rows before building the graph
    print("Filter frame by rating.")
    frame.filter(lambda row: row.rating >= 5)
    print(frame.inspect())

    print("Creating graph '{0}'.".format(NAME))

    # remove any graph left behind by a previous run
    if NAME in ta.get_graph_names():
        print("Deleting old '{0}' graph.".format(NAME))
        ta.drop_graphs(NAME)

    graph = ta.Graph()
    graph.name = NAME

    # declare the vertex and edge schema
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    # load the filtered frame into the graph
    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print(graph.vertex_count)
    print(graph.edge_count)
    print(graph.vertices["user_id"].inspect(20))
    print(graph.vertices["movie_id"].inspect(20))
    print(graph.edges["rating"].inspect(20))

    return {"frame": frame, "graph": graph}