Example #1 (file: pr.py, project: xoltar/atk)
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    The default home directory is hdfs://user/atkuser all the sample data sets are saved to
    hdfs://user/atkuser/datasets when installing through the rpm
    you will need to copy the data sets to hdfs manually otherwise and adjust the data set location path accordingly
    :param path: data set hdfs path can be full and relative path
    """
    NAME = "PR"

    if ta is None:
        ta = examples.connect()

    #csv schema definition
    schema = [("user_id", ta.int32), ("movie_id", ta.int32),
              ("rating", ta.int32), ("splits", str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    frames = ta.get_frame_names()
    if NAME in frames:
        print "Deleting old '{0}' frame.".format(NAME)
        ta.drop_frames(NAME)

    print "Building frame '{0}'.".format(NAME)

    frame = ta.Frame(csv, NAME)

    print "Inspecting frame '{0}'.".format(NAME)

    print frame.inspect()

    print "Creating graph '{0}'".format(NAME)

    # Create a graph
    graphs = ta.get_graph_names()
    if NAME in graphs:
        print "Deleting old '{0}' graph".format(NAME)
        ta.drop_graphs(NAME)

    # Create some rules
    graph = ta.Graph()
    graph.name = NAME
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print graph.vertex_count
    print graph.edge_count
    print graph.vertices["user_id"].inspect(20)
    print graph.vertices["movie_id"].inspect(20)
    print graph.edges["rating"].inspect(20)

    result = graph.graphx_pagerank(output_property="PageRank",
                                   max_iterations=2,
                                   convergence_tolerance=0.001)

    for frame_name in result["vertex_dictionary"]:
        result["vertex_dictionary"][frame_name].inspect(20)

    for frame_name in result["edge_dictionary"]:
        result["edge_dictionary"][frame_name].inspect(20)

    return {"frame": frame, "graph": graph, "result": result}
Example #2 (file: pr.py, project: codeaudit/atk)
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    Loads movie_data_random.csv into a frame, creates a graph and runs the page rank algorithm.
    We are not required to use movie_data_random.csv but rather it's schema. Any other csv file with the correct schema and delimeter will work.

    Parameters
    ----------
    path : str
        The HDFS path to the movie_data_random.csv dataset. If a path is not given the default is datasets/movie_data_random.csv. The dataset is
        available in the examples/datasets directory and in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.
        Must be a valid HDFS path either fully qualified hdfs://some/path or relative the ATK rest servers HDFS home directory.

    ta : trusted analytics python import
        Can be safely ignored when running examples. It is only used during integration testing to pass pre-configured
        python client reference.


    Returns
    -------
        A dictionary with the frame, graph, and algorithm result


    Datasets
    --------
      All the datasets can be found in the examples/datasets directory of the python client or in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.


    Dataset
    -------
      Name : movie_data_random.csv

      schema:

        user_id(int32) , movie_id(int32) , rating(int32) , splits(str)

        sample

        .. code::
          58,-3,4,tr
          59,-3,5,tr
          60,-3,4,va

      delimeter: ,


    Example
    -------
        To run the movie example first import the example.

        .. code::

          >>>import trustedanalytics.examples.movie_graph_small as movie

        After importing you can execute run method with the path to the dataset

        .. code::

          >>>movie.run("hdfs://FULL_HDFS_PATH")



    """
    NAME = "PR"

    if ta is None:
        ta = examples.connect()

    #csv schema definition
    schema = [("user_id", ta.int32), ("movie_id", ta.int32),
              ("rating", ta.int32), ("splits", str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    frames = ta.get_frame_names()
    if NAME in frames:
        print "Deleting old '{0}' frame.".format(NAME)
        ta.drop_frames(NAME)

    print "Building frame '{0}'.".format(NAME)

    frame = ta.Frame(csv, NAME)

    print "Inspecting frame '{0}'.".format(NAME)

    print frame.inspect()

    print "Creating graph '{0}'".format(NAME)

    # Create a graph
    graphs = ta.get_graph_names()
    if NAME in graphs:
        print "Deleting old '{0}' graph".format(NAME)
        ta.drop_graphs(NAME)

    # Create some rules
    graph = ta.Graph()
    graph.name = NAME
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print graph.vertex_count
    print graph.edge_count
    print graph.vertices["user_id"].inspect(20)
    print graph.vertices["movie_id"].inspect(20)
    print graph.edges["rating"].inspect(20)

    result = graph.graphx_pagerank(output_property="PageRank",
                                   max_iterations=2,
                                   convergence_tolerance=0.001)

    for frame_name in result["vertex_dictionary"]:
        result["vertex_dictionary"][frame_name].inspect(20)

    for frame_name in result["edge_dictionary"]:
        result["edge_dictionary"][frame_name].inspect(20)

    return {"frame": frame, "graph": graph, "result": result}
Example #3
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    Loads movie_data_random.csv into a frame, filters it by movie rating and creates a graph.
    We are not required to use movie_data_random.csv but rather it's schema. Any other csv file with the correct schema and delimeter will work.

    Parameters
    ----------
    path : str
        The HDFS path to the movie_data_random.csv dataset. If a path is not given the default is datasets/movie_data_random.csv. The dataset is
        available in the examples/datasets directory and in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.
        Must be a valid HDFS path either fully qualified hdfs://some/path or relative the ATK rest servers HDFS home directory.

    ta : trusted analytics python import
        Can be safely ignored when running examples. It is only used during integration testing to pass pre-configured
        python client reference.


    Returns
    -------
        A dictionary with the frame and graph


    Datasets
    --------
      All the datasets can be found in the examples/datasets directory of the python client or in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.


    Dataset
    -------
      Name : movie_data_random.csv

      schema:

        user_id(int32) , movie_id(int32) , rating(int32) , splits(str)

        sample

        .. code::
          1,-1,4,tr
          2,-1,1,tr
          3,-1,2,tr

      delimeter: ,


    Example
    -------
        To run the movie example first import the example.

        .. code::

          >>>import trustedanalytics.examples.movie_graph_small as movie

        After importing you can execute run method with the path to the dataset

        .. code::

          >>>movie.run("hdfs://FULL_HDFS_PATH")



    """
    FRAME_NAME = "MGS_frame"
    GRAPH_NAME = "MGS_graph"

    if ta is None:
        ta = examples.connect()
    #import trustedanalytics as ta

    #ta.connect()

    #csv schema definition
    schema = [("user_id", ta.int32),
              ("movie_id", ta.int32),
              ("rating", ta.int32),
              ("splits", str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    frames = ta.get_frame_names()
    if FRAME_NAME in frames:
        print "Deleting old '{0}' frame.".format(FRAME_NAME)
        ta.drop_frames(FRAME_NAME)

    print "Building frame '{0}'.".format(FRAME_NAME)

    frame = ta.Frame(csv, FRAME_NAME)

    print "Inspecting frame '{0}'.".format(FRAME_NAME)

    print frame.inspect()

    print "Filter frame by rating."

    frame.filter(lambda row: row.rating >= 5)

    print frame.inspect()

    print "Creating graph '{0}'.".format(GRAPH_NAME)

    # Create a graph
    graphs = ta.get_graph_names()
    if GRAPH_NAME in graphs:
        print "Deleting old '{0}' graph.".format(GRAPH_NAME)
        ta.drop_graphs(GRAPH_NAME)


    graph = ta.Graph()
    graph.name = GRAPH_NAME
    # Create some rules
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    #add data to graph
    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print graph.vertex_count
    print graph.edge_count
    print graph.vertices["user_id"].inspect(20)
    print graph.vertices["movie_id"].inspect(20)
    print graph.edges["rating"].inspect(20)


    return {"frame": frame, "graph": graph}
Example #4 (file: frame.py, project: xoltar/atk)
def run(path=r"datasets/cities.csv", ta=None):
    """
    The default home directory is hdfs://user/atkuser all the sample data sets are saved to
    hdfs://user/atkuser/datasets when installing through the rpm
    you will need to copy the data sets to hdfs manually otherwise and adjust the data set location path accordingly
    :param path: data set hdfs path can be full and relative path
    """
    NAME = "TEST"

    if ta is None:
        ta = examples.connect()

    #csv schema definition
    schema = [('rank', ta.int32), ('city', str), ('population_2013', ta.int32),
              ('population_2010', ta.int32), ('change', str), ('county', str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1, delimiter='|')

    frames = ta.get_frame_names()
    if NAME in frames:
        print "Deleting old '{0}' frame.".format(NAME)
        ta.drop_frames(NAME)

    print "Building frame '{0}'.".format(NAME)

    frame = ta.Frame(csv, NAME)

    print "Inspecting frame '{0}'.".format(NAME)

    print frame.inspect()

    print "Drop Change column."

    frame.drop_columns("change")

    print frame.inspect()

    print "Add Change column."
    frame.add_columns(
        lambda row: ((row.population_2013 - row.population_2010) / float(
            row.population_2010)) * 100, ("change", ta.float32))

    print frame.inspect()

    print "Drop Change column."

    frame.drop_columns("change")

    print frame.inspect()

    print "Add Change columns."

    frame.add_columns(
        lambda row: [
            row.population_2013 - row.population_2010,
            ((row.population_2013 - row.population_2010) / float(
                row.population_2010)) * 100
        ], [("difference", ta.int32), ("change", ta.float32)])

    print "Format inspection."
    print frame.inspect(10,
                        wrap=10,
                        columns=[
                            "city", "population_2013", "population_2010",
                            "change", "difference"
                        ],
                        round=2)

    return {"frame": frame}
Example #5 (file: frame.py, project: AllanY/atk)
def run(path=r"datasets/cities.csv", ta=None):
    """
    Loads cities.csv into a frame and runs some simple frame operations. We will be dropping columns and adding new ones with python lambdas.
    We are not required to use cities.csv but rather it's schema. Any other csv file with the correct schema and delimeter will work.

    Parameters
    ----------
    path : str
        The HDFS path to the cities.csv dataset. If a path is not given the default is datasets/cities.csv. The dataset is
        available in the examples/datasets directory and in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.
        Must be a valid HDFS path either fully qualified hdfs://some/path or relative the ATK rest servers HDFS home directory.

    ta : trusted analytics python import
        Can be safely ignored when running examples. It is only used during integration testing to pass pre-configured
        python client reference.


    Returns
    -------
        A dictionary with the final built frame


    Datasets
    --------
      All the datasets can be found in the examples/datasets directory of the python client or in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.


    Dataset
    -------
      Name : cities.csv

      schema:
        rank(int32) | city(str) | population_2013(int32) | population_2010(int32) | change(str) | county(str)

        sample

        .. code::

          1|Portland|609456|583776|4.40%|Multnomah
          2|Salem|160614|154637|3.87%|Marion
          3|Eugene|159190|156185|1.92%|Lane

      delimeter: |


    Example
    -------
        To run the frame example first import the example.

        .. code::

          >>>import trustedanalytics.examples.frame as frame

        After importing you can execute run method with the path to the dataset

        .. code::

          >>>frame.run("hdfs://FULL_HDFS_PATH")



    """
    NAME = "TEST"

    if ta is None:
        ta = examples.connect()

    #csv schema definition
    schema = [('rank', ta.int32),
              ('city', str),
              ('population_2013', ta.int32),
              ('population_2010', ta.int32),
              ('change', str),
              ('county', str)]

    #import csv file
    csv = ta.CsvFile(path, schema, skip_header_lines=1, delimiter='|')

    frames = ta.get_frame_names()
    if NAME in frames:
        print "Deleting old '{0}' frame.".format(NAME)
        ta.drop_frames(NAME)

    print "Building frame '{0}'.".format(NAME)

    frame = ta.Frame(csv, NAME)

    print "Inspecting frame '{0}'.".format(NAME)

    print frame.inspect()

    print "Drop Change column."

    frame.drop_columns("change")

    print frame.inspect()

    print "Add Change column."
    frame.add_columns(lambda row: ((row.population_2013 - row.population_2010)/float(row.population_2010)) * 100,
                      ("change", ta.float32))

    print frame.inspect()

    print "Drop Change column."

    frame.drop_columns("change")

    print frame.inspect()

    print "Add Change columns."

    frame.add_columns(lambda row: [row.population_2013 - row.population_2010, ((row.population_2013 - row.population_2010)/float(row.population_2010)) * 100 ], [("difference", ta.int32 ), ("change", ta.float32 )])

    print "Format inspection."
    print frame.inspect(10, wrap=10, columns=["city", "population_2013", "population_2010", "change", "difference"], round=2)


    return {"frame": frame}
Example #6
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    The default home directory is hdfs://user/atkuser all the sample data sets are saved to
    hdfs://user/atkuser/datasets when installing through the rpm
    you will need to copy the data sets to hdfs manually otherwise and adjust the data set location path accordingly
    :param path: data set hdfs path can be full and relative path
    """
    NAME = "MGS"

    if ta is None:
        ta = examples.connect()
    #import trustedanalytics as ta

    #ta.connect()

    #csv schema definition
    schema = [("user_id", ta.int32), ("movie_id", ta.int32),
              ("rating", ta.int32), ("splits", str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    frames = ta.get_frame_names()
    if NAME in frames:
        print "Deleting old '{0}' frame.".format(NAME)
        ta.drop_frames(NAME)

    print "Building frame '{0}'.".format(NAME)

    frame = ta.Frame(csv, NAME)

    print "Inspecting frame '{0}'.".format(NAME)

    print frame.inspect()

    print "Filter frame by rating."

    frame.filter(lambda row: row.rating >= 5)

    print frame.inspect()

    print "Creating graph '{0}'.".format(NAME)

    # Create a graph
    graphs = ta.get_graph_names()
    if NAME in graphs:
        print "Deleting old '{0}' graph.".format(NAME)
        ta.drop_graphs(NAME)

    graph = ta.Graph()
    graph.name = NAME
    # Create some rules
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    #add data to graph
    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print graph.vertex_count
    print graph.edge_count
    print graph.vertices["user_id"].inspect(20)
    print graph.vertices["movie_id"].inspect(20)
    print graph.edges["rating"].inspect(20)

    return {"frame": frame, "graph": graph}
Example #7 (file: pr.py, project: codeaudit/atk)
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    Loads movie_data_random.csv into a frame, creates a graph and runs the page rank algorithm.
    We are not required to use movie_data_random.csv but rather it's schema. Any other csv file with the correct schema and delimeter will work.

    Parameters
    ----------
    path : str
        The HDFS path to the movie_data_random.csv dataset. If a path is not given the default is datasets/movie_data_random.csv. The dataset is
        available in the examples/datasets directory and in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.
        Must be a valid HDFS path either fully qualified hdfs://some/path or relative the ATK rest servers HDFS home directory.

    ta : trusted analytics python import
        Can be safely ignored when running examples. It is only used during integration testing to pass pre-configured
        python client reference.


    Returns
    -------
        A dictionary with the frame, graph, and algorithm result


    Datasets
    --------
      All the datasets can be found in the examples/datasets directory of the python client or in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.


    Dataset
    -------
      Name : movie_data_random.csv

      schema:

        user_id(int32) , movie_id(int32) , rating(int32) , splits(str)

        sample

        .. code::
          58,-3,4,tr
          59,-3,5,tr
          60,-3,4,va

      delimeter: ,


    Example
    -------
        To run the movie example first import the example.

        .. code::

          >>>import trustedanalytics.examples.movie_graph_small as movie

        After importing you can execute run method with the path to the dataset

        .. code::

          >>>movie.run("hdfs://FULL_HDFS_PATH")



    """
    NAME = "PR"

    if ta is None:
        ta = examples.connect()

    #csv schema definition
    schema = [("user_id", ta.int32),
              ("movie_id", ta.int32),
              ("rating", ta.int32),
              ("splits", str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    frames = ta.get_frame_names()
    if NAME in frames:
        print "Deleting old '{0}' frame.".format(NAME)
        ta.drop_frames(NAME)

    print "Building frame '{0}'.".format(NAME)

    frame = ta.Frame(csv, NAME)

    print "Inspecting frame '{0}'.".format(NAME)

    print frame.inspect()

    print "Creating graph '{0}'".format(NAME)

    # Create a graph
    graphs = ta.get_graph_names()
    if NAME in graphs:
        print "Deleting old '{0}' graph".format(NAME)
        ta.drop_graphs(NAME)

    # Create some rules
    graph = ta.Graph()
    graph.name = NAME
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print graph.vertex_count
    print graph.edge_count
    print graph.vertices["user_id"].inspect(20)
    print graph.vertices["movie_id"].inspect(20)
    print graph.edges["rating"].inspect(20)

    result = graph.graphx_pagerank(output_property="PageRank", max_iterations=2, convergence_tolerance=0.001)

    for frame_name in result["vertex_dictionary"]:
        result["vertex_dictionary"][frame_name].inspect(20)

    for frame_name in result["edge_dictionary"]:
        result["edge_dictionary"][frame_name].inspect(20)


    return {"frame": frame, "graph": graph, "result": result}
Example #8
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    Loads movie_data_random.csv into a frame, filters it by movie rating and creates a graph.
    We are not required to use movie_data_random.csv but rather it's schema. Any other csv file with the correct schema and delimeter will work.

    Parameters
    ----------
    path : str
        The HDFS path to the movie_data_random.csv dataset. If a path is not given the default is datasets/movie_data_random.csv. The dataset is
        available in the examples/datasets directory and in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.
        Must be a valid HDFS path either fully qualified hdfs://some/path or relative the ATK rest servers HDFS home directory.

    ta : trusted analytics python import
        Can be safely ignored when running examples. It is only used during integration testing to pass pre-configured
        python client reference.


    Returns
    -------
        A dictionary with the frame and graph


    Datasets
    --------
      All the datasets can be found in the examples/datasets directory of the python client or in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.


    Dataset
    -------
      Name : movie_data_random.csv

      schema:

        user_id(int32) , movie_id(int32) , rating(int32) , splits(str)

        sample

        .. code::
          1,-1,4,tr
          2,-1,1,tr
          3,-1,2,tr

      delimeter: ,


    Example
    -------
        To run the movie example first import the example.

        .. code::

          >>>import trustedanalytics.examples.movie_graph_small as movie

        After importing you can execute run method with the path to the dataset

        .. code::

          >>>movie.run("hdfs://FULL_HDFS_PATH")



    """
    FRAME_NAME = "MGS_frame"
    GRAPH_NAME = "MGS_graph"

    if ta is None:
        ta = examples.connect()
    #import trustedanalytics as ta

    #ta.connect()

    #csv schema definition
    schema = [("user_id", ta.int32), ("movie_id", ta.int32),
              ("rating", ta.int32), ("splits", str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    frames = ta.get_frame_names()
    if FRAME_NAME in frames:
        print "Deleting old '{0}' frame.".format(FRAME_NAME)
        ta.drop_frames(FRAME_NAME)

    print "Building frame '{0}'.".format(FRAME_NAME)

    frame = ta.Frame(csv, FRAME_NAME)

    print "Inspecting frame '{0}'.".format(FRAME_NAME)

    print frame.inspect()

    print "Filter frame by rating."

    frame.filter(lambda row: row.rating >= 5)

    print frame.inspect()

    print "Creating graph '{0}'.".format(GRAPH_NAME)

    # Create a graph
    graphs = ta.get_graph_names()
    if GRAPH_NAME in graphs:
        print "Deleting old '{0}' graph.".format(GRAPH_NAME)
        ta.drop_graphs(GRAPH_NAME)

    graph = ta.Graph()
    graph.name = GRAPH_NAME
    # Create some rules
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    #add data to graph
    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print graph.vertex_count
    print graph.edge_count
    print graph.vertices["user_id"].inspect(20)
    print graph.vertices["movie_id"].inspect(20)
    print graph.edges["rating"].inspect(20)

    return {"frame": frame, "graph": graph}
Example #9 (file: frame.py, project: codeaudit/atk)
def run(path=r"datasets/cities.csv", ta=None):
    """
    Loads cities.csv into a frame and runs some simple frame operations. We will be dropping columns and adding new ones with python lambdas.
    We are not required to use cities.csv but rather it's schema. Any other csv file with the correct schema and delimeter will work.

    Parameters
    ----------
    path : str
        The HDFS path to the cities.csv dataset. If a path is not given the default is datasets/cities.csv. The dataset is
        available in the examples/datasets directory and in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.
        Must be a valid HDFS path either fully qualified hdfs://some/path or relative the ATK rest servers HDFS home directory.

    ta : trusted analytics python import
        Can be safely ignored when running examples. It is only used during integration testing to pass pre-configured
        python client reference.


    Returns
    -------
        A dictionary with the final built frame


    Datasets
    --------
      All the datasets can be found in the examples/datasets directory of the python client or in `github<https://github.com/trustedanalytics/atk/tree/master/python-client/trustedanalytics/examples/datasets>`__.


    Dataset
    -------
      Name : cities.csv

      schema:
        rank(int32) | city(str) | population_2013(int32) | population_2010(int32) | change(str) | county(str)

        sample

        .. code::

          1|Portland|609456|583776|4.40%|Multnomah
          2|Salem|160614|154637|3.87%|Marion
          3|Eugene|159190|156185|1.92%|Lane

      delimeter: |


    Example
    -------
        To run the frame example first import the example.

        .. code::

          >>>import trustedanalytics.examples.frame as frame

        After importing you can execute run method with the path to the dataset

        .. code::

          >>>frame.run("hdfs://FULL_HDFS_PATH")



    """
    NAME = "TEST"

    if ta is None:
        ta = examples.connect()

    #csv schema definition
    schema = [('rank', ta.int32),
              ('city', str),
              ('population_2013', ta.int32),
              ('population_2010', ta.int32),
              ('change', str),
              ('county', str)]

    #import csv file
    csv = ta.CsvFile(path, schema, skip_header_lines=1, delimiter='|')

    frames = ta.get_frame_names()
    if NAME in frames:
        print "Deleting old '{0}' frame.".format(NAME)
        ta.drop_frames(NAME)

    print "Building frame '{0}'.".format(NAME)

    frame = ta.Frame(csv, NAME)

    print "Inspecting frame '{0}'.".format(NAME)

    print frame.inspect()

    print "Drop Change column."

    frame.drop_columns("change")

    print frame.inspect()

    print "Add Change column."
    frame.add_columns(lambda row: ((row.population_2013 - row.population_2010)/float(row.population_2010)) * 100,
                      ("change", ta.float32))

    print frame.inspect()

    print "Drop Change column."

    frame.drop_columns("change")

    print frame.inspect()

    print "Add Change columns."

    frame.add_columns(lambda row: [row.population_2013 - row.population_2010, ((row.population_2013 - row.population_2010)/float(row.population_2010)) * 100 ], [("difference", ta.int32 ), ("change", ta.float32 )])

    print "Format inspection."
    print frame.inspect(10, wrap=10, columns=["city", "population_2013", "population_2010", "change", "difference"], round=2)


    return {"frame": frame}
Example #10 (file: pr.py, project: rainiraj/atk)
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    The default home directory is hdfs://user/atkuser all the sample data sets are saved to
    hdfs://user/atkuser/datasets when installing through the rpm
    you will need to copy the data sets to hdfs manually otherwise and adjust the data set location path accordingly
    :param path: data set hdfs path can be full and relative path
    """
    NAME = "PR"

    if ta is None:
        ta = examples.connect()

    #csv schema definition
    schema = [("user_id", ta.int32),
              ("movie_id", ta.int32),
              ("rating", ta.int32),
              ("splits", str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    frames = ta.get_frame_names()
    if NAME in frames:
        print "Deleting old '{0}' frame.".format(NAME)
        ta.drop_frames(NAME)
        
    print "Building frame '{0}'.".format(NAME)

    frame = ta.Frame(csv, NAME)

    print "Inspecting frame '{0}'.".format(NAME)

    print frame.inspect()

    print "Creating graph '{0}'".format(NAME)

    # Create a graph
    graphs = ta.get_graph_names()
    if NAME in graphs:
        print "Deleting old '{0}' graph".format(NAME)
        ta.drop_graphs(NAME)

    # Create some rules
    graph = ta.Graph()
    graph.name = NAME
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print graph.vertex_count
    print graph.edge_count
    print graph.vertices["user_id"].inspect(20)
    print graph.vertices["movie_id"].inspect(20)
    print graph.edges["rating"].inspect(20)

    result = graph.graphx_pagerank(output_property="PageRank", max_iterations=2, convergence_tolerance=0.001)

    for frame_name in result["vertex_dictionary"]:
        result["vertex_dictionary"][frame_name].inspect(20)

    for frame_name in result["edge_dictionary"]:
        result["edge_dictionary"][frame_name].inspect(20)


    return {"frame": frame, "graph": graph, "result": result}
Example #11
def run(path=r"datasets/movie_data_random.csv", ta=None):
    """
    The default home directory is hdfs://user/atkuser all the sample data sets are saved to
    hdfs://user/atkuser/datasets when installing through the rpm
    you will need to copy the data sets to hdfs manually otherwise and adjust the data set location path accordingly
    :param path: data set hdfs path can be full and relative path
    """
    NAME = "MGS"

    if ta is None:
        ta = examples.connect()
    #import trustedanalytics as ta

    #ta.connect()

    #csv schema definition
    schema = [("user_id", ta.int32),
              ("movie_id", ta.int32),
              ("rating", ta.int32),
              ("splits", str)]

    csv = ta.CsvFile(path, schema, skip_header_lines=1)

    frames = ta.get_frame_names()
    if NAME in frames:
        print "Deleting old '{0}' frame.".format(NAME)
        ta.drop_frames(NAME)
        
    print "Building frame '{0}'.".format(NAME)

    frame = ta.Frame(csv, NAME)

    print "Inspecting frame '{0}'.".format(NAME)

    print frame.inspect()

    print "Filter frame by rating."

    frame.filter(lambda row: row.rating >= 5)

    print frame.inspect()

    print "Creating graph '{0}'.".format(NAME)

    # Create a graph
    graphs = ta.get_graph_names()
    if NAME in graphs:
        print "Deleting old '{0}' graph.".format(NAME)
        ta.drop_graphs(NAME)


    graph = ta.Graph()
    graph.name = NAME
    # Create some rules
    graph.define_vertex_type("user_id")
    graph.define_vertex_type("movie_id")
    graph.define_edge_type("rating", "user_id", "movie_id", directed=True)

    #add data to graph
    graph.vertices["user_id"].add_vertices(frame, 'user_id')
    graph.vertices["movie_id"].add_vertices(frame, 'movie_id')
    graph.edges['rating'].add_edges(frame, 'user_id', 'movie_id', ['rating'])

    print graph.vertex_count
    print graph.edge_count
    print graph.vertices["user_id"].inspect(20)
    print graph.vertices["movie_id"].inspect(20)
    print graph.edges["rating"].inspect(20)


    return {"frame": frame, "graph": graph}