Example #1
def import_jdbc(connection_url, table_name, tc=implicit):
    """
    Import data from a JDBC table into a frame.

    Parameters
    ----------

    :param connection_url: JDBC connection url to database server
    :param table_name: JDBC table name
    :return: returns frame with jdbc table data

    Examples
    --------
    Load a frame from a jdbc table specifying the connection url to the database server.

    <skip>
        >>> url = "jdbc:postgresql://localhost/postgres"
        >>> tb_name = "demo_test"

        >>> frame = tc.frame.import_jdbc(url, tb_name)
        -etc-

        >>> frame.inspect()
        [#]  a  b    c   d
        ==================
        [0]  1  0.2  -2  5
        [1]  2  0.4  -1  6
        [2]  3  0.6   0  7
        [3]  4  0.8   1  8

        >>> frame.schema
        [(u'a', int), (u'b', float), (u'c', int), (u'd', int)]
    </skip>
    """
    if not isinstance(connection_url, basestring):
        raise ValueError("connection url parameter must be a string, but is {0}.".format(type(connection_url)))
    if not isinstance(table_name, basestring):
        raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
    if tc is implicit:
        implicit.error("tc")
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc(tc.jutils.get_scala_sc(), connection_url, table_name)

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
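
A minimal sketch of the fail-fast validation above (assuming a live TkContext named tc; the traceback text is illustrative):

    >>> tc.frame.import_jdbc(42, "demo_test")
    Traceback (most recent call last):
        ...
    ValueError: connection url parameter must be a string, but is <type 'int'>.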
Example #2
def import_hive(hive_query, tc=implicit):
    """
    Import data from a Hive table into a frame.

    Define the SQL query used to retrieve the data from the Hive table.

    Only a subset of Hive data types is supported.

    Data Type   Support
    ___________ ___________________________

    boolean     cast to int

    bigint      native support
    int         native support
    tinyint     cast to int
    smallint    cast to int

    decimal     cast to double, may lose precision
    double      native support
    float       native support

    date        cast to string
    string      native support
    timestamp   cast to string
    varchar     cast to string

    arrays      not supported
    binary      not supported
    char        not supported
    maps        not supported
    structs     not supported
    union       not supported

    Parameters
    ----------

    :param hive_query: (str) hive query to fetch data from table
    :return: (Frame) returns frame with hive table data

    Examples
    --------
    Load data into a frame from a Hive table using a Hive query.

    <skip>
        >>> h_query = "select * from demo_test"
        >>> frame = tc.frame.import_hive(h_query)
        -etc-

        >>> frame.inspect()
        [#]  number  strformat
        ======================
        [0]       1  one
        [1]       2  two
        [2]       3  three
        [3]       4  four
    </skip>

    """
    if not isinstance(hive_query, basestring):
        raise ValueError(
            "hive query parameter must be a string, but is {0}.".format(
                type(hive_query)))
    if tc is implicit:
        implicit.error("tc")
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHive(
        tc.jutils.get_scala_sc(), hive_query)

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
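
Because hive_query is passed through unchanged, any valid HiveQL SELECT works, not just "select *". A hedged sketch (hypothetical table and columns; per the support table above, boolean columns come back as 0/1 ints):

    >>> frame = tc.frame.import_hive("select number, strformat from demo_test where number > 2")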
Example #3
def import_csv(path,
               delimiter=",",
               header=False,
               inferschema=True,
               schema=None,
               tc=implicit):
    """
    Creates a frame with data from a csv file.

    :param path: Full path to the csv file
    :param delimiter: A string which indicates the separation of data fields.  This is usually a single
                      character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: Boolean value indicating if the first line of the file will be used to name columns,
                   and not be included in the data.  The default value is false.
    :param inferschema: Boolean value indicating if the column types will be automatically inferred.  It
                        requires one extra pass over the data and is true by default.
    :param schema: Optionally specify the schema for the dataset.  The number of columns specified in the
                   schema must match the number of columns in the csv file provided.
    :return: Frame that contains the data from the csv file

    Examples
    --------
    Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
    there is a header and to infer the schema based on the data.

    .. code::

        >>> file_path = "../integration-tests/datasets/cities.csv"
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, inferschema=True)
        -etc-
        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes
        >>> frame.schema
        [('rank', int),
         ('city', str),
         ('population_2013', int),
         ('population_2010', int),
         ('change', str),
         ('county', str)]
    """

    from pyspark.sql import SQLContext

    if schema is not None:
        inferschema = False  # if a custom schema is provided, don't waste time inferring the schema during load
    if not isinstance(header, bool):
        raise ValueError(
            "header parameter must be a boolean, but is {0}.".format(
                type(header)))
    if not isinstance(inferschema, bool):
        raise ValueError(
            "inferschema parameter must be a boolean, but is {0}.".format(
                type(inferschema)))
    if tc is implicit:
        implicit.error('tc')
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    header_str = str(header).lower()
    inferschema_str = str(inferschema).lower()
    sqlContext = SQLContext(tc.sc)
    df = sqlContext.read.format("com.databricks.spark.csv").options(
        delimiter=delimiter, header=header_str,
        inferschema=inferschema_str).load(path)
    df_schema = []

    if schema is None:
        for column in df.schema.fields:
            datatype = str
            import sparktk.dtypes as dtypes
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(
                    type(column.dataType))
            except ValueError:
                print "Warning: No mapping for type: {0}. Column '{1}' will default to use strings.".format(
                    str(column.dataType), column.name)
            df_schema.append((column.name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError(
                "Bad schema value.  The number of columns in the custom schema ({0}) must match the "
                "number of columns in the csv file data ({1}).".format(
                    custom_column_count, df_column_count))
        df_schema = schema
    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, df.rdd, df_schema)
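
Supplying an explicit schema skips inference (the code above forces inferschema to False) and labels the columns directly. A minimal sketch, assuming this constructor is exposed as tc.frame.import_csv like the other importers and reusing the hypothetical cities.csv:

    >>> schema = [("rank", int), ("city", str), ("population_2013", int),
    ...           ("population_2010", int), ("change", str), ("county", str)]
    >>> frame = tc.frame.import_csv("../integration-tests/datasets/cities.csv", "|", header=True, schema=schema)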
Example #4
def create(data, schema=None, validate_schema=False, tc=implicit):
    """
    Creates a frame from the given data and schema.  If no schema data types are provided, the schema is inferred
    based on the data in the first 100 rows.

    If schema validation is enabled, all data is checked to ensure that it matches the schema.  If the data does
    not match the schema's data type, it attempts to cast the data to the proper data type.  When the data cannot
    be cast to the schema's data type, the item will be missing (None) in the frame.

    :param data: Data source
    :param schema: Optionally specify a schema (list of tuples of string column names and data type), column names
                   (list of strings, and the column data types will be inferred) or None (column data types will be
                   inferred and column names will be numbered like C0, C1, C2, etc).
    :param validate_schema: When True, all data is checked to ensure that it matches the schema.  If the data does
                            not match the schema's data type, it attempts to cast the data to the proper data type.
                            When the data cannot be cast to the schema's data type, a ValueError is raised.
                            Defaults to False.
    :param tc: TkContext
    :return: Frame loaded with the specified data


    Examples
    --------

    Create a frame with the specified data.

        >>> data = [["Bob", 30, 8], ["Jim", 45, 9.5], ["Sue", 25, 7], ["George", 15, 6], ["Jennifer", 18, 8.5]]
        >>> frame = tc.frame.create(data)

    Since no schema is provided, the schema will be inferred.  Note that the data set had a mix of integers and
    floats in the third column.  The schema will use the most general data type from the data that it sees, so in
    this example, the column is treated as a float.

        >>> frame.schema
        [('C0', str), ('C1', int), ('C2', float)]

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30    8
        [1]  Jim       45  9.5
        [2]  Sue       25    7
        [3]  George    15    6
        [4]  Jennifer  18  8.5

    We could also enable schema validation, which checks the data against the schema.  If the data does not match the
    schema's data type, it attempts to cast the data to the proper data type.

        >>> frame = tc.frame.create(data, validate_schema=True)

    In this example with schema validation enabled, the integers in column C2 get casted to floats:

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30  8.0
        [1]  Jim       45  9.5
        [2]  Sue       25  7.0
        [3]  George    15  6.0
        [4]  Jennifer  18  8.5

    We could also provide a list of column names when creating the frame.  When a list of column names is provided,
    the data types for the schema are still inferred, but the columns in the schema are labeled with the specified names.

        >>> frame = tc.frame.create(data, schema=["name", "age", "shoe_size"], validate_schema=True)

        >>> frame.schema
        [('name', str), ('age', int), ('shoe_size', float)]

        >>> frame.inspect()
        [#]  name      age  shoe_size
        =============================
        [0]  Bob        30        8.0
        [1]  Jim        45        9.5
        [2]  Sue        25        7.0
        [3]  George     15        6.0
        [4]  Jennifer   18        8.5

    Note that if validate_schema is enabled and a value cannot be parsed as the data type specified in the schema,
    it will show up as missing (None).  For example, consider the following frame where columns are defined as
    integers, but the data specified has a string in the second row.

        >>> data = [[1, 2, 3], [4, "five", 6]]
        >>> schema = [("a", int), ("b", int), ("c", int)]

        >>> frame = tc.frame.create(data, schema, validate_schema = True)

        >>> frame.inspect()
        [#]  a  b     c
        ===============
        [0]  1     2  3
        [1]  4  None  6

    Note that the spot where the string was located has its value missing (None), since it couldn't be parsed to an
    integer.  If validate_schema is disabled, no attempt is made to parse the data to the data type specified by the
    schema, and further frame operations may fail due to the data type discrepancy.

    """
    if tc is implicit:
        implicit.error('tc')
    from sparktk.frame.frame import Frame
    return Frame(tc, data, schema, validate_schema)
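
The data argument is not limited to Python lists; import_pandas (Example #13) uses this same constructor with an existing RDD when re-validating. A hedged sketch of that pattern (assuming frame is an existing sparktk frame):

    >>> revalidated = tc.frame.create(frame.rdd, frame.schema, validate_schema=True)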
Example #5
def import_hive(hive_query, tc=implicit):
    """
    Import data from a Hive table into a frame.

    Define the SQL query used to retrieve the data from the Hive table.

    Only a subset of Hive data types is supported.

    Data Type   Support
    ___________ ___________________________

    boolean     cast to int

    bigint      native support
    int         native support
    tinyint     cast to int
    smallint    cast to int

    decimal     cast to double, may lose precision
    double      native support
    float       native support

    date        cast to string
    string      native support
    timestamp   cast to string
    varchar     cast to string

    arrays      not supported
    binary      not supported
    char        not supported
    maps        not supported
    structs     not supported
    union       not supported

    Parameters
    ----------

    :param hive_query: (str) hive query to fetch data from table
    :return: (Frame) returns frame with hive table data

    Examples
    --------
    Load data into a frame from a Hive table using a Hive query.

    <skip>
        >>> h_query = "select * from demo_test"
        >>> frame = tc.frame.import_hive(h_query)
        -etc-

        >>> frame.inspect()
        [#]  number  strformat
        ======================
        [0]       1  one
        [1]       2  two
        [2]       3  three
        [3]       4  four
    </skip>

    """
    if not isinstance(hive_query, basestring):
        raise ValueError("hive query parameter must be a string, but is {0}.".format(type(hive_query)))
    if tc is implicit:
        implicit.error("tc")
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHive(tc.jutils.get_scala_sc(), hive_query)

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
Example #6
def import_hbase(table_name,
                 schema,
                 start_tag=None,
                 end_tag=None,
                 tc=implicit):
    """
    Import data from an HBase table into a frame.

    :param table_name: hbase table name
    :param schema: hbase schema as a List of List(string) (columnFamily, columnName, dataType for cell value)
    :param start_tag: optional start tag for filtering
    :param end_tag: optional end tag for filtering
    :return: frame with data from hbase table

    Example
    ---------
    Load data into a frame from an HBase table.

    <skip>
        >>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int],["test_family", "b", float], ["test_family", "c", int],["test_family", "d", int]])
        -etc-
        >>> frame.inspect()
        [#]  test_family_a  test_family_b  test_family_c  test_family_d
        ===============================================================
        [0]              1            0.2             -2              5
        [1]              2            0.4             -1              6
        [2]              3            0.6              0              7
        [3]              4            0.8              1              8

        Using start_tag and end_tag (HBase assigns a unique row id to each row in a table):
        start_tag: the row id at which the scan should start
        end_tag: the row id at which the scan should end

        Assume the HBase table "test_startendtag" already has data in the "startendtag" column family, with a
        single column named "number" whose values run from 1 to 99.  The row ids are generated by HBase.

        Sample HBase data (a few rows from the table):
        hbase(main):002:0> scan "test_startendtag"
        ROW             COLUMN+CELL
         0          column=startendtag:number, timestamp=1465342524846, value=1
         1          column=startendtag:number, timestamp=1465342524846, value=25
         10         column=startendtag:number, timestamp=1465342524847, value=51
         103        column=startendtag:number, timestamp=1465342524851, value=98
         107        column=startendtag:number, timestamp=1465342524851, value=99
         11         column=startendtag:number, timestamp=1465342524851, value=75
         12         column=startendtag:number, timestamp=1465342524846, value=4
         13         column=startendtag:number, timestamp=1465342524846, value=28
         14         column=startendtag:number, timestamp=1465342524847, value=52
         15         column=startendtag:number, timestamp=1465342524851, value=76
         16         column=startendtag:number, timestamp=1465342524846, value=5
         17         column=startendtag:number, timestamp=1465342524846, value=29
         18         column=startendtag:number, timestamp=1465342524847, value=53
         19         column=startendtag:number, timestamp=1465342524851, value=77
         2          column=startendtag:number, timestamp=1465342524847, value=49
         20         column=startendtag:number, timestamp=1465342524846, value=6
         21         column=startendtag:number, timestamp=1465342524846, value=30

        >>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
        -etc-
        >>> frame.row_count
        33
        >>> frame.inspect(frame.row_count)
        [##]  startendtag_number
        ========================
        [0]                    6
        [1]                   30
        [2]                   54
        [3]                   78
        [4]                    7
        [5]                   31
        [6]                   55
        [7]                   79
        [8]                    8
        [9]                   32
        [10]                  73
        [11]                  56
        [12]                  80
        [13]                   9
        [14]                  33
        [15]                  57
        [16]                  81
        [17]                  10
        [18]                  34
        [19]                  58


        [##]  startendtag_number
        ========================
        [20]                  82
        [21]                   2
        [22]                  11
        [23]                  35
        [24]                  59
        [25]                  83
        [26]                  12
        [27]                  36
        [28]                  60
        [29]                  84
        [30]                  13
        [31]                  37
        [32]                  26

    </skip>

    """

    if not isinstance(table_name, basestring):
        raise ValueError(
            "table name parameter must be a string, but is {0}.".format(
                type(table_name)))
    if not isinstance(schema, list):
        raise ValueError("schema parameter must be a list, but is {0}.".format(
            type(schema)))
    if tc is implicit:
        implicit.error("tc")
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    inner_lists = [
        tc._jutils.convert.to_scala_list(
            [item[0], item[1], dtypes.to_string(item[2])]) for item in schema
    ]
    scala_final_schema = tc.jutils.convert.to_scala_list(inner_lists)

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHbase(
        tc.jutils.get_scala_sc(), table_name, scala_final_schema,
        tc._jutils.convert.to_scala_option(start_tag),
        tc._jutils.convert.to_scala_option(end_tag))

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
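
Each schema entry is [column_family, column_name, cell data type], and the resulting frame columns are named <family>_<column> (test_family_a, test_family_b, ... above). A minimal sketch:

    >>> hbase_schema = [["test_family", "a", int], ["test_family", "b", float]]
    >>> frame = tc.frame.import_hbase("demo_test_hbase", hbase_schema)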
Example #7
def import_hbase(table_name, schema, start_tag=None, end_tag=None, tc=implicit):
    """
    Import data from an HBase table into a frame.

    :param table_name: hbase table name
    :param schema: hbase schema as a List of List(string) (columnFamily, columnName, dataType for cell value)
    :param start_tag: optional start tag for filtering
    :param end_tag: optional end tag for filtering
    :return: frame with data from hbase table

    Example
    ---------
    Load data into a frame from an HBase table.

    <skip>
        >>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int],["test_family", "b", float], ["test_family", "c", int],["test_family", "d", int]])
        -etc-
        >>> frame.inspect()
        [#]  test_family_a  test_family_b  test_family_c  test_family_d
        ===============================================================
        [0]              1            0.2             -2              5
        [1]              2            0.4             -1              6
        [2]              3            0.6              0              7
        [3]              4            0.8              1              8

        Using start_tag and end_tag (HBase assigns a unique row id to each row in a table):
        start_tag: the row id at which the scan should start
        end_tag: the row id at which the scan should end

        Assume the HBase table "test_startendtag" already has data in the "startendtag" column family, with a
        single column named "number" whose values run from 1 to 99.  The row ids are generated by HBase.

        Sample HBase data (a few rows from the table):
        hbase(main):002:0> scan "test_startendtag"
        ROW             COLUMN+CELL
         0          column=startendtag:number, timestamp=1465342524846, value=1
         1          column=startendtag:number, timestamp=1465342524846, value=25
         10         column=startendtag:number, timestamp=1465342524847, value=51
         103        column=startendtag:number, timestamp=1465342524851, value=98
         107        column=startendtag:number, timestamp=1465342524851, value=99
         11         column=startendtag:number, timestamp=1465342524851, value=75
         12         column=startendtag:number, timestamp=1465342524846, value=4
         13         column=startendtag:number, timestamp=1465342524846, value=28
         14         column=startendtag:number, timestamp=1465342524847, value=52
         15         column=startendtag:number, timestamp=1465342524851, value=76
         16         column=startendtag:number, timestamp=1465342524846, value=5
         17         column=startendtag:number, timestamp=1465342524846, value=29
         18         column=startendtag:number, timestamp=1465342524847, value=53
         19         column=startendtag:number, timestamp=1465342524851, value=77
         2          column=startendtag:number, timestamp=1465342524847, value=49
         20         column=startendtag:number, timestamp=1465342524846, value=6
         21         column=startendtag:number, timestamp=1465342524846, value=30

        >>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
        -etc-
        >>> frame.row_count
        33
        >>> frame.inspect(frame.row_count)
        [##]  startendtag_number
        ========================
        [0]                    6
        [1]                   30
        [2]                   54
        [3]                   78
        [4]                    7
        [5]                   31
        [6]                   55
        [7]                   79
        [8]                    8
        [9]                   32
        [10]                  73
        [11]                  56
        [12]                  80
        [13]                   9
        [14]                  33
        [15]                  57
        [16]                  81
        [17]                  10
        [18]                  34
        [19]                  58


        [##]  startendtag_number
        ========================
        [20]                  82
        [21]                   2
        [22]                  11
        [23]                  35
        [24]                  59
        [25]                  83
        [26]                  12
        [27]                  36
        [28]                  60
        [29]                  84
        [30]                  13
        [31]                  37
        [32]                  26

    </skip>

    """

    if not isinstance(table_name, basestring):
        raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
    if not isinstance(schema, list):
        raise ValueError("schema parameter must be a list, but is {0}.".format(type(schema)))
    if tc is implicit:
        implicit.error("tc")
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    inner_lists=[tc._jutils.convert.to_scala_list([item[0], item[1], dtypes.to_string(item[2])]) for item in schema]
    scala_final_schema = tc.jutils.convert.to_scala_list(inner_lists)

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHbase(tc.jutils.get_scala_sc(),
                                                                                                         table_name, scala_final_schema,
                                                                                                         tc._jutils.convert.to_scala_option(start_tag),
                                                                                                         tc._jutils.convert.to_scala_option(end_tag))

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
Example #8
def import_csv(path, delimiter=",", header=False, inferschema=True, schema=None, tc=implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns,
                   and not be included in the data.  The default value is false.
    :param inferschema: (Optional[bool]) Boolean value indicating if the column types will be automatically inferred.
                        It requires one extra pass over the data and is true by default.
    :param schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset.  The number of
                   columns specified in the schema must match the number of columns in the csv file provided.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------
    Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
    there is a header and to infer the schema based on the data.

        >>> file_path = "../integration-tests/datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True, inferschema=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', int),
         ('city', str),
         ('population_2013', int),
         ('population_2010', int),
         ('change', str),
         ('county', str)]
    """


    if schema is not None:
        inferschema = False   # if a custom schema is provided, don't waste time inferring the schema during load
    if not isinstance(header, bool):
        raise ValueError("header parameter must be a boolean, but is {0}.".format(type(header)))
    if not isinstance(inferschema, bool):
        raise ValueError("inferschema parameter must be a boolean, but is {0}.".format(type(inferschema)))
    if tc is implicit:
        implicit.error('tc')
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    header_str = str(header).lower()
    inferschema_str = str(inferschema).lower()
    pyspark_schema = None

    if (not inferschema) and (schema is not None):
        fields = []
        for column in schema:
            if dtypes._data_type_to_pyspark_type_table.has_key(column[1]):
                fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    sqlContext = SQLContext(tc.sc)
    df = sqlContext.read.format("com.databricks.spark.csv").options(delimiter=delimiter,
                                                                    header=header_str,
                                                                    dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX",
                                                                    inferschema=inferschema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        for column in df.schema.fields:
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            df_schema.append((column.name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the"
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame gives us datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time, apply the map so those values become ms since epoch
        rdd = rdd.map(cast_datetime)

    return Frame(tc, rdd, df_schema)
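
This variant also accepts sparktk's datetime type in a custom schema: values matching the dateformat option above are parsed by spark-csv and then converted to long ms since epoch by cast_datetime. A hedged sketch with a hypothetical events.csv (assumes dtypes.datetime is a supported schema type here and that the constructor is exposed as tc.frame.import_csv):

    >>> import sparktk.dtypes as dtypes
    >>> schema = [("id", int), ("created", dtypes.datetime)]
    >>> frame = tc.frame.import_csv("events.csv", schema=schema)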
Example #9
File: gmm.py, Project: AayushiD/spark-tk
def load(path, tc=implicit):
    """load GaussianMixtureModel from given path"""
    if tc is implicit:
        implicit.error("tc")
    return tc.load(path, GaussianMixtureModel)
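
A hedged usage sketch (hypothetical path; the module-level load simply delegates to tc.load(path, GaussianMixtureModel)):

    >>> restored = load("sandbox/my_gmm_model", tc=tc)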
Example #10
File: create.py, Project: AayushiD/spark-tk
def create(data, schema=None, validate_schema=False, tc=implicit):
    """
    Creates a frame from the given data and schema.  If no schema data types are provided, the schema is inferred
    based on the data in the first 100 rows.

    If schema validation is enabled, all data is checked to ensure that it matches the schema.  If the data does
    not match the schema's data type, it attempts to cast the data to the proper data type.  When the data cannot
    be cast to the schema's data type, the item will be missing (None) in the frame.

    :param data: Data source
    :param schema: Optionally specify a schema (list of tuples of string column names and data type), column names
                   (list of strings, and the column data types will be inferred) or None (column data types will be
                   inferred and column names will be numbered like C0, C1, C2, etc).
    :param validate_schema: When True, all data is checked to ensure that it matches the schema.  If the data does
                            not match the schema's data type, it attempts to cast the data to the proper data type.
                            When the data cannot be cast to the schema's data type, a ValueError is raised.
                            Defaults to False.
    :param tc: TkContext
    :return: Frame loaded with the specified data


    Examples
    --------

    Create a frame with the specified data.

        >>> data = [["Bob", 30, 8], ["Jim", 45, 9.5], ["Sue", 25, 7], ["George", 15, 6], ["Jennifer", 18, 8.5]]
        >>> frame = tc.frame.create(data)

    Since no schema is provided, the schema will be inferred.  Note that the data set had a mix of integers and
    floats in the third column.  The schema will use the most general data type from the data that it sees, so in
    this example, the column is treated as a float.

        >>> frame.schema
        [('C0', str), ('C1', int), ('C2', float)]

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30    8
        [1]  Jim       45  9.5
        [2]  Sue       25    7
        [3]  George    15    6
        [4]  Jennifer  18  8.5

    We could also enable schema validation, which checks the data against the schema.  If the data does not match the
    schema's data type, it attempts to cast the data to the proper data type.

        >>> frame = tc.frame.create(data, validate_schema=True)

    In this example with schema validation enabled, the integers in column C2 get casted to floats:

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30  8.0
        [1]  Jim       45  9.5
        [2]  Sue       25  7.0
        [3]  George    15  6.0
        [4]  Jennifer  18  8.5

    We could also provide a list of column names when creating the frame.  When a list of column names is provided,
    the data types for the schema are still inferred, but the columns in the schema are labeled with the specified names.

        >>> frame = tc.frame.create(data, schema=["name", "age", "shoe_size"], validate_schema=True)

        >>> frame.schema
        [('name', str), ('age', int), ('shoe_size', float)]

        >>> frame.inspect()
        [#]  name      age  shoe_size
        =============================
        [0]  Bob        30        8.0
        [1]  Jim        45        9.5
        [2]  Sue        25        7.0
        [3]  George     15        6.0
        [4]  Jennifer   18        8.5

    Note that if validate_schema is enabled and a value cannot be parsed as the data type specified in the schema,
    it will show up as missing (None).  For example, consider the following frame where columns are defined as
    integers, but the data specified has a string in the second row.

        >>> data = [[1, 2, 3], [4, "five", 6]]
        >>> schema = [("a", int), ("b", int), ("c", int)]

        >>> frame = tc.frame.create(data, schema, validate_schema = True)

        >>> frame.inspect()
        [#]  a  b     c
        ===============
        [0]  1     2  3
        [1]  4  None  6

    Note that the spot where the string was located has its value missing (None), since it couldn't be parsed to an
    integer.  If validate_schema is disabled, no attempt is made to parse the data to the data type specified by the
    schema, and further frame operations may fail due to the data type discrepancy.

    """
    if tc is implicit:
        implicit.error('tc')    
    from sparktk.frame.frame import Frame
    return Frame(tc, data, schema, validate_schema)
Example #11
def load(path, tc=implicit):
    """load RandomForestClassifierModel from given path"""
    if tc is implicit:
        implicit.error("tc")
    return tc.load(path, RandomForestClassifierModel)
Example #12
def import_csv(path,
               delimiter=",",
               header=False,
               inferschema=True,
               schema=None,
               tc=implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns,
                   and not be included in the data.  The default value is false.
    :param inferschema: (Optional[bool]) Boolean value indicating if the column types will be automatically inferred.
                        It requires one extra pass over the data and is true by default.
    :param schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset.  The number of
                   columns specified in the schema must match the number of columns in the csv file provided.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------
    Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
    there is a header and to infer the schema based on the data.

        >>> file_path = "../integration-tests/datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True, inferschema=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', int),
         ('city', str),
         ('population_2013', int),
         ('population_2010', int),
         ('change', str),
         ('county', str)]
    """

    if schema is not None:
        inferschema = False  # if a custom schema is provided, don't waste time inferring the schema during load
    if not isinstance(header, bool):
        raise ValueError(
            "header parameter must be a boolean, but is {0}.".format(
                type(header)))
    if not isinstance(inferschema, bool):
        raise ValueError(
            "inferschema parameter must be a boolean, but is {0}.".format(
                type(inferschema)))
    if tc is implicit:
        implicit.error('tc')
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    header_str = str(header).lower()
    inferschema_str = str(inferschema).lower()
    pyspark_schema = None

    if (not inferschema) and (schema is not None):
        fields = []
        for column in schema:
            if dtypes._data_type_to_pyspark_type_table.has_key(column[1]):
                fields.append(
                    StructField(
                        column[0],
                        dtypes._data_type_to_pyspark_type_table[column[1]],
                        True))
            else:
                raise TypeError(
                    "Unsupported type {0} in schema for column {1}.".format(
                        column[1], column[0]))
        pyspark_schema = StructType(fields)

    sqlContext = SQLContext(tc.sc)
    df = sqlContext.read.format("com.databricks.spark.csv").options(
        delimiter=delimiter,
        header=header_str,
        dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX",
        inferschema=inferschema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        for column in df.schema.fields:
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(
                    type(column.dataType))
            except ValueError:
                raise TypeError(
                    "Unsupported data type ({0}) for column {1}.".format(
                        str(column.dataType), column.name))
            df_schema.append((column.name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError(
                "Bad schema value.  The number of columns in the custom schema ({0}) must match the "
                "number of columns in the csv file data ({1}).".format(
                    custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame gives us datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(
                    row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(
        df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time, apply the map so those values become ms since epoch
        rdd = rdd.map(cast_datetime)

    return Frame(tc, rdd, df_schema)
Example #13
def import_pandas(pandas_frame, schema=None, row_index=True, validate_schema=False, tc=implicit):
    """
    Imports data from the specified pandas data frame.

    Parameters
    ----------

    :param pandas_frame: (pandas.DataFrame)  pandas dataframe object
    :param schema: (Optional(list[tuples(string, type)])) Schema description of the fields for a given line.  It is a
                   list of tuples which describe each field, (field name, field type), where the field name is a
                   string and the field type is a supported data type.  If no schema is provided, the schema will be
                   inferred based on the column names and types from the pandas_frame.
    :param row_index: (Optional(bool)) Indicates if the row_index is present in the pandas dataframe and needs to be
                      ignored when looking at the data values. Default value is True.
    :param validate_schema: (Optional(bool)) If true, validates the data against the schema and attempts to cast the
                            data to the specified type, if it does not match the schema.  Defaults to False.
    :return: (Frame) spark-tk frame that contains data from the pandas_frame

    Examples
    --------

    Create a pandas data frame:

        >>> import pandas
        >>> ratings_data = [[0, "invalid"], [1, "Very Poor"], [2, "Poor"], [3, "Average"], [4, "Good"], [5, "Very Good"]]
        >>> df = pandas.DataFrame(ratings_data, columns=['rating_id', 'rating_text'])

        >>> df
           rating_id rating_text
        0          0     invalid
        1          1   Very Poor
        2          2        Poor
        3          3     Average
        4          4        Good
        5          5   Very Good

        >>> df.columns.tolist()
        ['rating_id', 'rating_text']

        >>> df.dtypes
        rating_id       int64
        rating_text    object
        dtype: object

    When using import_pandas by just passing the pandas data frame, it will use the column names and types from the
    pandas data frame to generate the schema.

        >>> frame = tc.frame.import_pandas(df)

        >>> frame.inspect()
        [#]  rating_id  rating_text
        ===========================
        [0]          0  invalid
        [1]          1  Very Poor
        [2]          2  Poor
        [3]          3  Average
        [4]          4  Good
        [5]          5  Very Good

        >>> frame.schema
        [('rating_id', long), ('rating_text', str)]

    Alternatively, you can specify a schema when importing the pandas data frame.  There is also the option to validate
    the data against the schema.  If this option is enabled, we will attempt to cast the data to the column's data type,
    if it does not match the schema.

    For example, here we will specify a schema where the rating_id column will instead be called 'rating_float' and its
    data type will be a float.  We will also enable the validate_schema option so that the rating_id values get
    cast to floats:

        >>> schema = [("rating_float", float), ("rating_str", unicode)]

        >>> frame = tc.frame.import_pandas(df, schema, validate_schema=True)

        >>> frame.inspect()
        [#]  rating_float  rating_str
        =============================
        [0]           0.0  invalid
        [1]           1.0  Very Poor
        [2]           2.0  Poor
        [3]           3.0  Average
        [4]           4.0  Good
        [5]           5.0  Very Good

        >>> frame.schema
        [('rating_float', float), ('rating_str', unicode)]

    """
    try:
        import pandas
    except ImportError:
        raise RuntimeError("pandas module not found, unable to import the data.  Install pandas or try the take command.")

    if not isinstance(pandas_frame, pandas.DataFrame):
        raise TypeError("data_frame must be a pandas DataFrame.")
    if tc is implicit:
        implicit.error('tc')
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))
    if schema is not None:
        schema = _validate(schema)
    else:
        schema = _get_schema_from_df(pandas_frame)

    if not row_index:
        pandas_frame = pandas_frame.reset_index()

    pandas_frame = pandas_frame.dropna(thresh=len(pandas_frame.columns))
    field_names = [x[0] for x in schema]
    if len(pandas_frame.columns) != len(field_names):
        raise ValueError("Number of columns in Pandasframe {0} does not match the number of columns in the"
                         " schema provided {1}.".format(len(pandas_frame.columns), len(field_names)))

    date_time_columns = [i for i, x in enumerate(pandas_frame.dtypes) if x == "datetime64[ns]"]
    has_date_time = len(date_time_columns) > 0

    # pandas gives us the date/time in ns or as a Timestamp, and spark-tk expects it as ms, so we need to do the conversion
    def pandas_datetime_to_ms(row):
        for i in date_time_columns:
            if isinstance(row[i], long):
                row[i] = row[i] / 1000000
            elif isinstance(row[i], pandas.tslib.Timestamp) or isinstance(row[i], datetime):
                dt = row[i]
                # get number of seconds since epoch (%s) and multiply by 1000 for ms then get the
                # microseconds to get the ms precision.
                row[i] = long((long(dt.strftime("%s")) * 1000) + (dt.microsecond // 1000))
        return row

    pandas_rows = pandas_frame[0:len(pandas_frame.index)].values.tolist()

    # if the dataframe has date/time columns, map them to ms
    if (has_date_time):
        pandas_rows = map(pandas_datetime_to_ms, pandas_rows)

    # create frame with the pandas_rows
    frame = tc.frame.create(pandas_rows, schema)

    if validate_schema:
        frame = tc.frame.create(frame.rdd, schema, validate_schema)

    return frame
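
A hedged sketch of the datetime handling above (hypothetical data; assumes the inferred schema maps datetime64[ns] columns to sparktk's datetime type, so the imported values are long ms since epoch):

    >>> import pandas
    >>> df = pandas.DataFrame({"event": ["a", "b"],
    ...                        "when": pandas.to_datetime(["2015-01-01", "2015-06-15"])})
    >>> frame = tc.frame.import_pandas(df)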
Example #14
def load(path, tc=implicit):
    """load MaxModel from given path"""
    if tc is implicit:
        implicit.error("tc")
    return tc.load(path, MaxModel)
Example #15
def subject(a, b, c=implicit, d=4):
    """I am the subject"""
    if c is implicit:
        implicit.error('c')
    return ':'.join([str(a), str(b), str(c), str(d)])
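
A hedged sketch of the implicit-default pattern that subject demonstrates (the same pattern the tc parameters above rely on); the exact exception raised by implicit.error depends on sparktk.arguments:

    >>> subject(1, 2, c="three")
    '1:2:three:4'
    >>> subject(1, 2)          # c left as implicit -> implicit.error('c') raises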
Example #16
def load(path, tc=implicit):
    """load LdaModel from given path"""
    if tc is implicit:
        implicit.error("tc")
    return tc.load(path, LdaModel)
Example #17
def import_csv(path, delimiter=",", header=False, inferschema=True, schema=None, tc=implicit):
    """
    Creates a frame with data from a csv file.

    :param path: Full path to the csv file
    :param delimiter: A string which indicates the separation of data fields.  This is usually a single
                      character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: Boolean value indicating if the first line of the file will be used to name columns,
                   and not be included in the data.  The default value is false.
    :param inferschema: Boolean value indicating if the column types will be automatically inferred.  It
                        requires one extra pass over the data and is true by default.
    :param schema: Optionally specify the schema for the dataset.  The number of columns specified in the
                   schema must match the number of columns in the csv file provided.
    :return: Frame that contains the data from the csv file

    Examples
    --------
    Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
    there is a header and to infer the schema based on the data.

    .. code::

        >>> file_path = "../integration-tests/datasets/cities.csv"
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, inferschema=True)
        -etc-
        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes
        >>> frame.schema
        [('rank', int),
         ('city', str),
         ('population_2013', int),
         ('population_2010', int),
         ('change', str),
         ('county', str)]
    """

    from pyspark.sql import SQLContext

    if schema is not None:
        inferschema = False   # if a custom schema is provided, don't waste time inferring the schema during load
    if not isinstance(header, bool):
        raise ValueError("header parameter must be a boolean, but is {0}.".format(type(header)))
    if not isinstance(inferschema, bool):
        raise ValueError("inferschema parameter must be a boolean, but is {0}.".format(type(inferschema)))
    if tc is implicit:
        implicit.error('tc')
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    header_str = str(header).lower()
    inferschema_str = str(inferschema).lower()
    sqlContext = SQLContext(tc.sc)
    df = sqlContext.read.format("com.databricks.spark.csv").options(delimiter=delimiter,
                                                                    header=header_str,
                                                                    inferschema=inferschema_str).load(path)
    df_schema = []

    if schema is None:
        for column in df.schema.fields:
            datatype = str
            import sparktk.dtypes as dtypes
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                print "Warning: No mapping for type: {0}. Column '{1}' will default to use strings.".format(str(column.dataType), column.name)
            df_schema.append((column.name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the"
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema
    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, df.rdd, df_schema)
Example #18
def subject(a, b, c=implicit, d=4):
    """I am the subject"""
    if c is implicit:
        implicit.error('c')
    return ':'.join([str(a), str(b), str(c), str(d)])
Example #19
def load(path, tc=implicit):
    """load ARIMAXModel from given path"""
    if tc is implicit:
        implicit.error("tc")
    return tc.load(path, ArimaxModel)