Example #1
 def save(self, path):
     """
     Save the trained model to the specified path
     :param path: Path to save
     """
     require_type.non_empty_str(path, "path")
     self._scala.save(self._tc._scala_sc, path)
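
A hedged usage sketch of the pattern above (the model instance and path are hypothetical): the argument check runs before any call into the Scala layer, so an empty path fails fast on the Python side.

# Hypothetical: 'model' is assumed to be a trained sparktk model exposing save().
model.save("sandbox/my_trained_model")      # a non-empty string passes the check

try:
    model.save("")                          # rejected before reaching Scala
except ValueError as e:
    print(e)                                # message mentions "Expected non-empty string"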
Example #2
def import_dcm(dicom_dir_path, min_partitions=2, tc=TkContext.implicit):
    """
    Creates a dicom object with metadata and pixeldata frames from dcm file(s)

    Parameters
    ----------

    :param dicom_dir_path: (str) Local/HDFS path of the dcm file(s)
    :param min_partitions: (int) Minimum number of HDFS partitions to use when importing the dcm file(s)
    :return: (Dicom) returns a dicom object with metadata and pixeldata frames


    Examples
    --------
        #Path can be local/hdfs to dcm file(s)
        >>> dicom_path = "../datasets/dicom_uncompressed"

        #use import_dcm available inside dicom module to create a dicom object from given dicom_path
        >>> dicom = tc.dicom.import_dcm(dicom_path)

        #Type of dicom object created
        >>> type(dicom)
        <class 'sparktk.dicom.dicom.Dicom'>

        #Inspect metadata property to see dicom metadata xml content
        <skip>
        >>> dicom.metadata.inspect(truncate=30)
        [#]  id  metadata
        =======================================
        [0]   0  <?xml version="1.0" encodin...
        [1]   1  <?xml version="1.0" encodin...
        [2]   2  <?xml version="1.0" encodin...
        </skip>

        #pixeldata property is sparktk frame
        >>> pixeldata = dicom.pixeldata.take(1)

        <skip>

        >>> pixeldata
        [[0L, array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  7.,  5., ...,  5.,  7.,  8.],
        [ 0.,  7.,  6., ...,  5.,  6.,  7.],
        ...,
        [ 0.,  6.,  7., ...,  5.,  5.,  6.],
        [ 0.,  2.,  5., ...,  5.,  5.,  4.],
        [ 1.,  1.,  3., ...,  1.,  1.,  0.]])]]
        </skip>

    """

    require_type.non_empty_str(dicom_dir_path, "dicom_dir_path")
    require_type.non_negative_int(min_partitions, "min_partitions")

    TkContext.validate(tc)

    scala_dicom = tc.sc._jvm.org.trustedanalytics.sparktk.dicom.internal.constructors.Import.importDcm(
        tc.jutils.get_scala_sc(), dicom_dir_path, min_partitions)
    from sparktk.dicom.dicom import Dicom
    return Dicom._from_scala(tc, scala_dicom)
Example #3
def import_dcm(dicom_dir_path, min_partitions=2, tc=TkContext.implicit):
    """
    Creates a dicom object with metadata and pixeldata frames from dcm file(s)

    Parameters
    ----------

    :param dicom_dir_path: (str) Local/HDFS path of the dcm file(s)
    :param min_partitions: (int) Minimum number of HDFS partitions to use when importing the dcm file(s)
    :return: (Dicom) returns a dicom object with metadata and pixeldata frames


    Examples
    --------
        #Path can be local/hdfs to dcm file(s)
        >>> dicom_path = "../datasets/dicom_uncompressed"

        #use import_dcm available inside dicom module to create a dicom object from given dicom_path
        >>> dicom = tc.dicom.import_dcm(dicom_path)

        #Type of dicom object created
        >>> type(dicom)
        <class 'sparktk.dicom.dicom.Dicom'>

        #Inspect metadata property to see dicom metadata xml content
        <skip>
        >>> dicom.metadata.inspect(truncate=30)
        [#]  id  metadata
        =======================================
        [0]   0  <?xml version="1.0" encodin...
        [1]   1  <?xml version="1.0" encodin...
        [2]   2  <?xml version="1.0" encodin...
        </skip>

        #pixeldata property is sparktk frame
        >>> pixeldata = dicom.pixeldata.take(1)

        <skip>

        >>> pixeldata
        [[0L, array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  7.,  5., ...,  5.,  7.,  8.],
        [ 0.,  7.,  6., ...,  5.,  6.,  7.],
        ...,
        [ 0.,  6.,  7., ...,  5.,  5.,  6.],
        [ 0.,  2.,  5., ...,  5.,  5.,  4.],
        [ 1.,  1.,  3., ...,  1.,  1.,  0.]])]]
        </skip>

    """

    require_type.non_empty_str(dicom_dir_path, "dicom_dir_path")
    require_type.non_negative_int(min_partitions, "min_partitions")

    TkContext.validate(tc)

    scala_dicom = tc.sc._jvm.org.trustedanalytics.sparktk.dicom.internal.constructors.Import.importDcm(tc.jutils.get_scala_sc(), dicom_dir_path, min_partitions)
    from sparktk.dicom.dicom import Dicom
    return Dicom._from_scala(tc, scala_dicom)
Example #4
 def test_non_empty_str_type_error(self):
     try:
         require_type.non_empty_str(100, "a")
     except TypeError as e:
         msg = str(e)
         expected = "Expected type <type 'str'>"
         self.assertTrue(expected in msg, "\nexpected=%s\nmessage =%s" % (expected, msg))
     else:
         self.fail("A TypeError should have been raised")
Example #5
 def test_non_empty_str_type_error(self):
     try:
         require_type.non_empty_str(100, "a")
     except TypeError as e:
         msg = str(e)
         expected = "Expected type <type 'str'>"
         self.assertTrue(expected in msg,
                         "\nexpected=%s\nmessage =%s" % (expected, msg))
     else:
         self.fail("A TypeError should have been raised")
Example #6
def export_to_hive(self, hive_table_name, overwrite=False):
    """
    Write current frame to Hive table.

    The table must not already exist in Hive (unless overwrite is set to True). Hive does not support
    case-sensitive table or column names, so column names with uppercase letters will be converted to
    lower case by Hive.

    Parameters
    ----------

    :param hive_table_name: (str) hive table name
    :param overwrite: (Optional(bool)) Specify whether or not to overwrite the hive table if it already exists.  If
                      overwrite is set to False, and the table already exists, an exception is thrown.

    Example
    --------
        <skip>
        >>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
        >>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
        >>> my_frame = tc.frame.create(data, schema)
        <progress>

        </skip>

    table_name: (string) Table name. A new table with the given name will be created if it does not already exist.

    <skip>
        >>> my_frame.export_to_hive("demo_test_hive")
        <progress>

    </skip>

    Verify the exported frame in Hive.

    From a bash shell:

        $ hive
        hive> show tables;

    You should see the demo_test_hive table.

    Run hive> select * from demo_test_hive; to verify the frame.

    To overwrite a table that already exists, set the overwrite parameter to 'True':

        <skip>
        >>> my_frame.export_to_hive("demo_test_hive", overwrite=True)
        </skip>

    """

    require_type.non_empty_str(hive_table_name, "hive_table_name")
    require_type(bool, overwrite, "overwrite")

    self._scala.exportToHive(hive_table_name, overwrite)
Example #7
 def test_non_empty_str_value_error(self):
     extra_message = "this is the end."
     try:
         require_type.non_empty_str('', "a", extra_message)
     except ValueError as e:
         msg = str(e)
         expected = "Expected non-empty string"
         self.assertTrue(expected in msg, "\nexpected=%s\nmessage =%s" % (expected, msg))
         self.assertTrue(msg.endswith(extra_message), "message should have ended with '%s', but got '%s'" % (extra_message, msg))
     else:
         self.fail("A ValueError should have been raised")
Example #8
    def save(self, path):
        """
        Save the trained model to path

        Parameters
        ----------

        :param path: (str) Path to save
        """
        require_type.non_empty_str(path, "path")
        self._scala.save(self._tc._scala_sc, path, False)
Example #9
    def export_to_mar(self, path):
        """
        Exports the trained model as a model archive (.mar) to the specified path

        Parameters
        ----------

        :param path: (str) Path to save the trained model
        :return: (str) Full path to the saved .mar file
        """
        require_type.non_empty_str(path, "path")
        return self._scala.exportToMar(self._tc._scala_sc, path)
Example #10
    def export_to_mar(self, path):
        """
        Exports the trained model as a model archive (.mar) to the specified path.

        Parameters
        ----------

        :param path: (str) Path to save the trained model
        :return: (str) Full path to the saved .mar file

        """
        require_type.non_empty_str(path, "path")
        return self._scala.exportToMar(self._tc._scala_sc, path)
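
A hedged usage sketch (the model instance and path below are hypothetical): export_to_mar validates the path and returns the full path to the written archive.

# Hypothetical: 'model' is assumed to be a trained sparktk model exposing export_to_mar().
mar_path = model.export_to_mar("sandbox/my_model.mar")
print(mar_path)    # full path to the saved .mar file, per the docstring above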
Example #11
 def test_non_empty_str_value_error(self):
     extra_message = "this is the end."
     try:
         require_type.non_empty_str('', "a", extra_message)
     except ValueError as e:
         msg = str(e)
         expected = "Expected non-empty string"
         self.assertTrue(expected in msg,
                         "\nexpected=%s\nmessage =%s" % (expected, msg))
         self.assertTrue(
             msg.endswith(extra_message),
             "message should have ended with '%s', but got '%s'" %
             (extra_message, msg))
     else:
         self.fail("A ValueError should have been raised")
Example #12
def train(frame,
          time_column,
          covariate_columns,
          censor_column,
          convergence_tolerance=1E-6,
          max_steps=100):
    """
    Creates a CoxProportionalHazardsModel by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) A frame to train the model on
    :param time_column: (str) Column name containing the time of occurrence of each observation.
    :param covariate_columns: (Seq[str]) List of column(s) containing the covariates.
    :param censor_column: (str) Column name containing censor value of each observation.
    :param convergence_tolerance: (float) Parameter for the convergence tolerance for iterative algorithms. Default is 1E-6
    :param max_steps: (int) Parameter for maximum number of steps. Default is 100
    :return: (CoxProportionalHazardsModel) A trained coxPh model
    """
    from sparktk.frame.frame import Frame
    require_type(Frame, frame, "frame cannot be None")
    require_type.non_empty_str(time_column, "time_column")
    require_type.non_empty_str(censor_column, "censor_column")
    require_type(float, convergence_tolerance, "convergence_tolerance should be float")
    require_type.non_negative_int(max_steps, "max_steps")
    affirm_type.list_of_str(covariate_columns, "covariate_columns")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_covariate_columns = tc.jutils.convert.to_scala_vector_string(covariate_columns)

    scala_model = _scala_obj.train(frame._scala,
                                   time_column,
                                   scala_covariate_columns,
                                   censor_column,
                                   convergence_tolerance,
                                   max_steps)
    return CoxProportionalHazardsModel(tc, scala_model)
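
A hedged usage sketch of the train function above. The data, schema, and column names are illustrative only; the frame is built with tc.frame.create as shown elsewhere in this listing.

# Hypothetical data: each row is (time, age, weight, censor).
frame = tc.frame.create([[18.0, 42.0, 6.0, 1.0],
                         [13.0, 30.0, 3.0, 1.0],
                         [25.0, 56.0, 7.0, 0.0]],
                        [("time", float), ("age", float), ("weight", float), ("censor", float)])

model = train(frame,
              time_column="time",
              covariate_columns=["age", "weight"],
              censor_column="censor")    # convergence_tolerance and max_steps keep their defaults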
Example #13
def import_csv(path, delimiter=",", header=False, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns
                   (unless a schema is provided), and not be included in the data.  The default value is false.
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.  The column names specified will override column names that are found in the header row.
    * None, where the schema is automatically inferred based on the data.  Columns are named based on the header, or will be named generically ("C0", "C1", "C2", etc).

    :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
                        specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Load a frame from a csv file by specifying the path to the file, delimiter

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the
    data types are inferred based on the data).  Here, we will specify the column names, which will override the
    header from the csv file.

        >>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"]
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names)
        -etc-

        >>> frame.schema
        [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)]

        <hide>
        >>> file_path = "../datasets/unicode.csv"
        >>> schema = [("a", unicode),("b", unicode),("c",unicode)]
        >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False)
        -etc-

        >>> print unicode(frame.get_inspect()).encode('utf-8')  # because this file is UTF-8 and this docstring is str
        [#]  a  b  c
        ============
        [0]  à  ë  ñ
        [1]  ã  ê  ü

        </hide>

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    require_type(str, datetime_format, "datetime_format")

    infer_schema = True
    column_names = []   # custom column names

    if schema is not None:
        if not isinstance(schema, list):
            raise TypeError("Unsupported type %s for schema parameter." % type(schema))
        elif all(isinstance(item, basestring) for item in schema):
            # schema is just column names
            column_names = schema
            schema = None
        else:
            infer_schema = False   # if a custom schema is provided, don't waste time inferring the schema during load
            sparktk_schema.validate(schema)

    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if schema is not None:
        fields = []
        for column in schema:
            if dtypes._data_type_to_pyspark_type_table.has_key(column[1]):
                fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat=datetime_format,
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        for i, column in enumerate(df.schema.fields):
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            column_name = column_names[i] if (i < len(column_names)) else column.name
            df_schema.append((column_name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the"
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The Spark data frame gives us datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
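
In addition to the column-names-only schema shown in the docstring, a full schema of (name, type) tuples can be supplied, which also skips schema inference during the load. A hedged sketch, mirroring the cities.csv columns used above (the types are assumptions based on the inspected output):

schema = [("rank", int), ("city", str), ("population_2013", int),
          ("population_2010", int), ("change", str), ("county", str)]
frame = tc.frame.import_csv("../datasets/cities.csv", "|", header=True, schema=schema)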
Example #14
def export_to_tensorflow(self, path, overwrite=False):
    """
    Export frame to TensorFlow Records file on given path

    TensorFlow records are the standard data format for TensorFlow. The recommended format is a TFRecords file
    containing tf.train.Example protocol buffers, each of which contains Features as a field.
    See https://www.tensorflow.org/how_tos/reading_data

    During export, the API parses Spark SQL DataTypes to TensorFlow compatible DataTypes as below:

    * IntegerType or LongType =>  Int64List
    * FloatType or DoubleType => FloatList
    * ArrayType(Double) [Vector] => FloatList
    * Any other DataType (Ex: String) => BytesList

    Parameters
    ----------

    :param path: (str) HDFS/Local path to export current frame as TensorFlow records
    :param overwrite: (Optional[bool]) Specify whether or not to overwrite the existing file, if a file already exists
                      at the specified path.  If overwrite is set to False, and a file already exists, an exception
                      is thrown.


    Examples
    --------

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.sort("rank")

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]     7  Bend                   81236            76639  6.00%   Deschutes
        [7]     8  Medford                77677            74907  3.70%   Jackson
        [8]     9  Springfield            60177            59403  1.30%   Lane
        [9]    10  Corvallis              55298            54462  1.54%   Benton

        >>> destPath = "../tests/sandbox/output24.tfr"

        >>> import os
        >>> if os.path.exists(destPath): os.remove(destPath)

        >>> frame.export_to_tensorflow(destPath)

    Check for output24.tfr in specified destination path either on Local or HDFS file system.

    An existing file can be overwritten by setting the overwrite parameter to True when using the export_to_tensorflow
    operation.  To demonstrate this, we will modify the frame by removing some columns, and then export the frame
    to the same path that was previously used.  Note that if the overwrite parameter is not set to True, an exception
    would be thrown, since there is already a file at the specified path.

        >>> frame.drop_columns(["population_2010", "change"])
        >>> frame.export_to_tensorflow(destPath, overwrite=True)

    """

    require_type.non_empty_str(path, "path")
    require_type(bool, overwrite, "overwrite")

    self._scala.exportToTensorflow(path, overwrite)
Example #15
 def test_non_empty_str(self):
     require_type.non_empty_str("something", "a")
Example #16
def export_to_json(self, path, count=0, offset=0, overwrite=False):
    """
    Write current frame to HDFS in Json format.

    Parameters
    ----------

    :param path: (str) The HDFS folder path where the files will be created.
    :param count: (Optional[int]) The number of records you want. Default (0), or a non-positive value, is the
                   whole frame.
    :param offset: (Optional[int]) The number of rows to skip before exporting to the file. Default is zero (0).
    :param overwrite: (Optional[bool]) Specify whether or not to overwrite the existing file, if one already
                      exists at the specified path.  If overwrite is set to False and the file already exists,
                      an exception is thrown.

    Example
    -------

    Start out by creating a frame and then exporting it to a json file.

        <hide>
        >>> from setup import get_sandbox_path
        >>> file_path = get_sandbox_path("export_example.json")
        </hide>
        >>> frame = tc.frame.create([[1, 2, 3], [4, 5, 6]])
        >>> frame.inspect()
        [#]  C0  C1  C2
        ===============
        [0]   1   2   3
        [1]   4   5   6

        >>> frame.export_to_json(file_path)

    Import the data from the json file that we just created, and then inspect the data in the frame.

        >>> import json
        >>> # function used for parsing json rows
        >>> def parse_json(row):
        ...     record = json.loads(row.records)
        ...     columns = record.values()
        ...     columns.reverse()
        ...     return columns

        >>> frame2 = tc.frame.import_json(file_path)
        <hide>
        >>> frame2.sort("records")
        </hide>
        >>> frame2.inspect()
        [#]  records
        =================================
        [0]  {"C0":"1","C1":"2","C2":"3"}
        [1]  {"C0":"4","C1":"5","C2":"6"}

    Map columns and parse json into columns:

        >>> frame2 = frame2.map_columns(parse_json, [('C0', int), ('C1', int), ('C2', int)])
        <hide>
        >>> frame2.sort("C0")
        </hide>
        >>> frame2.inspect()
        [#]  C0  C1  C2
        ===============
        [0]   1   2   3
        [1]   4   5   6

    We can also modify the data in the original frame, and then export to the json file again, using the 'overwrite'
    parameter to specify that we want to overwrite the existing file with the new data.

        >>> frame.add_columns(lambda row: row.C2 * 2, ("C3", int))
        <hide>
        >>> frame.sort("C0")
        </hide>
        >>> frame.inspect()
        [#]  C0  C1  C2  C3
        ===================
        [0]   1   2   3   6
        [1]   4   5   6  12

        >>> frame.export_to_json(file_path, overwrite=True)

    Again, import the data from the json file, and inspect the data in the frame.

        >>> frame3 = tc.frame.import_json(file_path)
        <hide>
        >>> frame3.sort("records")
        </hide>
        >>> frame3.inspect()
        [#]  records
        ===========================================
        [0]  {"C0":"1","C1":"2","C2":"3","C3":"6"}
        [1]  {"C0":"4","C1":"5","C2":"6","C3":"12"}

        >>> frame3 = frame3.map_columns(parse_json, [('C0', int), ('C1', int), ('C2', int), ('C3', int)])
        <hide>
        >>> frame3.sort("C0")
        </hide>
        >>> frame3.inspect()
        [#]  C0  C1  C2  C3
        ===================
        [0]  1   2   3    6
        [1]  4   5   6   12

    """

    require_type.non_empty_str(path, "path")
    require_type(int, count, "count")
    require_type(int, offset, "offset")
    require_type(bool, overwrite, "overwrite")

    self._scala.exportToJson(path, count, offset, overwrite)
Example #17
def import_xml(file_name, record_tag, tc=TkContext.implicit):
    """
    Imports a file of XML records

    XML records can span multiple lines.  Returns a Frame of one column containing an XML string per row

    Note: Only records which start with the given tag will be included (multiple different tags not supported)

    Parameters
    ----------

    :param file_name: file path
    :param record_tag: value of the XML element which contains a record
    :return: Frame

    Examples
    --------

    Consider a file of XML records:

        <?xml version="1.0" encoding="UTF-8"?>
        <table>
            <shape type="triangle">
                <x>0</x>
                <y>0</y>
                <size>12</size>
            </shape>
            <shape type="square">
                <x>8</x>
                <y>0</y>
                <size>4</size>
            </shape>
            <shape color="blue" type="pentagon">
                <x>0</x>
                <y>10</y>
                <size>2</size>
            </shape>
            <shape type="square">
                <x>-4</x>
                <y>6</y>
                <size>7</size>
            </shape>
        </table>

    We can parse this file into a frame of records:

        >>> f = tc.frame.import_xml("../datasets/shapes1.xml", "shape")
        >>> f.inspect()
        [#]  records
        =========================================
        [0]  <shape type="triangle">
                     <x>0</x>
                     <y>0</y>
                     <size>12</size>
                 </shape>
        [1]  <shape type="square">
                     <x>8</x>
                     <y>0</y>
                     <size>4</size>
                 </shape>
        [2]  <shape color="blue" type="pentagon">
                     <x>0</x>
                     <y>10</y>
                     <size>2</size>
                 </shape>
        [3]  <shape type="square">
                     <x>-4</x>
                     <y>6</y>
                     <size>7</size>
                 </shape>


    We can further break the XML records into individual columns with a map_columns (or add_columns) operation:


        >>> import xml.etree.ElementTree as ET
        >>> def parse_my_xml(row):
        ...     ele = ET.fromstring(row[0])
        ...     return [ele.get("type"), int(ele.find("x").text), int(ele.find("y").text), int(ele.find("size").text)]

        >>> f2 = f.map_columns(parse_my_xml, [('shape', str), ('x', int), ('y', int), ('size', int)])

        >>> f2.inspect()
        [#]  shape     x   y   size
        ===========================
        [0]  triangle   0   0    12
        [1]  square     8   0     4
        [2]  pentagon   0  10     2
        [3]  square    -4   6     7


    Consider another file of XML records, this time with different element names for the records:

        <?xml version="1.0" encoding="UTF-8"?>
        <shapes>
            <triangle>
                <x>0</x>
                <y>0</y>
                <size>12</size>
            </triangle>
            <square>
                <x>8</x>
                <y>0</y>
                <size>4</size>
            </square>
            <pentagon color="blue">
                <x>0</x>
                <y>10</y>
                <size>2</size>
            </pentagon>
            <square>
                <x>-4</x>
                <y>6</y>
                <size>7</size>
            </square>
        </shapes>

    We can parse this file into a frame of records of a single type.  We must pick only one.  The others
    will be filtered out:

        >>> f3 = tc.frame.import_xml("../datasets/shapes2.xml", "square")
        >>> f3.inspect()
        [#]  records
        ===========================
        [0]  <square>
                     <x>8</x>
                     <y>0</y>
                     <size>4</size>
                 </square>
        [1]  <square>
                     <x>-4</x>
                     <y>6</y>
                     <size>7</size>
                 </square>


    We can further break the XML records into individual columns with a map_columns (or add_columns) operation:


        >>> def parse_my_squares(row):
        ...     ele = ET.fromstring(row[0])
        ...     return [int(ele.find("x").text), int(ele.find("y").text), int(ele.find("size").text)]

        >>> f4 = f3.map_columns(parse_my_squares, [('x', int), ('y', int), ('size', int)])

        >>> f4.inspect()
        [#]  x   y  size
        ================
        [0]   8  0     4
        [1]  -4  6     7

    """

    TkContext.validate(tc)
    require_type.non_empty_str(file_name, "file_name")
    require_type.non_empty_str(record_tag, "record_tag")
    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.\
        frame.internal.constructors.ImportMultiLineRecords.importXml(tc.jutils.get_scala_sc(), file_name, record_tag)
    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
Example #18
def import_json(file_name, tc=TkContext.implicit):
    """
    Imports a file of JSON records

    JSON records can span multiple lines.  Returns a Frame of one column containing a JSON string per row

    Parameters
    ----------

    :param file_name: file path
    :return: Frame

    Examples
    --------

    Consider a file of JSON records:

        { "obj": {
            "color": "blue",
            "size": 4,
            "shape": "square" }
          }
          { "obj": {
          "color": "green",
          "size": 3,
          "shape": "triangle" }
          }
          { "obj": { "color": "yellow", "size": 5, "shape": "pentagon" } }
          { "obj": {
          "color": "orange",
          "size": 2,
          "shape": "lentil" }
        }

    We can parse this file into a frame of records:

        >>> f = tc.frame.import_json("../datasets/shapes.json")
        >>> f.inspect()
        [#]  records
        =====================================================================
        [0]  { "obj": {
               "color": "blue",
               "size": 4,
               "shape": "square" }
             }
        [1]  { "obj": {
             "color": "green",
             "size": 3,
             "shape": "triangle" }
             }
        [2]  { "obj": { "color": "yellow", "size": 5, "shape": "pentagon" } }
        [3]  { "obj": {
             "color": "orange",
             "size": 2,
             "shape": "lentil" }
             }


    We can further break the JSON records into individual columns with a map_columns (or add_columns) operation:

        >>> import json
        >>> def parse_my_json(row):
        ...     record = json.loads(row.records)['obj']
        ...     return [record['color'], record['size'], record['shape']]

        >>> f2 = f.map_columns(parse_my_json, [('color', str), ('size', int), ('shape', str)])
        >>> f2.inspect()
        [#]  color   size  shape
        ===========================
        [0]  blue       4  square
        [1]  green      3  triangle
        [2]  yellow     5  pentagon
        [3]  orange     2  lentil

    """

    TkContext.validate(tc)
    require_type.non_empty_str(file_name, "file_name")
    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.\
        frame.internal.constructors.ImportMultiLineRecords.importJson(tc.jutils.get_scala_sc(), file_name)
    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
Example #19
 def test_non_empty_str(self):
     require_type.non_empty_str("something", "a")
Example #20
def train(frame,
          source_column_name,
          dest_column_name,
          weight_column_name,
          max_steps=10,
          regularization=0.5,
          alpha=0.5,
          num_factors=3,
          use_implicit=False,
          num_user_blocks=2,
          num_item_blocks=3,
          checkpoint_iterations=10,
          target_rmse=0.05):
    """
    Create collaborative filtering model by training on given frame

    Parameters
    ----------

    :param frame: (Frame) The frame containing the data to train on
    :param source_column_name: (str) source column name.
    :param dest_column_name: (str) destination column name.
    :param weight_column_name: (str) weight column name.
    :param max_steps: (int) max number of super-steps (max iterations) before the algorithm terminates. Default = 10
    :param regularization: (float) value between 0 .. 1
    :param alpha: (double) value between 0 .. 1
    :param num_factors: (int) number of the desired factors (rank)
    :param use_implicit: (bool) use implicit preference
    :param num_user_blocks: (int) number of user blocks
    :param num_item_blocks: (int) number of item blocks
    :param checkpoint_iterations: (int) Number of iterations between checkpoints
    :param target_rmse: (double) target RMSE
    :return: (CollaborativeFilteringModel) A trained collaborative filtering model
    """
    from sparktk.frame.frame import Frame
    require_type(Frame, frame, 'frame')
    require_type.non_empty_str(source_column_name, "source_column_name")
    require_type.non_empty_str(dest_column_name, "dest_column_name")
    require_type.non_empty_str(weight_column_name, "weight_column_name")
    require_type.non_negative_int(max_steps, "max_steps")
    require_type(float, regularization, "regularization")
    if regularization > 1 or regularization < 0:
        raise ValueError(
            "'regularization' parameter must have a value between 0 and 1")
    require_type(float, alpha, "alpha")
    if alpha > 1 or alpha < 0:
        raise ValueError("'alpha' parameter must have a value between 0 and 1")
    require_type.non_negative_int(num_factors, "num_factors")
    require_type(bool, use_implicit, "use_implicit")
    require_type.non_negative_int(num_user_blocks, "num_user_blocks")
    require_type.non_negative_int(num_item_blocks, "num_item_blocks")
    require_type.non_negative_int(checkpoint_iterations,
                                  "checkpoint_iterations")
    require_type(float, target_rmse, "target_rmse")
    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_model = _scala_obj.train(frame._scala, source_column_name,
                                   dest_column_name, weight_column_name,
                                   max_steps, regularization, alpha,
                                   num_factors, use_implicit, num_user_blocks,
                                   num_item_blocks, checkpoint_iterations,
                                   target_rmse)
    return CollaborativeFilteringModel(tc, scala_model)
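
A hedged usage sketch of the train function above; the data and column names are illustrative only, and the frame is built with tc.frame.create as shown elsewhere in this listing.

# Hypothetical (user, item, rating) triples.
frame = tc.frame.create([[1, 3, 1.0], [1, 4, 2.5], [2, 3, 5.0], [2, 4, 1.0]],
                        [("user", int), ("item", int), ("rating", float)])

model = train(frame,
              source_column_name="user",
              dest_column_name="item",
              weight_column_name="rating",
              max_steps=10,
              num_factors=3)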
Example #21
def export_to_jdbc(self, connection_url, table_name, overwrite=False):
    """
    Write current frame to JDBC table

    Parameters
    ----------

    :param connection_url: (str) JDBC connection url to database server
    :param table_name: (str) JDBC table name
    :param overwrite: (Optional(bool)) Specify whether or not to overwrite the existing table, if one already exists with the
                      the same name.  If overwrite is set to False and a table with the same name already exists, an
                      exception is thrown.

    Example
    -------

    <skip>

        >>> from sparktk import TkContext
        >>> c=TkContext(sc)
        >>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
        >>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
        >>> my_frame = tc.frame.create(data, schema)
        <progress>
    </skip>

    connection_url: (string) "jdbc:{database_type}://{host}/{database_name}"

    Sample connection string for Postgres,
    e.g. jdbc:postgresql://localhost/postgres [standard connection string to connect to the default 'postgres' database]

    table_name: (string) Table name. A new table with the given name will be created if it does not already exist.

    <skip>
        >>> my_frame.export_to_jdbc("jdbc:postgresql://localhost/postgres", "demo_test")
        <progress>
    </skip>

    Verify the exported frame in Postgres.

        From a bash shell:

        $ sudo -su postgres psql
        postgres=# \d

    You should see the demo_test table.

    Run postgres=# select * from demo_test; to verify the frame.

    Notes
    -----

        java.sql.SQLException: No suitable driver found for <jdbcUrl>

    If this error is encountered while running your application, then your JDBC library cannot be found by the node
    running the application. If you're running in Local mode, make sure that you have used the --driver-class-path
    parameter. If a Spark cluster is involved, make sure that each cluster member has a copy of the library, and that
    each node of the cluster has been restarted since you modified the spark-defaults.conf file.  See this
    [site](https://sparkour.urizone.net/recipes/using-jdbc/).

    Sparktk does not come with any JDBC drivers.  A driver compatible with the JDBC data sink must be supplied when
    creating the TkContext instance:

        <skip>
        >>> tc = sparktk.TkContext(pyspark_submit_args='--jars myJDBCDriver.jar')
        </skip>
    """

    require_type.non_empty_str(connection_url, "connection_url")
    require_type.non_empty_str(table_name, "table_name")
    require_type(bool, overwrite, "overwrite")

    self._scala.exportToJdbc(connection_url, table_name, overwrite)
Example #22
def train(frame,
          source_column_name,
          dest_column_name,
          weight_column_name,
          max_steps=10,
          regularization=0.5,
          alpha=0.5,
          num_factors=3,
          use_implicit=False,
          num_user_blocks=2,
          num_item_blocks=3,
          checkpoint_iterations=10,
          target_rmse=0.05):
    """
    Create collaborative filtering model by training on given frame

    Parameters
    ----------

    :param frame: (Frame) The frame containing the data to train on
    :param source_column_name: (str) source column name.
    :param dest_column_name: (str) destination column name.
    :param weight_column_name: (str) weight column name.
    :param max_steps: (int) max number of super-steps (max iterations) before the algorithm terminates. Default = 10
    :param regularization: (float) value between 0 .. 1
    :param alpha: (double) value between 0 .. 1
    :param num_factors: (int) number of the desired factors (rank)
    :param use_implicit: (bool) use implicit preference
    :param num_user_blocks: (int) number of user blocks
    :param num_item_blocks: (int) number of item blocks
    :param checkpoint_iterations: (int) Number of iterations between checkpoints
    :param target_rmse: (double) target RMSE
    :return: (CollaborativeFilteringModel) A trained collaborative filtering model
    """
    from sparktk.frame.frame import Frame
    require_type(Frame, frame, 'frame')
    require_type.non_empty_str(source_column_name, "source_column_name")
    require_type.non_empty_str(dest_column_name, "dest_column_name")
    require_type.non_empty_str(weight_column_name, "weight_column_name")
    require_type.non_negative_int(max_steps, "max_steps")
    require_type(float, regularization, "regularization")
    if regularization > 1 or regularization < 0:
        raise ValueError("'regularization' parameter must have a value between 0 and 1")
    require_type(float, alpha, "alpha")
    if alpha > 1 or alpha < 0:
        raise ValueError("'alpha' parameter must have a value between 0 and 1")
    require_type.non_negative_int(num_factors, "num_factors")
    require_type(bool, use_implicit, "use_implicit")
    require_type.non_negative_int(num_user_blocks, "num_user_blocks")
    require_type.non_negative_int(num_item_blocks, "num_item_blocks")
    require_type.non_negative_int(checkpoint_iterations, "checkpoint_iterations")
    require_type(float, target_rmse, "target_rmse")
    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_model = _scala_obj.train(frame._scala,
                                   source_column_name,
                                   dest_column_name,
                                   weight_column_name,
                                   max_steps,
                                   regularization,
                                   alpha,
                                   num_factors,
                                   use_implicit,
                                   num_user_blocks,
                                   num_item_blocks,
                                   checkpoint_iterations,
                                   target_rmse)
    return CollaborativeFilteringModel(tc, scala_model)
Example #23
def import_xml(file_name, record_tag, tc=TkContext.implicit):
    """
    Imports a file of XML records

    XML records can span multiple lines.  Returns a Frame of one column containing an XML string per row

    Note: Only records which start with the given tag will be included (multiple different tags not supported)

    Parameters
    ----------

    :param file_name: file path
    :param record_tag: value of the XML element which contains a record
    :return: Frame

    Examples
    --------

    Consider a file of XML records:

        <?xml version="1.0" encoding="UTF-8"?>
        <table>
            <shape type="triangle">
                <x>0</x>
                <y>0</y>
                <size>12</size>
            </shape>
            <shape type="square">
                <x>8</x>
                <y>0</y>
                <size>4</size>
            </shape>
            <shape color="blue" type="pentagon">
                <x>0</x>
                <y>10</y>
                <size>2</size>
            </shape>
            <shape type="square">
                <x>-4</x>
                <y>6</y>
                <size>7</size>
            </shape>
        </table>

    We can parse this file into a frame of records:

        >>> f = tc.frame.import_xml("../datasets/shapes1.xml", "shape")
        >>> f.inspect()
        [#]  records
        =========================================
        [0]  <shape type="triangle">
                     <x>0</x>
                     <y>0</y>
                     <size>12</size>
                 </shape>
        [1]  <shape type="square">
                     <x>8</x>
                     <y>0</y>
                     <size>4</size>
                 </shape>
        [2]  <shape color="blue" type="pentagon">
                     <x>0</x>
                     <y>10</y>
                     <size>2</size>
                 </shape>
        [3]  <shape type="square">
                     <x>-4</x>
                     <y>6</y>
                     <size>7</size>
                 </shape>


    We can further break the XML records into individual columns with a map_columns (or add_columns) operation:


        >>> import xml.etree.ElementTree as ET
        >>> def parse_my_xml(row):
        ...     ele = ET.fromstring(row[0])
        ...     return [ele.get("type"), int(ele.find("x").text), int(ele.find("y").text), int(ele.find("size").text)]

        >>> f2 = f.map_columns(parse_my_xml, [('shape', str), ('x', int), ('y', int), ('size', int)])

        >>> f2.inspect()
        [#]  shape     x   y   size
        ===========================
        [0]  triangle   0   0    12
        [1]  square     8   0     4
        [2]  pentagon   0  10     2
        [3]  square    -4   6     7


    Consider another file of XML records, this time with different element names for the records:

        <?xml version="1.0" encoding="UTF-8"?>
        <shapes>
            <triangle>
                <x>0</x>
                <y>0</y>
                <size>12</size>
            </triangle>
            <square>
                <x>8</x>
                <y>0</y>
                <size>4</size>
            </square>
            <pentagon color="blue">
                <x>0</x>
                <y>10</y>
                <size>2</size>
            </pentagon>
            <square>
                <x>-4</x>
                <y>6</y>
                <size>7</size>
            </square>
        </shapes>

    We can parse this file into a frame of records of a single type.  We must pick only one.  The others
    will be filtered out:

        >>> f3 = tc.frame.import_xml("../datasets/shapes2.xml", "square")
        >>> f3.inspect()
        [#]  records
        ===========================
        [0]  <square>
                     <x>8</x>
                     <y>0</y>
                     <size>4</size>
                 </square>
        [1]  <square>
                     <x>-4</x>
                     <y>6</y>
                     <size>7</size>
                 </square>


    We can further break the XML records into individual columns with a map_columns (or add_columns) operation:


        >>> def parse_my_squares(row):
        ...     ele = ET.fromstring(row[0])
        ...     return [int(ele.find("x").text), int(ele.find("y").text), int(ele.find("size").text)]

        >>> f4 = f3.map_columns(parse_my_squares, [('x', int), ('y', int), ('size', int)])

        >>> f4.inspect()
        [#]  x   y  size
        ================
        [0]   8  0     4
        [1]  -4  6     7

    """

    TkContext.validate(tc)
    require_type.non_empty_str(file_name, "file_name")
    require_type.non_empty_str(record_tag, "record_tag")
    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.\
        frame.internal.constructors.ImportMultiLineRecords.importXml(tc.jutils.get_scala_sc(), file_name, record_tag)
    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
Example #24
def import_csv_raw(path, delimiter=",", header=False, tc=TkContext.implicit):
    """
    Creates a frame by importing the data as strings from the specified csv file.  If the csv file has a header row,
    those values will be used as column names.  Otherwise, columns will be named generically, like 'C0', 'C1', 'C2', etc.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (str) A string which indicates the separation of data fields.  This is usually a single character
                      and could be a non-visible character, such as a tab. The default delimiter is a comma (,).
    :param header: (bool) Boolean value indicating if the first line of the file will be used to name columns, and not
                   be included in the data.  The default value is false.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Import raw data from a csv file by specifying the path to the file, delimiter, and header option.  All data will
    be brought in the frame as strings, and columns will be named according to the header row, if there was one.

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv_raw(file_path, delimiter="|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]  1     Portland     609456           583776           4.40%   Multnomah
        [1]  2     Salem        160614           154637           3.87%   Marion
        [2]  3     Eugene       159190           156185           1.92%   Lane
        [3]  4     Gresham      109397           105594           3.60%   Multnomah
        [4]  5     Hillsboro    97368            91611            6.28%   Washington
        [5]  6     Beaverton    93542            89803            4.16%   Washington
        [6]  15    Grants Pass  35076            34533            1.57%   Josephine
        [7]  16    Oregon City  34622            31859            8.67%   Clackamas
        [8]  17    McMinnville  33131            32187            2.93%   Yamhill
        [9]  18    Redmond      27427            26215            4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'str'>), ('city', <type 'str'>), ('population_2013', <type 'str'>), ('population_2010', <type 'str'>), ('change', <type 'str'>), ('county', <type 'str'>)]


    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=str(header).lower(),
            inferschema="false").load(path, schema=None)

    df_schema = []

    for column in df.schema.fields:
        try:
            datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(
                type(column.dataType))
        except ValueError:
            raise TypeError(
                "Unsupported data type ({0}) for column {1}.".format(
                    str(column.dataType), column.name))
        df_schema.append((column.name, datatype))

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(
        df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
Example #25
def import_json(file_name, tc=TkContext.implicit):
    """
    Imports a file of JSON records

    JSON records can span multiple lines.  Returns a Frame of one column containing a JSON string per row

    Parameters
    ----------

    :param file_name: file path
    :return: Frame

    Examples
    --------

    Consider a file of JSON records:

        { "obj": {
            "color": "blue",
            "size": 4,
            "shape": "square" }
          }
          { "obj": {
          "color": "green",
          "size": 3,
          "shape": "triangle" }
          }
          { "obj": { "color": "yellow", "size": 5, "shape": "pentagon" } }
          { "obj": {
          "color": "orange",
          "size": 2,
          "shape": "lentil" }
        }

    We can parse this file into a frame of records:

        >>> f = tc.frame.import_json("../datasets/shapes.json")
        >>> f.inspect()
        [#]  records
        =====================================================================
        [0]  { "obj": {
               "color": "blue",
               "size": 4,
               "shape": "square" }
             }
        [1]  { "obj": {
             "color": "green",
             "size": 3,
             "shape": "triangle" }
             }
        [2]  { "obj": { "color": "yellow", "size": 5, "shape": "pentagon" } }
        [3]  { "obj": {
             "color": "orange",
             "size": 2,
             "shape": "lentil" }
             }


    We can further break the JSON records into individual columns with a map_columns (or add_columns) operation:

        >>> import json
        >>> def parse_my_json(row):
        ...     record = json.loads(row.records)['obj']
        ...     return [record['color'], record['size'], record['shape']]

        >>> f2 = f.map_columns(parse_my_json, [('color', str), ('size', int), ('shape', str)])
        >>> f2.inspect()
        [#]  color   size  shape
        ===========================
        [0]  blue       4  square
        [1]  green      3  triangle
        [2]  yellow     5  pentagon
        [3]  orange     2  lentil

    """

    TkContext.validate(tc)
    require_type.non_empty_str(file_name, "file_name")
    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.\
        frame.internal.constructors.ImportMultiLineRecords.importJson(tc.jutils.get_scala_sc(), file_name)
    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
Example #26
def train(frame,
          observation_columns,
          label_column,
          num_trees = 1,
          impurity = "variance",
          max_depth = 4,
          max_bins = 100,
          min_instances_per_node = 1,
          sub_sampling_rate = 1.0,
          feature_subset_category = "auto",
          seed = None,
          categorical_features_info = None):
    """
    Creates a Random Forest Regressor Model by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) A frame of training data
    :param observation_columns: (list(str)) Column(s) containing the observations
    :param label_column: (str) Column name containing the label for each observation
    :param num_trees: (int) Number of trees in the random forest. Default is 1
    :param impurity: (str) Criterion used for information gain calculation. Default value is "variance".
    :param max_depth: (int) Maximum depth of the tree. Default is 4
    :param max_bins: (int) Maximum number of bins used for splitting features.
    :param min_instances_per_node: (int) Minimum number of records each child node must have after a split.
    :param sub_sampling_rate: (double) Fraction between 0..1 of the training data used for learning each decision tree.
    :param feature_subset_category: (str) Subset of observation columns, i.e., features,
                                 to consider when looking for the best split.
                                 Supported values "auto","all","sqrt","log2","onethird".
                                 If "auto" is set, this is based on num_trees: if num_trees == 1, set to "all"
                                 ; if num_trees > 1, set to "sqrt".
    :param seed: (Optional(int)) Random seed for bootstrapping and choosing feature subsets. Default is a randomly chosen seed.
    :param categorical_features_info: (Optional(Dict(str:int))) Arity of categorical features. Entry (name-> k) indicates
                                      that feature 'name' is categorical with 'k' categories indexed from 0:{0,1,...,k-1}

    :return: (RandomForestRegressorModel) The trained random forest regressor model

    Notes
    -----
    Random Forest is a supervised ensemble learning algorithm used to perform regression. A Random Forest
    Regressor model is initialized, trained on columns of a frame, and used to predict the value of each
    observation in the frame. This model runs the Spark ML implementation of Random Forest. During training,
    the decision trees are trained in parallel. During prediction, the average of all the trees' predicted
    values is the predicted value of the random forest.

    """
    require_type(Frame, frame, 'frame')
    column_list = affirm_type.list_of_str(observation_columns, "observation_columns")
    require_type.non_empty_str(label_column, "label_column")
    require_type.non_negative_int(num_trees, "num_trees")
    require_type.non_empty_str(impurity, "impurity")
    require_type.non_negative_int(max_depth, "max_depth")
    require_type.non_negative_int(max_bins, "max_bins")
    require_type.non_negative_int(min_instances_per_node, "min_instances_per_node")
    require_type(float, sub_sampling_rate, "sub_sampling_rate")
    if sub_sampling_rate > 1 or sub_sampling_rate < 0:
        raise ValueError("'sub_sampling_rate' parameter must have a value between 0 and 1")
    require_type.non_empty_str(feature_subset_category, "feature_subset_category")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
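    # if no seed was given, derive a random 16-bit seed from os.urandom (Python 2 'hex' encoding)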
    seed = int(os.urandom(2).encode('hex'), 16) if seed is None else seed
    scala_model = _scala_obj.train(frame._scala,
                                   tc.jutils.convert.to_scala_list_string(column_list),
                                   label_column,
                                   num_trees,
                                   impurity,
                                   max_depth,
                                   max_bins,
                                   min_instances_per_node,
                                   sub_sampling_rate,
                                   feature_subset_category,
                                   seed,
                                   __get_categorical_features_info(tc, categorical_features_info))

    return RandomForestRegressorModel(tc, scala_model)
Example No. 27
def import_csv(path, delimiter=",", header=False, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns
                   (unless a schema is provided), and not be included in the data.  The default value is false.
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type), as shown in the tuple-schema example further below.
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.  The column names specified will override column names that are found in the header row.
    * None, where the schema is automatically inferred based on the data.  Columns are named based on the header, or will be named generically ("C0", "C1", "C2", etc).

    :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
                        pattern syntax described at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
                        (a datetime sketch appears in the Examples below).
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Load a frame from a csv file by specifying the path to the file, the delimiter, and the header option.

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the
    data types are inferred based on the data).  Here, we will specify the column names, which will override the
    header from the csv file.

        >>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"]
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names)
        -etc-

        >>> frame.schema
        [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)]
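
    A full schema can also be provided as a list of (column name, data type) tuples.  The sketch below
    assumes the same pipe-delimited cities file and simply declares the types that were inferred above:

        >>> schema = [("rank", int), ("city", str), ("population_2013", int),
        ...           ("population_2010", int), ("change", str), ("county", str)]

        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=schema)
        -etc-

        <skip>
        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]
        </skip>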

        <hide>
        >>> file_path = "../datasets/unicode.csv"
        >>> schema = [("a", unicode), ("b", unicode), ("c", unicode)]
        >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False)
        -etc-

        >>> frame.inspect()
        [#]  a  b  c
        ============
        [0]  à  ë  ñ
        [1]  ã  ê  ü

        </hide>
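
    As a sketch only (assuming a hypothetical file "../datasets/events.csv" whose second column holds
    ISO-8601 timestamps, and that the sparktk datetime type is importable as shown), datetime columns
    are declared in the schema and datetime_format describes how the raw strings are parsed:

        <skip>
        >>> from sparktk import dtypes

        >>> schema = [("id", int), ("event_time", dtypes.datetime)]

        >>> frame = tc.frame.import_csv("../datasets/events.csv",
        ...                             schema=schema,
        ...                             datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX")
        -etc-
        </skip>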

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    require_type(str, datetime_format, "datetime_format")

    infer_schema = True
    column_names = []   # custom column names

    if schema is not None:
        if not isinstance(schema, list):
            raise TypeError("Unsupported type %s for schema parameter." % type(schema))
        elif all(isinstance(item, basestring) for item in schema):
            # schema is just column names
            column_names = schema
            schema = None
        else:
            infer_schema = False   # if a custom schema is provided, don't waste time inferring the schema during load
            sparktk_schema.validate(schema)

    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if schema is not None:
        fields = []
        for column in schema:
            if column[1] in dtypes._data_type_to_pyspark_type_table:
                fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat=datetime_format,
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        for i, column in enumerate(df.schema.fields):
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            column_name = column_names[i] if (i < len(column_names)) else column.name
            df_schema.append((column_name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the "
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame gives us datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
Example No. 28
def import_csv_raw(path, delimiter=",", header=False, tc=TkContext.implicit):
    """
    Creates a frame by importing the data as strings from the specified csv file.  If the csv file has a header row,
    those values will be used as column names.  Otherwise, columns will be named generically, like 'C0', 'C1', 'C2', etc.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (str) A string which indicates the separation of data fields.  This is usually a single character
                      and could be a non-visible character, such as a tab. The default delimiter is a comma (,).
    :param header: (bool) Boolean value indicating if the first line of the file will be used to name columns, and not
                   be included in the data.  The default value is false.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Import raw data from a csv file by specifying the path to the file, the delimiter, and the header option.  All data
    will be brought into the frame as strings, and columns will be named according to the header row, if there was one.

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv_raw(file_path, delimiter="|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]  1     Portland     609456           583776           4.40%   Multnomah
        [1]  2     Salem        160614           154637           3.87%   Marion
        [2]  3     Eugene       159190           156185           1.92%   Lane
        [3]  4     Gresham      109397           105594           3.60%   Multnomah
        [4]  5     Hillsboro    97368            91611            6.28%   Washington
        [5]  6     Beaverton    93542            89803            4.16%   Washington
        [6]  15    Grants Pass  35076            34533            1.57%   Josephine
        [7]  16    Oregon City  34622            31859            8.67%   Clackamas
        [8]  17    McMinnville  33131            32187            2.93%   Yamhill
        [9]  18    Redmond      27427            26215            4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'str'>), ('city', <type 'str'>), ('population_2013', <type 'str'>), ('population_2010', <type 'str'>), ('change', <type 'str'>), ('county', <type 'str'>)]


    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
        delimiter=delimiter,
        header=str(header).lower(),
        inferschema="false").load(path, schema=None)

    df_schema = []

    for column in df.schema.fields:
        try:
            datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
        except ValueError:
            raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
        df_schema.append((column.name, datatype))

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)