示例#1
0
    def __init__(self, tc, source, schema=None, validate_schema=False):
        """
        Builds the frame backend from the given source.

        :param tc: TkContext
        :param source: a Scala frame, a Scala RDD, a python RDD, or a list of row data
        :param schema: (Optional(list[tuple(str, type)] or list[str])) full schema, just
                       column names (data types inferred), or None (names and types inferred)
        :param validate_schema: (bool) when True, run the data through schema validation,
                       attempting to parse values to the schema's data types
        """
        self._tc = tc
        if self._is_scala_frame(source):
            # already a Scala frame; adopt it directly
            self._frame = source
        elif self.is_scala_rdd(source):
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
        else:
            if not isinstance(source, RDD):
                if isinstance(schema, list):
                    if all(isinstance(item, basestring) for item in schema):
                        # check if schema is just a list of column names (versus string and data type tuples)
                        schema = self._infer_schema(source, schema)
                    elif not all(isinstance(item, tuple) and
                                  len(item) == 2 and
                                  isinstance(item[0], basestring) for item in schema):
                        # BUG FIX: the original format string had no %s placeholder, so the
                        # '% type(schema)' raised "not all arguments converted during string
                        # formatting" instead of this intended message.  Also use basestring
                        # (not str) for the column-name check, consistent with the check above.
                        raise TypeError("Invalid schema.  Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                    else:
                        for item in schema:
                            if not self._is_supported_datatype(item[1]):
                                raise TypeError("Invalid schema.  %s is not a supported data type." % str(item[1]))
                elif schema is None:
                    # no schema at all; infer both column names and data types from the data
                    schema = self._infer_schema(source)
                else:
                    # Schema is not a list or None
                    raise TypeError("Invalid schema type: %s.  Expected a list of tuples (str, type) with the column name and data type." % type(schema))
                source = tc.sc.parallelize(source)
            if schema and validate_schema:
                # Validate schema by going through the data and checking the data type and attempting to parse it
                validate_schema_result = self.validate_pyrdd_schema(source, schema)
                source = validate_schema_result.validated_rdd
                logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)

            self._frame = PythonFrame(source, schema)
示例#2
0
    def __init__(self, tc, source, schema=None, validate_schema=False):
        """
        Builds the frame backend from the given source.

        :param tc: TkContext
        :param source: a Scala frame, Scala RDD, Scala DataFrame, python DataFrame,
                       PythonFrame, python RDD, or a list of row data (lists/tuples)
        :param schema: (Optional(list[tuple(str, type)] or list[str])) full schema, just
                       column names (data types inferred), or None (names and types inferred)
        :param validate_schema: (bool) when True, run the data through schema validation,
                       attempting to parse values to the schema's data types
        :raises TypeError: for an invalid data source, schema shape, or unsupported data type
        :raises ValueError: when the schema contains duplicate column names
        """
        self._tc = tc
        if self._is_scala_frame(source):
            # already a Scala frame; adopt it directly
            self._frame = source
        elif self.is_scala_rdd(source):
            # Scala RDD plus schema -> Scala frame
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
        elif self.is_scala_dataframe(source):
            self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source)
        elif isinstance(source, DataFrame):
            # python DataFrame: unwrap its underlying Java DataFrame
            self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
        elif isinstance(source, PythonFrame):
            self._frame = source
        else:
            # source is either a python RDD or raw local row data
            if not isinstance(source, RDD):
                # raw data must be a 2-dimensional list (a list of rows)
                if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                    raise TypeError("Invalid data source.  The data parameter must be a 2-dimensional list (list of row data) or an RDD.")

                # track whether types were inferred, to tailor the error message below
                inferred_schema = False
                if isinstance(schema, list):
                    if all(isinstance(item, basestring) for item in schema):
                        # check if schema is just a list of column names (versus string and data type tuples)
                        schema = self._infer_schema(source, schema)
                        inferred_schema = True
                    elif not all(isinstance(item, tuple) and
                                  len(item) == 2 and
                                  isinstance(item[0], basestring) for item in schema):
                        raise TypeError("Invalid schema.  Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                    # check for duplicate column names
                    column_names = [col[0] for col in schema]
                    duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
                    if len(duplicate_column_names) > 0:
                        raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
                elif schema is None:
                    # no schema at all; infer both column names and data types from the data
                    schema = self._infer_schema(source)
                    inferred_schema = True
                else:
                    # Schema is not a list or None
                    raise TypeError("Invalid schema type: %s.  Expected a list of tuples (str, type) with the column name and data type." % type(schema))
                # reject unsupported data types, with extra guidance when the offending
                # type came from inference rather than from the caller's own schema
                for item in schema:
                    if not self._is_supported_datatype(item[1]):
                        if inferred_schema:
                            raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                            "supported data type.  Instead, specify a schema that uses a supported data "
                                            "type, and enable validate_schema so that the data is converted to the proper "
                                            "data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
                                            (str(item[1]), str(schema), dtypes.dtypes))
                        else:
                            raise TypeError("Invalid schema.  %s is not a supported data type.\n\nSupported data types: %s" %
                                            (str(item[1]), dtypes.dtypes))

                source = tc.sc.parallelize(source)
            if schema and validate_schema:
                # Validate schema by going through the data and checking the data type and attempting to parse it
                validate_schema_result = self.validate_pyrdd_schema(source, schema)
                source = validate_schema_result.validated_rdd
                logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)

            self._frame = PythonFrame(source, schema)
示例#3
0
 def _scala(self):
     """Returns the frame backend as a Scala Frame, converting from the Python backend first when needed."""
     if not self._is_python:
         return self._frame
     # Currently backed by a PythonFrame -- convert it to a Scala Frame.
     sc = self._tc.sc
     jvm_schema = schema_to_scala(sc, self._frame.schema)
     jvm_rdd = sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, jvm_schema)
     self._frame = self.create_scala_frame(sc, jvm_rdd, jvm_schema)
     return self._frame
示例#4
0
 def _scala(self):
     """Returns the frame backend as a Scala Frame, converting from the Python backend first when needed."""
     if self._is_python:
         # Matrix-typed columns hold ndarrays; coerce them to pymlib DenseMatrix so the
         # rows can be serialized on the Java side.
         self._frame.rdd = MatrixCoercion.schema_is_coercible(self._frame.rdd, list(self._frame.schema), True)
         # Convert the PythonFrame into a Scala Frame.
         sc = self._tc.sc
         jvm_schema = schema_to_scala(sc, self._frame.schema)
         jvm_rdd = sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, jvm_schema)
         self._frame = self.create_scala_frame(sc, jvm_rdd, jvm_schema)
     return self._frame
示例#5
0
 def __init__(self, tc, source, schema=None):
     """Wraps the given source (Scala frame, Scala RDD, python RDD, or raw row data) as a frame backend."""
     self._tc = tc
     if self._is_scala_frame(source):
         # Already a Scala frame; adopt it as-is.
         self._frame = source
     elif self.is_scala_rdd(source):
         # Scala RDD plus converted schema -> Scala frame.
         self._frame = self.create_scala_frame(tc.sc, source, schema_to_scala(tc.sc, schema))
     else:
         # Raw local data is distributed first; an existing RDD is used directly.
         rdd = source if isinstance(source, RDD) else tc.sc.parallelize(source)
         if schema:
             self.validate_pyrdd_schema(rdd, schema)
         self._frame = PythonFrame(rdd, schema)
示例#6
0
 def __init__(self, tc, source, schema=None):
     """Builds the frame backend from a Scala frame, a Scala RDD, a python RDD, or raw row data."""
     self._tc = tc
     if self._is_scala_frame(source):
         self._frame = source
     elif self.is_scala_rdd(source):
         converted_schema = schema_to_scala(tc.sc, schema)
         self._frame = self.create_scala_frame(tc.sc, source, converted_schema)
     else:
         if isinstance(source, RDD):
             py_rdd = source
         else:
             # Raw local data -- distribute it first.
             py_rdd = tc.sc.parallelize(source)
         if schema:
             self.validate_pyrdd_schema(py_rdd, schema)
         self._frame = PythonFrame(py_rdd, schema)
示例#7
0
 def _scala(self):
     """Returns the frame backend as a Scala Frame, converting from the Python backend first when needed."""
     if not self._is_python:
         return self._frame
     # Matrix-typed columns hold ndarrays; coerce them to pymlib DenseMatrix so the
     # rows can be serialized on the Java side.
     self._frame.rdd = MatrixCoercion.schema_is_coercible(self._frame.rdd, list(self._frame.schema), True)
     # Convert the PythonFrame into a Scala Frame.
     sc = self._tc.sc
     jvm_schema = schema_to_scala(sc, self._frame.schema)
     jvm_rdd = sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, jvm_schema)
     self._frame = self.create_scala_frame(sc, jvm_rdd, jvm_schema)
     return self._frame
示例#8
0
def rename_columns(self, names):
    """
    Rename columns

    Parameters
    ----------

    :param names: (dict) Dictionary of old names to new names.
    :raises ValueError: if names is not a dict
    :raises RuntimeError: if the frame has no schema defined

    Examples
    --------
    Start with a frame with columns *Black* and *White*.

        <hide>

        >>> s = [('Black', unicode), ('White', unicode)]
        >>> rows = [["glass", "clear"],["paper","unclear"]]
        >>> my_frame = tc.frame.create(rows, s)
        -etc-

        </hide>

        >>> print my_frame.schema
        [('Black', <type 'unicode'>), ('White', <type 'unicode'>)]

    Rename the columns to *Mercury* and *Venus*:

        >>> my_frame.rename_columns({"Black": "Mercury", "White": "Venus"})

        >>> print my_frame.schema
        [(u'Mercury', <type 'unicode'>), (u'Venus', <type 'unicode'>)]

    """
    if not isinstance(names, dict):
        raise ValueError(
            "Unsupported 'names' parameter type.  Expected dictionary, but found %s."
            % type(names))
    if self.schema is None:
        # BUG FIX: message previously read "Unable rename column(s)".
        raise RuntimeError(
            "Unable to rename column(s), because the frame's schema has not been defined."
        )
    if self._is_python:
        # python backend: only the schema needs updating; reuse the Scala schema's
        # renameColumns logic by round-tripping the schema through Scala.
        scala_rename_map = self._tc.jutils.convert.to_scala_map(names)
        scala_schema = schema_to_scala(self._tc.sc, self._python.schema)
        rename_scala_schema = scala_schema.renameColumns(scala_rename_map)
        self._python.schema = schema_to_python(self._tc.sc,
                                               rename_scala_schema)
    else:
        # Scala backend: delegate directly to the Scala frame.
        self._scala.renameColumns(self._tc.jutils.convert.to_scala_map(names))
示例#9
0
def rename_columns(self, names):
    """
    Rename columns

    Parameters
    ----------

    :param names: (dict) Dictionary of old names to new names.
    :raises ValueError: if names is not a dict
    :raises RuntimeError: if the frame has no schema defined

    Examples
    --------
    Start with a frame with columns *Black* and *White*.

        <hide>

        >>> s = [('Black', unicode), ('White', unicode)]
        >>> rows = [["glass", "clear"],["paper","unclear"]]
        >>> my_frame = tc.frame.create(rows, s)
        -etc-

        </hide>

        >>> print my_frame.schema
        [('Black', <type 'unicode'>), ('White', <type 'unicode'>)]

    Rename the columns to *Mercury* and *Venus*:

        >>> my_frame.rename_columns({"Black": "Mercury", "White": "Venus"})

        >>> print my_frame.schema
        [(u'Mercury', <type 'unicode'>), (u'Venus', <type 'unicode'>)]

    """
    if not isinstance(names, dict):
        raise ValueError("Unsupported 'names' parameter type.  Expected dictionary, but found %s." % type(names))
    if self.schema is None:
        # BUG FIX: message previously read "Unable rename column(s)".
        raise RuntimeError("Unable to rename column(s), because the frame's schema has not been defined.")
    if self._is_python:
        # python backend: only the schema needs updating; reuse the Scala schema's
        # renameColumns logic by round-tripping the schema through Scala.
        scala_rename_map = self._tc.jutils.convert.to_scala_map(names)
        scala_schema = schema_to_scala(self._tc.sc, self._python.schema)
        rename_scala_schema = scala_schema.renameColumns(scala_rename_map)
        self._python.schema = schema_to_python(self._tc.sc, rename_scala_schema)
    else:
        # Scala backend: delegate directly to the Scala frame.
        self._scala.renameColumns(self._tc.jutils.convert.to_scala_map(names))
示例#10
0
 def _frame_to_scala(self, python_frame):
     """Builds and returns a Scala Frame equivalent to the given PythonFrame."""
     sc = self._tc.sc
     jvm_schema = schema_to_scala(sc, python_frame.schema)
     jvm_rdd = sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, jvm_schema)
     return self.create_scala_frame(sc, jvm_rdd, jvm_schema)
示例#11
0
 def append_csv_file(self, file_name, schema, separator=','):
     """Appends the rows of the given CSV file to this frame, parsed using the given schema."""
     scala_schema = schema_to_scala(self._tc.sc, schema)
     self._scala.appendCsvFile(file_name, scala_schema, separator)
示例#12
0
 def _frame_to_scala(self, python_frame):
     """Converts the given PythonFrame into an equivalent Scala Frame and returns it."""
     spark_ctx = self._tc.sc
     converted_schema = schema_to_scala(spark_ctx, python_frame.schema)
     converted_rdd = spark_ctx._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, converted_schema)
     return self.create_scala_frame(spark_ctx, converted_rdd, converted_schema)
示例#13
0
    def __init__(self, tc, source, schema=None, validate_schema=False):
        """
        (Private constructor -- use tc.frame.create or other methods available from the TkContext)

        :param tc: TkContext
        :param source: a Scala frame, Scala RDD, Scala DataFrame, python DataFrame,
                       PythonFrame, python RDD, or a list of row data (lists/tuples)
        :param schema: (Optional(list[tuple(str, type)] or list[str])) full schema, just
                       column names (data types inferred), or None (names and types inferred)
        :param validate_schema: (bool) when True, run the data through schema validation,
                       attempting to parse values to the schema's data types
        :raises TypeError: for an invalid data source, schema shape, or unsupported data type
        :raises ValueError: when the schema contains duplicate column names
        """
        self._tc = tc
        if self._is_scala_frame(source):
            # already a Scala frame; adopt it directly
            self._frame = source
        elif self._is_scala_rdd(source):
            # Scala RDD plus converted schema -> Scala frame
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
        elif self._is_scala_dataframe(source):
            self._frame = self._create_scala_frame_from_scala_dataframe(
                tc.sc, source)
        elif isinstance(source, DataFrame):
            # python DataFrame: unwrap its underlying Java DataFrame
            self._frame = self._create_scala_frame_from_scala_dataframe(
                tc.sc, source._jdf)
        elif isinstance(source, PythonFrame):
            self._frame = source
        else:
            # source is either a python RDD or raw local row data
            if not isinstance(source, RDD):
                # raw data must be a 2-dimensional list (a list of rows)
                if not isinstance(source, list) or (len(source) > 0 and any(
                        not isinstance(row, (list, tuple)) for row in source)):
                    raise TypeError(
                        "Invalid data source.  The data parameter must be a 2-dimensional list (list of row data) or an RDD."
                    )

                # track whether types were inferred, to tailor the error message below
                inferred_schema = False
                if isinstance(schema, list):
                    if all(isinstance(item, basestring) for item in schema):
                        # check if schema is just a list of column names (versus string and data type tuples)
                        schema = self._infer_schema(source, schema)
                        inferred_schema = True
                    elif not all(
                            isinstance(item, tuple) and len(item) == 2
                            and isinstance(item[0], basestring)
                            for item in schema):
                        raise TypeError(
                            "Invalid schema.  Expected a list of tuples (str, type) with the column name and data type, but received type %s."
                            % type(schema))
                    # check for duplicate column names
                    column_names = [col[0] for col in schema]
                    duplicate_column_names = set([
                        col for col in column_names
                        if column_names.count(col) > 1
                    ])
                    if len(duplicate_column_names) > 0:
                        raise ValueError(
                            "Invalid schema, column names cannot be duplicated: %s"
                            % ", ".join(duplicate_column_names))
                elif schema is None:
                    # no schema at all; infer both column names and data types from the data
                    schema = self._infer_schema(source)
                    inferred_schema = True
                else:
                    # Schema is not a list or None
                    raise TypeError(
                        "Invalid schema type: %s.  Expected a list of tuples (str, type) with the column name and data type."
                        % type(schema))
                # reject unsupported data types, with extra guidance when the offending
                # type came from inference rather than from the caller's own schema
                for item in schema:
                    if not self._is_supported_datatype(item[1]):
                        if inferred_schema:
                            raise TypeError(
                                "The %s data type was found when inferring the schema, and it is not a "
                                "supported data type.  Instead, specify a schema that uses a supported data "
                                "type, and enable validate_schema so that the data is converted to the proper "
                                "data type.\n\nInferred schema: %s\n\nSupported data types: %s"
                                % (str(item[1]), str(schema), dtypes.dtypes))
                        else:
                            raise TypeError(
                                "Invalid schema.  %s is not a supported data type.\n\nSupported data types: %s"
                                % (str(item[1]), dtypes.dtypes))

                source = tc.sc.parallelize(source)
            if schema and validate_schema:
                # Validate schema by going through the data and checking the data type and attempting to parse it
                validate_schema_result = self.validate_pyrdd_schema(
                    source, schema)
                source = validate_schema_result.validated_rdd
                logger.debug(
                    "%s values were unable to be parsed to the schema's data type."
                    % validate_schema_result.bad_value_count)

            # If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
            map_source = schema_is_coercible(source, list(schema))
            self._frame = PythonFrame(map_source, schema)
示例#14
0
    def __init__(self, tc, source, schema=None, validate_schema=False):
        """
        Builds the frame backend from the given source.

        :param tc: TkContext
        :param source: a Scala frame, Scala RDD, Scala DataFrame, python DataFrame,
                       PythonFrame, python RDD, or a list of row data (lists/tuples)
        :param schema: (Optional(list[tuple(str, type)] or list[str])) full schema, just
                       column names (data types inferred), or None (names and types inferred)
        :param validate_schema: (bool) when True, run the data through schema validation,
                       attempting to parse values to the schema's data types
        :raises TypeError: for an invalid data source, schema shape, or unsupported data type
        """
        self._tc = tc
        if self._is_scala_frame(source):
            # already a Scala frame; adopt it directly
            self._frame = source
        elif self.is_scala_rdd(source):
            # Scala RDD plus converted schema -> Scala frame
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
        elif self.is_scala_dataframe(source):
            self._frame = self.create_scala_frame_from_scala_dataframe(
                tc.sc, source)
        elif isinstance(source, DataFrame):
            # python DataFrame: unwrap its underlying Java DataFrame
            self._frame = self.create_scala_frame_from_scala_dataframe(
                tc.sc, source._jdf)
        elif isinstance(source, PythonFrame):
            self._frame = source
        else:
            # source is either a python RDD or raw local row data
            if not isinstance(source, RDD):
                # raw data must be a 2-dimensional list (a list of rows)
                if not isinstance(source, list) or (len(source) > 0 and any(
                        not isinstance(row, (list, tuple)) for row in source)):
                    raise TypeError(
                        "Invalid data source.  The data parameter must be a 2-dimensional list (list of row data) or an RDD."
                    )

                # track whether types were inferred, to tailor the error message below
                inferred_schema = False
                if isinstance(schema, list):
                    if all(isinstance(item, basestring) for item in schema):
                        # check if schema is just a list of column names (versus string and data type tuples)
                        schema = self._infer_schema(source, schema)
                        inferred_schema = True
                    elif not all(
                            isinstance(item, tuple) and len(item) == 2
                            and isinstance(item[0], basestring)
                            for item in schema):
                        raise TypeError(
                            "Invalid schema.  Expected a list of tuples (str, type) with the column name and data type, but received type %s."
                            % type(schema))
                elif schema is None:
                    # no schema at all; infer both column names and data types from the data
                    schema = self._infer_schema(source)
                    inferred_schema = True
                else:
                    # Schema is not a list or None
                    raise TypeError(
                        "Invalid schema type: %s.  Expected a list of tuples (str, type) with the column name and data type."
                        % type(schema))
                # reject unsupported data types, with extra guidance when the offending
                # type came from inference rather than from the caller's own schema
                for item in schema:
                    if not self._is_supported_datatype(item[1]):
                        if inferred_schema:
                            raise TypeError(
                                "The %s data type was found when inferring the schema, and it is not a "
                                "supported data type.  Instead, specify a schema that uses a supported data "
                                "type, and enable validate_schema so that the data is converted to the proper "
                                "data type.\n\nInferred schema: %s\n\nSupported data types: %s"
                                % (str(item[1]), str(schema), dtypes.dtypes))
                        else:
                            raise TypeError(
                                "Invalid schema.  %s is not a supported data type.\n\nSupported data types: %s"
                                % (str(item[1]), dtypes.dtypes))

                source = tc.sc.parallelize(source)
            if schema and validate_schema:
                # Validate schema by going through the data and checking the data type and attempting to parse it
                validate_schema_result = self.validate_pyrdd_schema(
                    source, schema)
                source = validate_schema_result.validated_rdd
                logger.debug(
                    "%s values were unable to be parsed to the schema's data type."
                    % validate_schema_result.bad_value_count)

            self._frame = PythonFrame(source, schema)
示例#15
0
def import_tensorflow(tf_path, schema=None, tc=TkContext.implicit):
    """
    Create a frame with data from a TensorFlow records file

    TensorFlow records are the standard data format for TensorFlow. The recommended format for TensorFlow is a TFRecords file
    containing tf.train.Example protocol buffers. A tf.train.Example protocol buffer encodes the data (it contains Features as a field).
    https://www.tensorflow.org/how_tos/reading_data

    During Import, the API parses TensorFlow DataTypes as below:

    * Int64List => IntegerType or LongType
    * FloatList => FloatType or DoubleType
    * Any other DataType (Ex: String) => BytesList

    Parameters
    ----------

    :param tf_path: (str) Full path to TensorFlow records
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.

    :return: frame with data from TensorFlow records

    Examples
    --------

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.count()
        20

        >>> frame.sort("rank")

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]     7  Bend                   81236            76639  6.00%   Deschutes
        [7]     8  Medford                77677            74907  3.70%   Jackson
        [8]     9  Springfield            60177            59403  1.30%   Lane
        [9]    10  Corvallis              55298            54462  1.54%   Benton

        >>> destPath = "../tests/sandbox/output26.tfr"

        >>> import os
        >>> if os.path.exists(destPath):
        ...     os.remove(destPath)

        >>> frame.export_to_tensorflow(destPath)

        >>> tf_schema=[("rank", int),("city", unicode),("population_2013", int),("population_2010", int),("change", unicode),("county", unicode)]

        >>> tf_frame = tc.frame.import_tensorflow(destPath, tf_schema)

        >>> tf_frame.count()
        20

        >>> tf_frame.sort("rank")

        >>> tf_frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]     7  Bend                   81236            76639  6.00%   Deschutes
        [7]     8  Medford                77677            74907  3.70%   Jackson
        [8]     9  Springfield            60177            59403  1.30%   Lane
        [9]    10  Corvallis              55298            54462  1.54%   Benton

    """

    if schema is not None:
        # convert each (name, type) tuple to a list -- presumably the shape
        # schema_to_scala expects here; TODO confirm
        schema_as_list_of_lists = [list(elem) for elem in schema]
        scala_frame_schema = schema_to_scala(tc.sc, schema_as_list_of_lists)
    else:
        # no schema supplied; None is wrapped as an empty Scala Option below
        scala_frame_schema = schema

    # delegate to the Scala ImportTensorflow constructor and wrap the result in a python Frame
    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.ImportTensorflow.importTensorflow(
        tc._scala_sc, tf_path,
        tc.jutils.convert.to_scala_option(scala_frame_schema))
    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
示例#16
0
 def append_csv_file(self, file_name, schema, separator=','):
     """Appends data from the given CSV file to this frame, parsing it with the given schema."""
     converted_schema = schema_to_scala(self._tc.sc, schema)
     self._scala.appendCsvFile(file_name, converted_schema, separator)
示例#17
0
def import_tensorflow(tf_path, schema=None, tc=TkContext.implicit):
    """
    Create a frame with data from a TensorFlow records file

    TensorFlow records are the standard data format for TensorFlow. The recommended format for TensorFlow is a TFRecords file
    containing tf.train.Example protocol buffers. A tf.train.Example protocol buffer encodes the data (it contains Features as a field).
    https://www.tensorflow.org/how_tos/reading_data

    During Import, the API parses TensorFlow DataTypes as below:

    * Int64List => IntegerType or LongType
    * FloatList => FloatType or DoubleType
    * Any other DataType (Ex: String) => BytesList

    Parameters
    ----------

    :param tf_path: (str) Full path to TensorFlow records
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.

    :return: frame with data from TensorFlow records

    Examples
    --------

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.count()
        20

        >>> frame.sort("rank")

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]     7  Bend                   81236            76639  6.00%   Deschutes
        [7]     8  Medford                77677            74907  3.70%   Jackson
        [8]     9  Springfield            60177            59403  1.30%   Lane
        [9]    10  Corvallis              55298            54462  1.54%   Benton

        >>> destPath = "../tests/sandbox/output26.tfr"

        >>> import os
        >>> if os.path.exists(destPath):
        ...     os.remove(destPath)

        >>> frame.export_to_tensorflow(destPath)

        >>> tf_schema=[("rank", int),("city", unicode),("population_2013", int),("population_2010", int),("change", unicode),("county", unicode)]

        >>> tf_frame = tc.frame.import_tensorflow(destPath, tf_schema)

        >>> tf_frame.count()
        20

        >>> tf_frame.sort("rank")

        >>> tf_frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]     7  Bend                   81236            76639  6.00%   Deschutes
        [7]     8  Medford                77677            74907  3.70%   Jackson
        [8]     9  Springfield            60177            59403  1.30%   Lane
        [9]    10  Corvallis              55298            54462  1.54%   Benton

    """

    if schema is not None:
        # convert each (name, type) tuple to a list -- presumably the shape
        # schema_to_scala expects here; TODO confirm
        schema_as_list_of_lists = [list(elem) for elem in schema]
        scala_frame_schema = schema_to_scala(tc.sc, schema_as_list_of_lists)
    else:
        # no schema supplied; None is wrapped as an empty Scala Option below
        scala_frame_schema = schema

    # delegate to the Scala ImportTensorflow constructor and wrap the result in a python Frame
    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.ImportTensorflow.importTensorflow(tc._scala_sc,
                                                                                                                        tf_path,
                                                                                                                        tc.jutils.convert.to_scala_option(scala_frame_schema))
    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)