def __init__(self, tc, source, schema=None, validate_schema=False):
    self._tc = tc
    if self._is_scala_frame(source):
        self._frame = source
    elif self.is_scala_rdd(source):
        scala_schema = schema_to_scala(tc.sc, schema)
        self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
    else:
        if not isinstance(source, RDD):
            if isinstance(schema, list):
                if all(isinstance(item, basestring) for item in schema):
                    # check if schema is just a list of column names (versus string and data type tuples)
                    schema = self._infer_schema(source, schema)
                elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema):
                    raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                else:
                    for item in schema:
                        if not self._is_supported_datatype(item[1]):
                            raise TypeError("Invalid schema. %s is not a supported data type." % str(item[1]))
            elif schema is None:
                schema = self._infer_schema(source)
            else:
                # Schema is not a list or None
                raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
            source = tc.sc.parallelize(source)
        if schema and validate_schema:
            # Validate schema by going through the data and checking the data type and attempting to parse it
            validate_schema_result = self.validate_pyrdd_schema(source, schema)
            source = validate_schema_result.validated_rdd
            logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
        self._frame = PythonFrame(source, schema)

def __init__(self, tc, source, schema=None, validate_schema=False):
    self._tc = tc
    if self._is_scala_frame(source):
        self._frame = source
    elif self.is_scala_rdd(source):
        scala_schema = schema_to_scala(tc.sc, schema)
        self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
    elif self.is_scala_dataframe(source):
        self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source)
    elif isinstance(source, DataFrame):
        self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
    elif isinstance(source, PythonFrame):
        self._frame = source
    else:
        if not isinstance(source, RDD):
            if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
            inferred_schema = False
            if isinstance(schema, list):
                if all(isinstance(item, basestring) for item in schema):
                    # check if schema is just a list of column names (versus string and data type tuples)
                    schema = self._infer_schema(source, schema)
                    inferred_schema = True
                elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema):
                    raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                # check for duplicate column names
                column_names = [col[0] for col in schema]
                duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
                if len(duplicate_column_names) > 0:
                    raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
            elif schema is None:
                schema = self._infer_schema(source)
                inferred_schema = True
            else:
                # Schema is not a list or None
                raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
            for item in schema:
                if not self._is_supported_datatype(item[1]):
                    if inferred_schema:
                        raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                        "supported data type. Instead, specify a schema that uses a supported data "
                                        "type, and enable validate_schema so that the data is converted to the proper "
                                        "data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
                                        (str(item[1]), str(schema), dtypes.dtypes))
                    else:
                        raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
                                        (str(item[1]), dtypes.dtypes))
            source = tc.sc.parallelize(source)
        if schema and validate_schema:
            # Validate schema by going through the data and checking the data type and attempting to parse it
            validate_schema_result = self.validate_pyrdd_schema(source, schema)
            source = validate_schema_result.validated_rdd
            logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
        self._frame = PythonFrame(source, schema)

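def _example_create_frame(tc):
    """Hedged usage sketch, not part of the original module: frames are normally built
    through tc.frame.create(...), which routes into the constructor above. The row data
    here is made up for illustration; the three schema forms shown are the ones the
    constructor accepts."""
    rows = [["alice", 29], ["bob", 35]]
    # 1) full schema: a list of (column name, data type) tuples
    frame_explicit = tc.frame.create(rows, schema=[("name", str), ("age", int)])
    # 2) column names only: data types are inferred from the data
    frame_named = tc.frame.create(rows, schema=["name", "age"])
    # 3) no schema: column names and data types are both inferred
    frame_inferred = tc.frame.create(rows)
    return frame_explicit, frame_named, frame_inferred
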
def _scala(self): """gets frame backend as Scala Frame, causes conversion if it is current not""" if self._is_python: # convert PythonFrame to a Scala Frame""" scala_schema = schema_to_scala(self._tc.sc, self._frame.schema) scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema) self._frame = self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema) return self._frame
def _scala(self): """gets frame backend as Scala Frame, causes conversion if it is current not""" if self._is_python: # If schema contains matrix dataype, # then apply type_coercer_pymlib to convert ndarray to pymlib DenseMatrix for serialization purpose at java self._frame.rdd = MatrixCoercion.schema_is_coercible(self._frame.rdd, list(self._frame.schema), True) # convert PythonFrame to a Scala Frame""" scala_schema = schema_to_scala(self._tc.sc, self._frame.schema) scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema) self._frame = self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema) return self._frame
def __init__(self, tc, source, schema=None):
    self._tc = tc
    if self._is_scala_frame(source):
        self._frame = source
    elif self.is_scala_rdd(source):
        scala_schema = schema_to_scala(tc.sc, schema)
        self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
    else:
        if not isinstance(source, RDD):
            source = tc.sc.parallelize(source)
        if schema:
            self.validate_pyrdd_schema(source, schema)
        self._frame = PythonFrame(source, schema)

def _scala(self): """gets frame backend as Scala Frame, causes conversion if it is current not""" if self._is_python: # If schema contains matrix dataype, # then apply type_coercer_pymlib to convert ndarray to pymlib DenseMatrix for serialization purpose at java self._frame.rdd = MatrixCoercion.schema_is_coercible( self._frame.rdd, list(self._frame.schema), True) # convert PythonFrame to a Scala Frame""" scala_schema = schema_to_scala(self._tc.sc, self._frame.schema) scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala( self._frame.rdd._jrdd, scala_schema) self._frame = self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema) return self._frame
def rename_columns(self, names):
    """
    Rename columns

    Parameters
    ----------

    :param names: (dict) Dictionary of old names to new names.

    Examples
    --------
    Start with a frame with columns *Black* and *White*.

    <hide>
    >>> s = [('Black', unicode), ('White', unicode)]
    >>> rows = [["glass", "clear"], ["paper", "unclear"]]
    >>> my_frame = tc.frame.create(rows, s)
    -etc-
    </hide>

    >>> print my_frame.schema
    [('Black', <type 'unicode'>), ('White', <type 'unicode'>)]

    Rename the columns to *Mercury* and *Venus*:

    >>> my_frame.rename_columns({"Black": "Mercury", "White": "Venus"})

    >>> print my_frame.schema
    [(u'Mercury', <type 'unicode'>), (u'Venus', <type 'unicode'>)]

    """
    if not isinstance(names, dict):
        raise ValueError("Unsupported 'names' parameter type. Expected dictionary, but found %s." % type(names))
    if self.schema is None:
        raise RuntimeError("Unable to rename column(s), because the frame's schema has not been defined.")
    if self._is_python:
        scala_rename_map = self._tc.jutils.convert.to_scala_map(names)
        scala_schema = schema_to_scala(self._tc.sc, self._python.schema)
        rename_scala_schema = scala_schema.renameColumns(scala_rename_map)
        self._python.schema = schema_to_python(self._tc.sc, rename_scala_schema)
    else:
        self._scala.renameColumns(self._tc.jutils.convert.to_scala_map(names))

def _frame_to_scala(self, python_frame):
    """converts a PythonFrame to a Scala Frame"""
    scala_schema = schema_to_scala(self._tc.sc, python_frame.schema)
    scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.rdd.PythonJavaRdd.pythonToScala(python_frame.rdd._jrdd, scala_schema)
    return self.create_scala_frame(self._tc.sc, scala_rdd, scala_schema)

def append_csv_file(self, file_name, schema, separator=','):
    self._scala.appendCsvFile(file_name, schema_to_scala(self._tc.sc, schema), separator)

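def _example_append_csv(frame):
    """Hedged usage sketch, not part of the original module: appends rows from a CSV file
    to an existing frame via append_csv_file above. The path and schema are made up for
    illustration; the schema uses the same list-of-(name, type)-tuples form as elsewhere
    in this module, and separator defaults to ','."""
    schema = [("rank", int), ("city", unicode)]
    frame.append_csv_file("../datasets/more_cities.csv", schema, separator='|')
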
def __init__(self, tc, source, schema=None, validate_schema=False): """(Private constructor -- use tc.frame.create or other methods available from the TkContext)""" self._tc = tc if self._is_scala_frame(source): self._frame = source elif self._is_scala_rdd(source): scala_schema = schema_to_scala(tc.sc, schema) self._frame = self._create_scala_frame(tc.sc, source, scala_schema) elif self._is_scala_dataframe(source): self._frame = self._create_scala_frame_from_scala_dataframe( tc.sc, source) elif isinstance(source, DataFrame): self._frame = self._create_scala_frame_from_scala_dataframe( tc.sc, source._jdf) elif isinstance(source, PythonFrame): self._frame = source else: if not isinstance(source, RDD): if not isinstance(source, list) or (len(source) > 0 and any( not isinstance(row, (list, tuple)) for row in source)): raise TypeError( "Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD." ) inferred_schema = False if isinstance(schema, list): if all(isinstance(item, basestring) for item in schema): # check if schema is just a list of column names (versus string and data type tuples) schema = self._infer_schema(source, schema) inferred_schema = True elif not all( isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema): raise TypeError( "Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema)) # check for duplicate column names column_names = [col[0] for col in schema] duplicate_column_names = set([ col for col in column_names if column_names.count(col) > 1 ]) if len(duplicate_column_names) > 0: raise ValueError( "Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names)) elif schema is None: schema = self._infer_schema(source) inferred_schema = True else: # Schema is not a list or None raise TypeError( "Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema)) for item in schema: if not self._is_supported_datatype(item[1]): if inferred_schema: raise TypeError( "The %s data type was found when inferring the schema, and it is not a " "supported data type. Instead, specify a schema that uses a supported data " "type, and enable validate_schema so that the data is converted to the proper " "data type.\n\nInferred schema: %s\n\nSupported data types: %s" % (str(item[1]), str(schema), dtypes.dtypes)) else: raise TypeError( "Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" % (str(item[1]), dtypes.dtypes)) source = tc.sc.parallelize(source) if schema and validate_schema: # Validate schema by going through the data and checking the data type and attempting to parse it validate_schema_result = self.validate_pyrdd_schema( source, schema) source = validate_schema_result.validated_rdd logger.debug( "%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count) # If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray map_source = schema_is_coercible(source, list(schema)) self._frame = PythonFrame(map_source, schema)
def __init__(self, tc, source, schema=None, validate_schema=False):
    self._tc = tc
    if self._is_scala_frame(source):
        self._frame = source
    elif self.is_scala_rdd(source):
        scala_schema = schema_to_scala(tc.sc, schema)
        self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
    elif self.is_scala_dataframe(source):
        self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source)
    elif isinstance(source, DataFrame):
        self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
    elif isinstance(source, PythonFrame):
        self._frame = source
    else:
        if not isinstance(source, RDD):
            if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
            inferred_schema = False
            if isinstance(schema, list):
                if all(isinstance(item, basestring) for item in schema):
                    # check if schema is just a list of column names (versus string and data type tuples)
                    schema = self._infer_schema(source, schema)
                    inferred_schema = True
                elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema):
                    raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
            elif schema is None:
                schema = self._infer_schema(source)
                inferred_schema = True
            else:
                # Schema is not a list or None
                raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
            for item in schema:
                if not self._is_supported_datatype(item[1]):
                    if inferred_schema:
                        raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                        "supported data type. Instead, specify a schema that uses a supported data "
                                        "type, and enable validate_schema so that the data is converted to the proper "
                                        "data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
                                        (str(item[1]), str(schema), dtypes.dtypes))
                    else:
                        raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
                                        (str(item[1]), dtypes.dtypes))
            source = tc.sc.parallelize(source)
        if schema and validate_schema:
            # Validate schema by going through the data and checking the data type and attempting to parse it
            validate_schema_result = self.validate_pyrdd_schema(source, schema)
            source = validate_schema_result.validated_rdd
            logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
        self._frame = PythonFrame(source, schema)

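def _example_validate_schema(tc):
    """Hedged usage sketch, not part of the original module: with validate_schema=True the
    constructor above runs validate_pyrdd_schema, which attempts to parse each value to its
    column's declared data type, keeps the validated RDD, and logs the count of values that
    could not be parsed. The row data below is made up for illustration."""
    rows = [["alice", "29"], ["bob", "not-a-number"]]
    return tc.frame.create(rows, schema=[("name", str), ("age", int)], validate_schema=True)
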
def import_tensorflow(tf_path, schema=None, tc=TkContext.implicit):
    """
    Create a frame with data from a TensorFlow records file

    TensorFlow records are the standard data format for TensorFlow. The recommended format is a
    TFRecords file containing tf.train.Example protocol buffers (which contain Features as a field).
    https://www.tensorflow.org/how_tos/reading_data

    During import, the API parses TensorFlow DataTypes as below:

    * Int64List => IntegerType or LongType
    * FloatList => FloatType or DoubleType
    * Any other DataType (Ex: String) => BytesList

    Parameters
    ----------

    :param tf_path: (str) Full path to TensorFlow records
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.

    :return: frame with data from TensorFlow records

    Examples
    --------

    >>> file_path = "../datasets/cities.csv"

    >>> frame = tc.frame.import_csv(file_path, "|", header=True)
    -etc-

    >>> frame.count()
    20

    >>> frame.sort("rank")

    >>> frame.inspect()
    [#]  rank  city         population_2013  population_2010  change  county
    ==========================================================================
    [0]  1     Portland     609456           583776           4.40%   Multnomah
    [1]  2     Salem        160614           154637           3.87%   Marion
    [2]  3     Eugene       159190           156185           1.92%   Lane
    [3]  4     Gresham      109397           105594           3.60%   Multnomah
    [4]  5     Hillsboro    97368            91611            6.28%   Washington
    [5]  6     Beaverton    93542            89803            4.16%   Washington
    [6]  7     Bend         81236            76639            6.00%   Deschutes
    [7]  8     Medford      77677            74907            3.70%   Jackson
    [8]  9     Springfield  60177            59403            1.30%   Lane
    [9]  10    Corvallis    55298            54462            1.54%   Benton

    >>> destPath = "../tests/sandbox/output26.tfr"

    >>> import os
    >>> if os.path.exists(destPath):
    ...     os.remove(destPath)

    >>> frame.export_to_tensorflow(destPath)

    >>> tf_schema = [("rank", int), ("city", unicode), ("population_2013", int), ("population_2010", int), ("change", unicode), ("county", unicode)]

    >>> tf_frame = tc.frame.import_tensorflow(destPath, tf_schema)

    >>> tf_frame.count()
    20

    >>> tf_frame.sort("rank")

    >>> tf_frame.inspect()
    [#]  rank  city         population_2013  population_2010  change  county
    ==========================================================================
    [0]  1     Portland     609456           583776           4.40%   Multnomah
    [1]  2     Salem        160614           154637           3.87%   Marion
    [2]  3     Eugene       159190           156185           1.92%   Lane
    [3]  4     Gresham      109397           105594           3.60%   Multnomah
    [4]  5     Hillsboro    97368            91611            6.28%   Washington
    [5]  6     Beaverton    93542            89803            4.16%   Washington
    [6]  7     Bend         81236            76639            6.00%   Deschutes
    [7]  8     Medford      77677            74907            3.70%   Jackson
    [8]  9     Springfield  60177            59403            1.30%   Lane
    [9]  10    Corvallis    55298            54462            1.54%   Benton

    """
    if schema is not None:
        schema_as_list_of_lists = [list(elem) for elem in schema]
        scala_frame_schema = schema_to_scala(tc.sc, schema_as_list_of_lists)
    else:
        scala_frame_schema = schema

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.ImportTensorflow.importTensorflow(
        tc._scala_sc, tf_path, tc.jutils.convert.to_scala_option(scala_frame_schema))

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
