def __init__(self, tc, source, schema=None, validate_schema=False): self._tc = tc if self._is_scala_frame(source): self._frame = source elif self.is_scala_rdd(source): scala_schema = schema_to_scala(tc.sc, schema) self._frame = self.create_scala_frame(tc.sc, source, scala_schema) else: if not isinstance(source, RDD): if isinstance(schema, list): if all(isinstance(item, basestring) for item in schema): # check if schema is just a list of column names (versus string and data type tuples) schema = self._infer_schema(source, schema) elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], str) for item in schema): raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type." % type(schema)) else: for item in schema: if not self._is_supported_datatype(item[1]): raise TypeError("Invalid schema. %s is not a supported data type." % str(item[1])) elif schema is None: schema = self._infer_schema(source) else: # Schema is not a list or None raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema)) source = tc.sc.parallelize(source) if schema and validate_schema: # Validate schema by going through the data and checking the data type and attempting to parse it validate_schema_result = self.validate_pyrdd_schema(source, schema) source = validate_schema_result.validated_rdd logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count) self._frame = PythonFrame(source, schema)
def _python(self): """gets frame backend as _PythonFrame, causes conversion if it is current not""" if self._is_scala: # convert Scala Frame to a PythonFrame""" scala_schema = self._frame.schema() java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd()) python_schema = schema_to_python(self._tc.sc, scala_schema) python_rdd = RDD(java_rdd, self._tc.sc) self._frame = PythonFrame(python_rdd, python_schema) return self._frame
def __init__(self, tc, source, schema=None): self._tc = tc if self._is_scala_frame(source): self._frame = source elif self.is_scala_rdd(source): scala_schema = schema_to_scala(tc.sc, schema) self._frame = self.create_scala_frame(tc.sc, source, scala_schema) else: if not isinstance(source, RDD): source = tc.sc.parallelize(source) if schema: self.validate_pyrdd_schema(source, schema) self._frame = PythonFrame(source, schema)
def _python(self): """gets frame backend as _PythonFrame, causes conversion if it is current not""" if self._is_scala: # convert Scala Frame to a PythonFrame""" scala_schema = self._frame.schema() java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython( self._frame.rdd()) python_schema = schema_to_python(self._tc.sc, scala_schema) python_rdd = RDD(java_rdd, self._tc.sc) # If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray map_python_rdd = MatrixCoercion.schema_is_coercible( python_rdd, list(python_schema)) self._frame = PythonFrame(map_python_rdd, python_schema) return self._frame
def __init__(self, tc, source, schema=None, validate_schema=False):
    """
    (Private constructor -- use tc.frame.create or other methods available from the TkContext)

    Builds the frame backend from one of several source kinds: an existing
    Scala Frame, a Scala RDD, a Scala or Python DataFrame, an existing
    PythonFrame, a Python RDD, or a 2-dimensional list of row data.

    :param tc: TkContext
    :param source: the frame data (one of the kinds listed above)
    :param schema: list of (name, type) tuples, a list of column names (data
                   types are then inferred from the data), or None (both
                   names and types are inferred)
    :param validate_schema: when True, each value is checked against the
                            schema's data type and parsed where possible
    :raises TypeError: malformed source/schema or unsupported data type
    :raises ValueError: duplicated column names in the schema
    """
    self._tc = tc
    if self._is_scala_frame(source):
        # Already a Scala Frame -- adopt it as-is.
        self._frame = source
    elif self._is_scala_rdd(source):
        # Scala RDD: wrap in a new Scala Frame using the converted schema.
        scala_schema = schema_to_scala(tc.sc, schema)
        self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
    elif self._is_scala_dataframe(source):
        self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source)
    elif isinstance(source, DataFrame):
        # Python DataFrame: hand its underlying Java DataFrame to the Scala side.
        self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
    elif isinstance(source, PythonFrame):
        self._frame = source
    else:
        # Python data path: a plain RDD or a 2-dimensional list of rows.
        if not isinstance(source, RDD):
            if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
            # Track whether the schema came from inference, so the unsupported
            # data type error below can give the appropriate guidance.
            inferred_schema = False
            if isinstance(schema, list):
                if all(isinstance(item, basestring) for item in schema):
                    # check if schema is just a list of column names (versus string and data type tuples)
                    schema = self._infer_schema(source, schema)
                    inferred_schema = True
                elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema):
                    raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                # check for duplicate column names
                column_names = [col[0] for col in schema]
                duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
                if len(duplicate_column_names) > 0:
                    raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
            elif schema is None:
                schema = self._infer_schema(source)
                inferred_schema = True
            else:
                # Schema is not a list or None
                raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
            # Reject unsupported data types, whether user-supplied or inferred.
            for item in schema:
                if not self._is_supported_datatype(item[1]):
                    if inferred_schema:
                        raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                        "supported data type. Instead, specify a schema that uses a supported data "
                                        "type, and enable validate_schema so that the data is converted to the proper "
                                        "data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
                                        (str(item[1]), str(schema), dtypes.dtypes))
                    else:
                        raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
                                        (str(item[1]), dtypes.dtypes))
            source = tc.sc.parallelize(source)
        if schema and validate_schema:
            # Validate schema by going through the data and checking the data type and attempting to parse it
            validate_schema_result = self.validate_pyrdd_schema(source, schema)
            source = validate_schema_result.validated_rdd
            logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
        # If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
        map_source = schema_is_coercible(source, list(schema))
        self._frame = PythonFrame(map_source, schema)
def __init__(self, tc, source, schema=None, validate_schema=False):
    """
    Build the frame backend from the given source data.

    Accepts an existing Scala Frame, a Scala RDD, a Scala or Python
    DataFrame, an existing PythonFrame, a Python RDD, or a 2-dimensional
    list of row data.

    :param tc: TkContext
    :param source: the frame data (one of the kinds listed above)
    :param schema: list of (name, type) tuples, a list of column names (data
                   types are then inferred from the data), or None (both
                   names and types are inferred)
    :param validate_schema: when True, each value is checked against the
                            schema's data type and parsed where possible
    :raises TypeError: malformed source/schema or unsupported data type
    """
    self._tc = tc
    if self._is_scala_frame(source):
        # Already a Scala Frame -- adopt it as-is.
        self._frame = source
    elif self.is_scala_rdd(source):
        # Scala RDD: wrap in a new Scala Frame using the converted schema.
        scala_schema = schema_to_scala(tc.sc, schema)
        self._frame = self.create_scala_frame(tc.sc, source, scala_schema)
    elif self.is_scala_dataframe(source):
        self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source)
    elif isinstance(source, DataFrame):
        # Python DataFrame: hand its underlying Java DataFrame to the Scala side.
        self._frame = self.create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
    elif isinstance(source, PythonFrame):
        self._frame = source
    else:
        # Python data path: a plain RDD or a 2-dimensional list of rows.
        if not isinstance(source, RDD):
            if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
            # Track whether the schema came from inference, so the unsupported
            # data type error below can give the appropriate guidance.
            inferred_schema = False
            if isinstance(schema, list):
                if all(isinstance(item, basestring) for item in schema):
                    # check if schema is just a list of column names (versus string and data type tuples)
                    schema = self._infer_schema(source, schema)
                    inferred_schema = True
                elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema):
                    raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
            elif schema is None:
                schema = self._infer_schema(source)
                inferred_schema = True
            else:
                # Schema is not a list or None
                raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
            # Reject unsupported data types, whether user-supplied or inferred.
            for item in schema:
                if not self._is_supported_datatype(item[1]):
                    if inferred_schema:
                        raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                        "supported data type. Instead, specify a schema that uses a supported data "
                                        "type, and enable validate_schema so that the data is converted to the proper "
                                        "data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
                                        (str(item[1]), str(schema), dtypes.dtypes))
                    else:
                        raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
                                        (str(item[1]), dtypes.dtypes))
            source = tc.sc.parallelize(source)
        if schema and validate_schema:
            # Validate schema by going through the data and checking the data type and attempting to parse it
            validate_schema_result = self.validate_pyrdd_schema(source, schema)
            source = validate_schema_result.validated_rdd
            logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
        self._frame = PythonFrame(source, schema)