def _python(self):
    """gets frame backend as PythonFrame, causes conversion if it is not currently Python"""
    if self._is_scala:
        logger.info("frame._python reference: converting frame backend from Scala to Python")
        # convert Scala Frame to a PythonFrame
        scala_schema = self._frame.schema()
        java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
        python_schema = schema_to_python(self._tc.sc, scala_schema)
        python_rdd = RDD(java_rdd, self._tc.sc)
        # If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
        map_python_rdd = schema_is_coercible(python_rdd, list(python_schema))
        self._frame = PythonFrame(map_python_rdd, python_schema)
    else:
        logger.info("frame._python reference: frame already has a python backend")
    return self._frame
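# Hedged sketch: schema_is_coercible (used above) conceptually maps each row,
# replacing list-of-lists values in matrix-typed columns with numpy ndarrays.
# The names coerce_matrix_columns and matrix_column_indices below are
# illustrative assumptions, not sparktk's actual implementation.
def coerce_matrix_columns(rdd, matrix_column_indices):
    import numpy as np

    def coerce_row(row):
        # convert list[list] to ndarray only for matrix-typed columns
        return [np.array(value) if i in matrix_column_indices else value
                for i, value in enumerate(row)]

    return rdd.map(coerce_row)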
def _scala(self):
    """gets frame backend as Scala Frame, causes conversion if it is not currently Scala"""
    if self._is_python:
        logger.info("frame._scala reference: converting frame backend from Python to Scala")
        # If schema contains matrix datatype, then apply type_coercer_pymlib to convert
        # ndarray to pymlib DenseMatrix for serialization purposes on the Java side
        self._frame.rdd = schema_is_coercible(self._frame.rdd, list(self._frame.schema), True)
        # convert PythonFrame to a Scala Frame
        scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
        scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
        self._frame = self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
    else:
        logger.info("frame._scala reference: frame already has a scala backend")
    return self._frame
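# Hedged sketch of the lazy dual-backend pattern implemented by _python and
# _scala above: the frame payload is stored in exactly one representation at a
# time and converted only when the other backend is requested. This standalone
# analogue (with hypothetical to_python/to_scala converter callables) is
# illustrative, not sparktk code.
class _DualBackendSketch(object):
    def __init__(self, payload, is_python, to_python, to_scala):
        self._payload = payload
        self._is_python = is_python
        self._to_python = to_python
        self._to_scala = to_scala

    @property
    def python(self):
        # convert once, on demand, then cache the converted payload
        if not self._is_python:
            self._payload = self._to_python(self._payload)
            self._is_python = True
        return self._payload

    @property
    def scala(self):
        if self._is_python:
            self._payload = self._to_scala(self._payload)
            self._is_python = False
        return self._payload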
def __init__(self, tc, source, schema=None, validate_schema=False):
    """(Private constructor -- use tc.frame.create or other methods available from the TkContext)"""
    self._tc = tc
    if self._is_scala_frame(source):
        self._frame = source
    elif self._is_scala_rdd(source):
        scala_schema = schema_to_scala(tc.sc, schema)
        self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
    elif self._is_scala_dataframe(source):
        self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source)
    elif isinstance(source, DataFrame):
        self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
    elif isinstance(source, PythonFrame):
        self._frame = source
    else:
        if not isinstance(source, RDD):
            if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                raise TypeError("Invalid data source. The data parameter must be a 2-dimensional list (list of row data) or an RDD.")
            inferred_schema = False
            if isinstance(schema, list):
                if all(isinstance(item, basestring) for item in schema):
                    # check if schema is just a list of column names (versus string and data type tuples)
                    schema = self._infer_schema(source, schema)
                    inferred_schema = True
                elif not all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], basestring) for item in schema):
                    raise TypeError("Invalid schema. Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                # check for duplicate column names
                column_names = [col[0] for col in schema]
                duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
                if len(duplicate_column_names) > 0:
                    raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
            elif schema is None:
                schema = self._infer_schema(source)
                inferred_schema = True
            else:
                # schema is not a list or None
                raise TypeError("Invalid schema type: %s. Expected a list of tuples (str, type) with the column name and data type." % type(schema))
            for item in schema:
                if not self._is_supported_datatype(item[1]):
                    if inferred_schema:
                        raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                        "supported data type. Instead, specify a schema that uses a supported data "
                                        "type, and enable validate_schema so that the data is converted to the proper "
                                        "data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
                                        (str(item[1]), str(schema), dtypes.dtypes))
                    else:
                        raise TypeError("Invalid schema. %s is not a supported data type.\n\nSupported data types: %s" %
                                        (str(item[1]), dtypes.dtypes))
            source = tc.sc.parallelize(source)
        if schema and validate_schema:
            # Validate schema by going through the data, checking the data type, and attempting to parse it
            validate_schema_result = self.validate_pyrdd_schema(source, schema)
            source = validate_schema_result.validated_rdd
            logger.debug("%s values were unable to be parsed to the schema's data type." % validate_schema_result.bad_value_count)
        # If schema contains matrix datatype, then apply type_coercer to convert list[list] to numpy ndarray
        map_source = schema_is_coercible(source, list(schema))
        self._frame = PythonFrame(map_source, schema)
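# Usage sketch: this constructor is private; frames are normally created
# through the TkContext, as the docstring above notes. Assuming a running
# TkContext `tc`, creation from a 2-dimensional list with an explicit schema
# looks roughly like this (data values are illustrative):
#
#     data = [[1, "a", 2.5], [2, "b", 3.0]]
#     schema = [("id", int), ("letter", str), ("score", float)]
#     frame = tc.frame.create(data, schema, validate_schema=True)
#
# Passing just column names (schema=["id", "letter", "score"]) or schema=None
# takes the _infer_schema path above instead.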