Example #1
File: frame.py Project: grehx/spark-tk
    def _python(self):
        """gets the frame backend as a _PythonFrame, converting from Scala first if necessary"""
        if self._is_scala:
            logger.info("frame._python reference: converting frame backend from Scala to Python")
            # convert the Scala Frame to a PythonFrame
            scala_schema = self._frame.schema()
            java_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(self._frame.rdd())
            python_schema = schema_to_python(self._tc.sc, scala_schema)
            python_rdd = RDD(java_rdd, self._tc.sc)
            # If the schema contains a matrix datatype, apply type_coercer to convert each list[list] cell to a numpy ndarray
            map_python_rdd = schema_is_coercible(python_rdd, list(python_schema))
            self._frame = PythonFrame(map_python_rdd, python_schema)
        else:
            logger.info("frame._python reference: frame already has a python backend")
        return self._frame
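
The only transformation applied on the Python side above is the matrix coercion step. Below is a minimal standalone sketch of what that step amounts to; the coerce_matrix_cells helper and its matrix_cols argument are hypothetical, since spark-tk's real schema_is_coercible discovers matrix columns from its own dtypes module.

import numpy as np

def coerce_matrix_cells(rows, matrix_cols):
    # hypothetical stand-in: turn list-of-lists cells in matrix columns into ndarrays
    def coerce(row):
        return [np.array(cell) if i in matrix_cols else cell
                for i, cell in enumerate(row)]
    return [coerce(row) for row in rows]  # rdd.map(coerce) on a real RDD

rows = [[1, [[1.0, 2.0], [3.0, 4.0]]]]
print(coerce_matrix_cells(rows, matrix_cols={1}))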
Example #2
File: frame.py Project: grehx/spark-tk
    def _scala(self):
        """gets the frame backend as a Scala Frame, converting from Python first if necessary"""

        if self._is_python:
            logger.info("frame._scala reference: converting frame backend from Python to Scala")
            # If the schema contains a matrix datatype, apply type_coercer_pymlib to
            # convert each numpy ndarray to a DenseMatrix so it can be serialized on the Java side
            self._frame.rdd = schema_is_coercible(self._frame.rdd, list(self._frame.schema), True)
            # convert the PythonFrame to a Scala Frame
            scala_schema = schema_to_scala(self._tc.sc, self._frame.schema)
            scala_rdd = self._tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.pythonToScala(self._frame.rdd._jrdd, scala_schema)
            self._frame = self._create_scala_frame(self._tc.sc, scala_rdd, scala_schema)
        else:
            logger.info("frame._scala reference: frame already has a scala backend")
        return self._frame
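
Taken together, _python and _scala implement one lazy, cached backend-conversion pattern: check which side currently holds the data, convert once on demand, and overwrite self._frame so later accesses are free. A minimal self-contained sketch of that pattern follows; every name in it is hypothetical and no Spark is required.

class LazyFrame(object):
    def __init__(self, rows):
        self._frame = ("python", rows)  # (backend tag, payload)

    @property
    def _is_python(self):
        return self._frame[0] == "python"

    @property
    def _scala(self):
        if self._is_python:
            # one-time conversion, cached by overwriting self._frame,
            # mirroring frame._scala above
            self._frame = ("scala", list(self._frame[1]))
        return self._frame

frame = LazyFrame([[1, "a"], [2, "b"]])
assert frame._scala[0] == "scala"    # first access converts
assert frame._scala is frame._scala  # later accesses reuse the cached frame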
Example #3
File: frame.py Project: grehx/spark-tk
    def __init__(self, tc, source, schema=None, validate_schema=False):
        """(Private constructor -- use tc.frame.create or other methods available from the TkContext)"""
        self._tc = tc
        if self._is_scala_frame(source):
            self._frame = source
        elif self._is_scala_rdd(source):
            scala_schema = schema_to_scala(tc.sc, schema)
            self._frame = self._create_scala_frame(tc.sc, source, scala_schema)
        elif self._is_scala_dataframe(source):
            self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source)
        elif isinstance(source, DataFrame):
            self._frame = self._create_scala_frame_from_scala_dataframe(tc.sc, source._jdf)
        elif isinstance(source, PythonFrame):
            self._frame = source
        else:
            if not isinstance(source, RDD):
                if not isinstance(source, list) or (len(source) > 0 and any(not isinstance(row, (list, tuple)) for row in source)):
                    raise TypeError("Invalid data source.  The data parameter must be a 2-dimensional list (list of row data) or an RDD.")

                inferred_schema = False
                if isinstance(schema, list):
                    if all(isinstance(item, basestring) for item in schema):
                        # check if schema is just a list of column names (versus string and data type tuples)
                        schema = self._infer_schema(source, schema)
                        inferred_schema = True
                    elif not all(isinstance(item, tuple) and
                                 len(item) == 2 and
                                 isinstance(item[0], basestring) for item in schema):
                        raise TypeError("Invalid schema.  Expected a list of tuples (str, type) with the column name and data type, but received type %s." % type(schema))
                    # check for duplicate column names
                    column_names = [col[0] for col in schema]
                    duplicate_column_names = set([col for col in column_names if column_names.count(col) > 1])
                    if len(duplicate_column_names) > 0:
                        raise ValueError("Invalid schema, column names cannot be duplicated: %s" % ", ".join(duplicate_column_names))
                elif schema is None:
                    schema = self._infer_schema(source)
                    inferred_schema = True
                else:
                    # Schema is not a list or None
                    raise TypeError("Invalid schema type: %s.  Expected a list of tuples (str, type) with the column name and data type." % type(schema))
                for item in schema:
                    if not self._is_supported_datatype(item[1]):
                        if inferred_schema:
                            raise TypeError("The %s data type was found when inferring the schema, and it is not a "
                                            "supported data type.  Instead, specify a schema that uses a supported data "
                                            "type, and enable validate_schema so that the data is converted to the proper "
                                            "data type.\n\nInferred schema: %s\n\nSupported data types: %s" %
                                            (str(item[1]), str(schema), dtypes.dtypes))
                        else:
                            raise TypeError("Invalid schema.  %s is not a supported data type.\n\nSupported data types: %s" %
                                            (str(item[1]), dtypes.dtypes))

                source = tc.sc.parallelize(source)
            if schema and validate_schema:
                # Validate the schema by walking the data, checking each value's type, and attempting to parse it
                validate_schema_result = self.validate_pyrdd_schema(source, schema)
                source = validate_schema_result.validated_rdd
                logger.debug("%s values could not be parsed to the schema's data type." % validate_schema_result.bad_value_count)

            # If the schema contains a matrix datatype, apply type_coercer to convert each list[list] cell to a numpy ndarray
            map_source = schema_is_coercible(source, list(schema))
            self._frame = PythonFrame(map_source, schema)
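
The docstring above points to tc.frame.create as the public entry point. Here is a hedged usage sketch, assuming a working spark-tk installation; the data and schema are made up, and validate_schema is assumed to be forwarded by create to this constructor.

from sparktk import TkContext

tc = TkContext()  # starts or attaches to a SparkContext
frame = tc.frame.create(
    [[1, "a", 2.5], [2, "b", 3.5]],
    [("id", int), ("letter", str), ("value", float)],
    validate_schema=True,  # attempt to parse each cell to its declared type
)
print(frame.schema)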