Example #1
    def get_write_partition_to_hive_table_query(
        self,
        table_name,
        partitions=None,
        schema_to_table_name_map=None,
        partitions_in_table=False,
        append_to_partition=False,
    ):
        """
        Returns a Hive query string that will update the Hive metastore to point to the data as the new partition.

        :param Text table_name: Name of the Hive table the partition is written to.
        :param dict[Text, T] partitions: A dictionary mapping table partition key names to the values matching this
            partition.
        :param dict[Text, Text] schema_to_table_name_map: Mapping of names in the current schema to the table into
            which it is being inserted.  Currently not supported.  Must be None.
        :param bool partitions_in_table: Whether or not the partition columns exist in the data being submitted.
            Currently not supported.  Must be False.
        :param bool append_to_partition: Whether or not to append new values to a partition.  Currently not supported.
            Must be False.
        :return: Text
        """
        partition_string = ""
        where_string = ""
        identity_dict = {n: n for n in _six.iterkeys(self.type.sdk_columns)}
        identity_dict.update(schema_to_table_name_map or {})
        schema_to_table_name_map = identity_dict
        table_to_schema_name_map = {
            v: k
            for k, v in _six.iteritems(schema_to_table_name_map)
        }

        if partitions:
            partition_conditions = []
            for partition_name, partition_value in _six.iteritems(partitions):
                if not isinstance(partition_name, (str, _six.text_type)):
                    raise _user_exceptions.FlyteTypeException(
                        expected_type={str, _six.text_type},
                        received_type=type(partition_name),
                        received_value=partition_name,
                        additional_msg="All partition names must be type str.",
                    )
                if type(partition_value) not in _ALLOWED_PARTITION_TYPES:
                    raise _user_exceptions.FlyteTypeException(
                        expected_type=_ALLOWED_PARTITION_TYPES,
                        received_type=type(partition_value),
                        received_value=partition_value,
                        additional_msg=
                        "Partition {name} has an unsupported type.".format(
                            name=partition_name),
                    )

                # We need the string to be quoted in the query, so let's take repr of it.
                if isinstance(partition_value, (str, _six.text_type)):
                    partition_value = repr(partition_value)
                partition_conditions.append(
                    "{partition_name} = {partition_value}".format(
                        partition_name=partition_name,
                        partition_value=partition_value))
            partition_formatter = "PARTITION (\n\t{conditions}\n)"
            partition_string = partition_formatter.format(
                conditions=",\n\t".join(partition_conditions))

        if partitions_in_table and partitions:
            where_clauses = []
            for partition_name, partition_value in _six.iteritems(partitions):
                where_clauses.append(
                    "\n\t\t{schema_name} = {value_str}".format(
                        schema_name=table_to_schema_name_map[partition_name],
                        value_str=partition_value,
                    ))
            where_string = "WHERE\n\t\t{where_clauses}".format(
                where_clauses=" AND\n\t\t".join(where_clauses))

        if where_string or partitions_in_table:
            raise _user_exceptions.FlyteAssertion(
                "Currently, the partition values should not be present in the schema pushed to Hive."
            )
        if append_to_partition:
            raise _user_exceptions.FlyteAssertion(
                "Currently, partitions can only be overwritten, they cannot be appended."
            )
        if not partitions:
            raise _user_exceptions.FlyteAssertion(
                "Currently, partition values MUST be specified for writing to a table."
            )

        return _format_insert_partition_query(
            remote_location=self.remote_location,
            table_name=table_name,
            partition_string=partition_string,
        )
Example #2
    def compare_dataframe_to_schema(self,
                                    data_frame,
                                    column_subset=None,
                                    read=False):
        """
        Do necessary type checking of a pandas data frame.  Raise an exception if it doesn't match.

        :param pandas.DataFrame data_frame: data frame to type check
        :param list[Text] column_subset: Optional subset of column names to restrict the check to.
        :param bool read: Used to alter the error message for more clarity.
        """
        all_columns = list(data_frame.columns.values)
        schema_column_names = list(self.type.sdk_columns.keys())

        # Skip checking if we have a generic schema type (no specified columns)
        if not schema_column_names:
            return

        # If we specify a subset of columns, ensure they all exist and then only take those columns
        if column_subset is not None:
            schema_column_names = []
            failed_columns = []
            for column in column_subset:
                if column not in self.type.sdk_columns:
                    failed_columns.append(column)
                else:
                    schema_column_names.append(column)

            if len(failed_columns) > 0:
                raise _user_exceptions.FlyteAssertion(
                    "{} were requested but could not be found in the schema: {}."
                    .format(failed_columns, self.type.sdk_columns))

        if not all(c in all_columns for c in schema_column_names):
            raise _user_exceptions.FlyteTypeException(
                expected_type=self.type.sdk_columns,
                received_type=data_frame.columns,
                additional_msg=
                "Mismatch between the data frame's column names {} and schema's column names {} "
                "with strict_names=True.".format(all_columns,
                                                 schema_column_names),
            )

        # This only iterates if the Schema has specified columns.
        for name in schema_column_names:
            literal_type = self.type.sdk_columns[name].to_flyte_literal_type()
            dtype = data_frame[name].dtype

            # TODO np.issubdtype is deprecated. Replace it
            if all(not _np.issubdtype(dtype, allowed_type) for allowed_type in
                   get_supported_literal_types_to_pandas_types()
                   [literal_type]):
                if read:
                    read_or_write_msg = "read data frame object from schema"
                else:
                    read_or_write_msg = "write data frame object to schema"
                additional_msg = (
                    "Cannot {read_write} because the types do not match. Column "
                    "'{name}' did not pass type checking.  Note: If your "
                    "column contains null values, the types might not transition as expected between parquet and "
                    "pandas.  For more information, see: "
                    "http://arrow.apache.org/docs/python/pandas.html#arrow-pandas-conversion"
                    .format(read_write=read_or_write_msg, name=name))
                raise _user_exceptions.FlyteTypeException(
                    expected_type=get_supported_literal_types_to_pandas_types(
                    )[literal_type],
                    received_type=dtype,
                    additional_msg=additional_msg,
                )
Example #3
    def from_python_std(cls, literal_type, t_value, upstream_nodes=None):
        """
        :param flytekit.models.types.LiteralType literal_type:
        :param T t_value:
        :param list[flytekit.common.nodes.SdkNode] upstream_nodes: [Optional] Keeps track of the nodes upstream,
            if applicable.
        :rtype: BindingData
        """
        scalar = None
        collection = None
        promise = None
        map = None
        downstream_sdk_type = _type_helpers.get_sdk_type_from_literal_type(
            literal_type)
        if isinstance(t_value, _promise.Input):
            if not downstream_sdk_type.is_castable_from(t_value.sdk_type):
                raise _user_exceptions.FlyteTypeException(
                    t_value.sdk_type,
                    downstream_sdk_type,
                    additional_msg="When binding workflow input: {}".format(
                        t_value),
                )
            promise = t_value.promise
        elif isinstance(t_value, _promise.NodeOutput):
            if not downstream_sdk_type.is_castable_from(t_value.sdk_type):
                raise _user_exceptions.FlyteTypeException(
                    t_value.sdk_type,
                    downstream_sdk_type,
                    additional_msg="When binding node output: {}".format(
                        t_value),
                )
            promise = t_value
            if upstream_nodes is not None:
                upstream_nodes.append(t_value.sdk_node)
        elif isinstance(t_value, list):
            if not issubclass(downstream_sdk_type, _containers.ListImpl):
                raise _user_exceptions.FlyteTypeException(
                    type(t_value),
                    downstream_sdk_type,
                    received_value=t_value,
                    additional_msg="Cannot bind a list to a non-list type.",
                )
            collection = _literal_models.BindingDataCollection([
                BindingData.from_python_std(
                    downstream_sdk_type.sub_type.to_flyte_literal_type(),
                    v,
                    upstream_nodes=upstream_nodes,
                ) for v in t_value
            ])
        elif isinstance(t_value, dict) and (
                not issubclass(downstream_sdk_type, _primitives.Generic)
                or BindingData._has_sub_bindings(t_value)):
            # TODO: This behavior should be embedded in the type engine.  Someone should be able to alter behavior of
            # TODO: binding logic by injecting their own type engine.  The same goes for the list check above.
            raise NotImplementedError(
                "TODO: Cannot use map bindings at the moment")
        else:
            sdk_value = downstream_sdk_type.from_python_std(t_value)
            scalar = sdk_value.scalar
            collection = sdk_value.collection
            map = sdk_value.map
        return cls(scalar=scalar,
                   collection=collection,
                   map=map,
                   promise=promise)
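A minimal sketch of calling the classmethod above for its simplest branch, where a plain Python value is bound directly; it assumes flytekit's literal type models are importable as referenced in the docstring, and the integer type is used purely for illustration.

from flytekit.models.types import LiteralType, SimpleType

# Hypothetical usage: bind a plain Python integer (not an Input, NodeOutput, list, or dict).
binding = BindingData.from_python_std(LiteralType(simple=SimpleType.INTEGER), 42)
# The value falls through to the final else-branch, so the resulting BindingData is
# expected to carry a scalar while promise, collection, and map stay unset.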