def get_write_partition_to_hive_table_query(
    self,
    table_name,
    partitions=None,
    schema_to_table_name_map=None,
    partitions_in_table=False,
    append_to_partition=False,
):
    """
    Returns a Hive query string that will update the metatable to point to the data as the new partition.

    :param Text table_name: Hive table the partition is registered against.
    :param dict[Text, T] partitions: A dictionary mapping table partition key names to the values
        matching this partition.  Required (a FlyteAssertion is raised when empty/None).
    :param dict[Text, Text] schema_to_table_name_map: Mapping of names in current schema to table in which it is
        being inserted.  Currently not supported.  Must be None.
    :param bool partitions_in_table: Whether or not the partition columns exist in the data being submitted.
        Currently not supported.  Must be false.
    :param bool append_to_partition: Whether or not to append new values to a partition.
        Currently not supported.  Must be false.
    :return: Text
    """
    # Fail fast on unsupported modes before doing any formatting work.  (Previously these
    # checks ran only after a dead -- and broken -- WHERE-clause construction loop that
    # unpacked dict *keys* as (name, value) pairs and doubled the " AND " separator; that
    # unreachable code has been removed.)
    if partitions_in_table:
        raise _user_exceptions.FlyteAssertion(
            "Currently, the partition values should not be present in the schema pushed to Hive."
        )
    if append_to_partition:
        raise _user_exceptions.FlyteAssertion(
            "Currently, partitions can only be overwritten, they cannot be appended."
        )
    if not partitions:
        raise _user_exceptions.FlyteAssertion(
            "Currently, partition values MUST be specified for writing to a table."
        )

    partition_conditions = []
    for partition_name, partition_value in _six.iteritems(partitions):
        if not isinstance(partition_name, (str, _six.text_type)):
            raise _user_exceptions.FlyteTypeException(
                expected_type={str, _six.text_type},
                received_type=type(partition_name),
                received_value=partition_name,
                additional_msg="All partition names must be type str.",
            )
        if type(partition_value) not in _ALLOWED_PARTITION_TYPES:
            raise _user_exceptions.FlyteTypeException(
                expected_type=_ALLOWED_PARTITION_TYPES,
                received_type=type(partition_value),
                received_value=partition_value,
                additional_msg="Partition {name} has an unsupported type.".format(name=partition_name),
            )
        # We need string values to be quoted in the query, so let's take repr of them.
        if isinstance(partition_value, (str, _six.text_type)):
            partition_value = repr(partition_value)
        partition_conditions.append(
            "{partition_name} = {partition_value}".format(
                partition_name=partition_name, partition_value=partition_value
            )
        )
    partition_string = "PARTITION (\n\t{conditions}\n)".format(
        conditions=",\n\t".join(partition_conditions)
    )

    return _format_insert_partition_query(
        remote_location=self.remote_location,
        table_name=table_name,
        partition_string=partition_string,
    )
def compare_dataframe_to_schema(self, data_frame, column_subset=None, read=False):
    """
    Do necessary type checking of a pandas data frame.  Raise exception if it doesn't match.

    :param pandas.DateFrame data_frame: data frame to type check
    :param list[Text] column_subset: if provided, only these schema columns are checked; every entry
        must exist in the schema or a FlyteAssertion is raised.
    :param bool read: Used to alter error message for more clarity.
    """
    all_columns = list(data_frame.columns.values)
    schema_column_names = list(self.type.sdk_columns.keys())

    # Skip checking if we have a generic schema type (no specified columns)
    if not schema_column_names:
        return

    # If we specify a subset of columns, ensure they all exist and then only take those columns
    if column_subset is not None:
        failed_columns = [column for column in column_subset if column not in self.type.sdk_columns]
        if len(failed_columns) > 0:
            # Note: error text fixed from "was/where" to "was/were".
            raise _user_exceptions.FlyteAssertion(
                "{} was/were requested but could not be found in the schema: {}.".format(
                    failed_columns, self.type.sdk_columns
                )
            )
        schema_column_names = list(column_subset)

    if not all(c in all_columns for c in schema_column_names):
        raise _user_exceptions.FlyteTypeException(
            expected_type=self.type.sdk_columns,
            received_type=data_frame.columns,
            additional_msg="Mismatch between the data frame's column names {} and schema's column names {} "
            "with strict_names=True.".format(all_columns, schema_column_names),
        )

    # Loop-invariant: the literal-type -> allowed-pandas-dtypes table does not change per column,
    # so fetch it once instead of on every iteration.
    literal_to_pandas_types = get_supported_literal_types_to_pandas_types()

    # This only iterates if the Schema has specified columns.
    for name in schema_column_names:
        literal_type = self.type.sdk_columns[name].to_flyte_literal_type()
        dtype = data_frame[name].dtype
        # TODO np.issubdtype is deprecated. Replace it
        if all(
            not _np.issubdtype(dtype, allowed_type)
            for allowed_type in literal_to_pandas_types[literal_type]
        ):
            if read:
                read_or_write_msg = "read data frame object from schema"
            else:
                read_or_write_msg = "write data frame object to schema"
            raise _user_exceptions.FlyteTypeException(
                expected_type=literal_to_pandas_types[literal_type],
                received_type=dtype,
                additional_msg=(
                    "Cannot {read_write} because the types do not match. Column "
                    "'{name}' did not pass type checking. Note: If your "
                    "column contains null values, the types might not transition as expected between parquet and "
                    "pandas. For more information, see: "
                    "http://arrow.apache.org/docs/python/pandas.html#arrow-pandas-conversion".format(
                        read_write=read_or_write_msg, name=name
                    )
                ),
            )
def from_python_std(cls, literal_type, t_value, upstream_nodes=None):
    """
    Creates a BindingData object from a Python value, a workflow input, or an upstream node output.

    :param flytekit.models.types.LiteralType literal_type: target type the value must bind to.
    :param T t_value: the value (or promise) being bound.
    :param list[flytekit.common.nodes.SdkNode] upstream_nodes: [Optional] Keeps track of the nodes upstream,
        if applicable.  Mutated in place when t_value is a node output.
    :raises flytekit.common.exceptions.user.FlyteTypeException: if t_value cannot be cast/bound to literal_type.
    :rtype: BindingData
    """
    scalar = None
    collection = None
    promise = None
    map_ = None  # local renamed from `map` so the builtin is not shadowed
    downstream_sdk_type = _type_helpers.get_sdk_type_from_literal_type(literal_type)
    if isinstance(t_value, _promise.Input):
        if not downstream_sdk_type.is_castable_from(t_value.sdk_type):
            # BUG FIX: this exception was previously constructed but never raised, which
            # silently let incompatible workflow-input bindings through.
            raise _user_exceptions.FlyteTypeException(
                t_value.sdk_type,
                downstream_sdk_type,
                additional_msg="When binding workflow input: {}".format(t_value),
            )
        promise = t_value.promise
    elif isinstance(t_value, _promise.NodeOutput):
        if not downstream_sdk_type.is_castable_from(t_value.sdk_type):
            # BUG FIX: previously constructed but never raised (same defect as above).
            raise _user_exceptions.FlyteTypeException(
                t_value.sdk_type,
                downstream_sdk_type,
                additional_msg="When binding node output: {}".format(t_value),
            )
        promise = t_value
        if upstream_nodes is not None:
            upstream_nodes.append(t_value.sdk_node)
    elif isinstance(t_value, list):
        if not issubclass(downstream_sdk_type, _containers.ListImpl):
            raise _user_exceptions.FlyteTypeException(
                type(t_value),
                downstream_sdk_type,
                received_value=t_value,
                additional_msg="Cannot bind a list to a non-list type.",
            )
        # Recursively bind each element against the list's sub-type.
        collection = _literal_models.BindingDataCollection(
            [
                BindingData.from_python_std(
                    downstream_sdk_type.sub_type.to_flyte_literal_type(),
                    v,
                    upstream_nodes=upstream_nodes,
                )
                for v in t_value
            ]
        )
    elif isinstance(t_value, dict) and (
        not issubclass(downstream_sdk_type, _primitives.Generic)
        or BindingData._has_sub_bindings(t_value)
    ):
        # TODO: This behavior should be embedded in the type engine. Someone should be able to alter behavior of
        # TODO: binding logic by injecting their own type engine. The same goes for the list check above.
        raise NotImplementedError("TODO: Cannot use map bindings at the moment")
    else:
        sdk_value = downstream_sdk_type.from_python_std(t_value)
        scalar = sdk_value.scalar
        collection = sdk_value.collection
        map_ = sdk_value.map
    return cls(scalar=scalar, collection=collection, map=map_, promise=promise)