def map_with_saved_model(model_dir, input_columns, tags, signature_name=None,
                         output_keys_in_signature=None):
  """Applies a SavedModel to some columns.

  Applies a SavedModel to `input_columns`. The SavedModel is specified with
  `model_dir`, `tags` and `signature_name`. Note that the SavedModel will be
  converted to an all-constants graph, so ops requiring graph collections,
  such as table lookup (which requires a table init op being added to the
  TABLE_INITIALIZERS collection), are not supported.

  Args:
    model_dir: A path containing a SavedModel.
    input_columns: Input `Column`s used as model input tensors. If the model
      signature has multiple inputs, this must be a dict mapping input names
      from the signature to input `Column`s. If the signature has a single
      input, this must be that input `Column` itself.
    tags: The tags specifying which metagraph to load from the SavedModel.
    signature_name: The name of the signature to use from the loaded model.
      The default value None can be used if there is only one signature in
      the MetaGraphDef.
    output_keys_in_signature: A list of strings which should be a subset of
      the outputs in the signature of the SavedModel. The returned `Column`s
      will correspond to the specified output tensors, in the same order. The
      default value None can be used if there is only one output in the
      signature.

  Returns:
    Like tft.map, returns a `Column` representing the application of the
    SavedModel.

  Raises:
    ValueError: If `input_columns` has an invalid type, or `signature_name`
      is None but the SavedModel contains multiple signatures, or
      `input_columns` do not match the signature inputs, or
      `output_keys_in_signature` is not a subset of the signature outputs.
  """
  if isinstance(input_columns, dict):
    # Sort input columns so the pipeline is deterministic.
    input_keys_in_signature_sorted = sorted(input_columns.keys())
    input_columns_sorted = [
        input_columns[k] for k in input_keys_in_signature_sorted
    ]
  elif isinstance(input_columns, api.Column):
    input_keys_in_signature_sorted = None
    input_columns_sorted = [input_columns]
  else:
    raise ValueError(
        'Expected "input_columns" to be a dict or tft.Column but got %s.' %
        type(input_columns))

  tensor_fn = impl_helper.make_tensor_func_from_saved_model(
      model_dir,
      tags,
      signature_name=signature_name,
      input_keys_in_signature=input_keys_in_signature_sorted,
      output_keys_in_signature=output_keys_in_signature)

  return api.map(tensor_fn, *input_columns_sorted)
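# A minimal usage sketch of map_with_saved_model (all names below are
# placeholders: a hypothetical SavedModel exported at '/tmp/my_model' with a
# single-input, single-output serving signature, applied to a feature named
# 'text'):
def preprocessing_fn(inputs):
  return {
      'model_out': map_with_saved_model(
          '/tmp/my_model', inputs['text'], tags=['serve'])
  }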
def preprocessing_fn(inputs):
  return {
      'dense_out': mappers.scale_to_0_1(inputs['dense_1']),
      'sparse_out': api.map(lambda x: tf.sparse_reshape(x, (1, 10)),
                            inputs['sparse'])
  }
def size(x):
  """Computes the total size of instances in a `Column`.

  Args:
    x: An input `Column` wrapping a `Tensor`.

  Returns:
    A `Statistic`.
  """
  if not isinstance(x.tensor, tf.Tensor):
    raise TypeError('Expected a Tensor, but got %r' % x.tensor)
  # Note: Calling `sum` defined in this module, not the builtin.
  return sum(api.map(tf.ones_like, x))
def scale_to_0_1(x):
  """Returns a column which is the input column scaled to have range [0, 1].

  Args:
    x: A `Column` representing a numeric value.

  Returns:
    A `Column` representing the input column scaled to [0, 1].
  """

  # A TITO (tensor-in, tensor-out) function that scales x.
  def scale(x, min_value, max_value):
    return (x - min_value) / (max_value - min_value)

  return api.map(scale, x, analyzers.min(x), analyzers.max(x))
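# A minimal usage sketch (the feature name 'age' is a placeholder):
def preprocessing_fn(inputs):
  return {'age_scaled': mappers.scale_to_0_1(inputs['age'])}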
def size(x, reduce_instance_dims=True):
  """Computes the total size of instances in a `Column`.

  Args:
    x: An input `Column` wrapping a `Tensor`.
    reduce_instance_dims: By default collapses the batch and instance
      dimensions to arrive at a single scalar output. If False, only
      collapses the batch dimension and outputs a vector of the same shape
      as the input.

  Returns:
    A `Statistic`.
  """
  if not isinstance(x.tensor, tf.Tensor):
    raise TypeError('Expected a Tensor, but got %r' % x.tensor)
  # Note: Calling `sum` defined in this module, not the builtin.
  return sum(api.map(tf.ones_like, x), reduce_instance_dims)
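# A minimal usage sketch (assuming a float-valued feature named 'x'; the
# feature name is a placeholder): divide each value by the total number of
# values seen in the dataset.
def preprocessing_fn(inputs):
  return {
      'x_over_size': api.map(lambda x, n: x / n,
                             inputs['x'], mappers.size(inputs['x']))
  }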
def string_to_int(x, default_value=-1, top_k=None, frequency_threshold=None):
  """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Column` representing a string value or values.
    default_value: The value to use for out-of-vocabulary values.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If
      set to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements
      whose frequency is >= to the supplied threshold. If set to None, the
      full vocabulary is generated.

  Returns:
    A `Column` where each string value is mapped to an integer; each unique
    string value is mapped to a different integer, and the integers are
    consecutive starting from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
  if top_k is not None:
    # Coerce to an integer (this was long() in the original Python 2 code).
    top_k = int(top_k)
    if top_k < 0:
      raise ValueError('top_k must be non-negative, but got: %r' % top_k)

  if frequency_threshold is not None:
    frequency_threshold = int(frequency_threshold)
    if frequency_threshold < 0:
      raise ValueError(
          'frequency_threshold must be non-negative, but got: %r' %
          frequency_threshold)

  def map_to_int(x, vocab):
    table = lookup.string_to_index_table_from_tensor(
        vocab, default_value=default_value)
    return table.lookup(x)

  return api.map(
      map_to_int, x,
      analyzers.uniques(x, top_k=top_k,
                        frequency_threshold=frequency_threshold))
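# A minimal usage sketch (the feature name and thresholds are placeholders):
def preprocessing_fn(inputs):
  return {
      'color_id': mappers.string_to_int(
          inputs['color'], top_k=1000, frequency_threshold=5)
  }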
def map_with_checkpoint(input_tensor_func, input_columns, checkpoint,
                        include=None, exclude=None):
  """Applies a tensor-in-tensor-out function with variables to some columns.

  Variable values are loaded from the given checkpoint path. Note that the
  input_tensor_func, together with the checkpoint, will be converted to an
  all-constants graph, so ops requiring graph collections, such as table
  lookup (which requires a table init op being added to the
  TABLE_INITIALIZERS collection), are not supported.

  Args:
    input_tensor_func: A tensor-in-tensor-out function that may contain
      variables.
    input_columns: A list of `Column`s to apply the `input_tensor_func` to.
    checkpoint: The checkpoint path to load variables from.
    include: An optional list/tuple of scope strings for filtering which
      variables from the VARIABLES collection to include. If None, all
      variables will be included.
    exclude: An optional list/tuple of scope strings for filtering which
      variables from the VARIABLES collection to exclude. If None, no
      variables will be excluded.

  Returns:
    Like tft.map, returns a `Column` representing the output of the
    `input_tensor_func`.

  Raises:
    ValueError: If the input tensor-in-tensor-out function adds to the
      TABLE_INITIALIZERS collection.
  """
  tensor_func = impl_helper.make_tensor_func_from_checkpoint(
      input_tensor_func, checkpoint, include, exclude)
  return api.map(tensor_func, *input_columns)
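# A minimal usage sketch (all names below are placeholders: a hypothetical
# checkpoint at '/tmp/model.ckpt' containing a variable named 'embedding',
# applied to an int64 feature 'word_id'):
def preprocessing_fn(inputs):

  def lookup_embedding(ids):
    # The variable name must match the one stored in the checkpoint.
    embedding = tf.get_variable('embedding', shape=[1000, 16])
    return tf.gather(embedding, ids)

  return {
      'word_embedding': map_with_checkpoint(
          lookup_embedding, [inputs['word_id']], '/tmp/model.ckpt')
  }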
def scale_by_min_max(x, output_min=0.0, output_max=1.0):
  """Scales a numerical column into the range [output_min, output_max].

  Args:
    x: A `Column` representing a numeric value.
    output_min: The minimum of the range of output values.
    output_max: The maximum of the range of output values.

  Returns:
    A `Column` representing the input column scaled to
    [output_min, output_max].

  Raises:
    ValueError: If output_min >= output_max.
  """
  if output_min >= output_max:
    raise ValueError('output_min must be less than output_max')

  # A TITO function that scales x.
  def _scale(x, min_x_value, max_x_value):
    return ((((x - min_x_value) * (output_max - output_min)) /
             (max_x_value - min_x_value)) + output_min)

  return api.map(_scale, x, analyzers.min(x), analyzers.max(x))
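# A minimal usage sketch (the feature name and output range are
# placeholders):
def preprocessing_fn(inputs):
  return {
      'price_scaled': mappers.scale_by_min_max(
          inputs['price'], output_min=-1.0, output_max=1.0)
  }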
def preprocessing_fn(inputs):
  return {
      'x_sum': api.map(tf.reduce_sum, inputs['x']),
  }
def preprocessing_fn(inputs):
  return {
      'z': api.map(lambda x, y: x * y + x + y,
                   inputs['x'], analyzers.min(inputs['y']))
  }
def preprocessing_fn(inputs):
  return {
      'z': api.map(tf.sparse_add, inputs['x'], inputs['y'])
  }
def string_to_int(x, default_value=-1, top_k=None, frequency_threshold=None):
  """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Column` representing a string value or values.
    default_value: The value to use for out-of-vocabulary values.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If
      set to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements
      whose frequency is >= to the supplied threshold. If set to None, the
      full vocabulary is generated.

  Returns:
    A `Column` where each string value is mapped to an integer; each unique
    string value is mapped to a different integer, and the integers are
    consecutive starting from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
  if top_k is not None:
    # Coerce to an integer (this was long() in the original Python 2 code).
    top_k = int(top_k)
    if top_k < 0:
      raise ValueError('top_k must be non-negative, but got: %r' % top_k)

  if frequency_threshold is not None:
    frequency_threshold = int(frequency_threshold)
    if frequency_threshold < 0:
      raise ValueError(
          'frequency_threshold must be non-negative, but got: %r' %
          frequency_threshold)

  def map_to_int(x, vocab):
    """Maps a string tensor into indexes using the given vocab.

    Uses a dummy vocab when the input vocab is empty.

    Args:
      x: A `Tensor`/`SparseTensor` of strings.
      vocab: A `Tensor`/`SparseTensor` containing the unique string values
        within x.

    Returns:
      A `Tensor`/`SparseTensor` of indexes (int) of the same shape as x.
    """

    def fix_vocab_if_needed(vocab):
      # If the vocab is empty, pad it with a single dummy value so that
      # table construction does not fail.
      num_to_add = 1 - tf.minimum(tf.size(vocab), 1)
      return tf.concat([
          vocab,
          tf.fill(tf.reshape(num_to_add, (1,)),
                  '__dummy_value__index_zero__')
      ], 0)

    table = lookup.string_to_index_table_from_tensor(
        fix_vocab_if_needed(vocab), default_value=default_value)
    return table.lookup(x)

  return api.map(
      map_to_int, x,
      analyzers.uniques(x, top_k=top_k,
                        frequency_threshold=frequency_threshold))
def tfidf_weights(x, vocab_size):
  """Maps the terms in x to their (1/doc_length) * inverse document frequency.

  Args:
    x: A `Column` representing int64 values (most likely the result of
      calling string_to_int on a tokenized string).
    vocab_size: An int - the size of the vocabulary used to turn the strings
      into int64s, including any OOV buckets.

  Returns:
    A `Column` where each int value is mapped to a double equal to
    (1 if that term appears in that row, 0 otherwise / the number of terms
    in that row) * the log of (the number of rows in `x` / (1 + the number
    of rows in `x` where the term appears at least once)).

  NOTE: This is intended to be used with the feature_column 'sum' combiner
  to arrive at the true term frequencies.
  """

  def _map_to_vocab_range(x):
    """Enforces that the vocab_ids in x are positive."""
    return tf.SparseTensor(
        indices=x.indices,
        values=tf.mod(x.values, vocab_size),
        dense_shape=x.dense_shape)

  def _map_to_doc_contains_term(x):
    """Creates a SparseTensor with 1s at every doc/term pair index.

    Args:
      x: A SparseTensor of int64 representing string indices in vocab.

    Returns:
      A SparseTensor with 1s at indices
      <doc_index_in_batch>, <term_index_in_vocab> for every term/doc pair.
    """
    # Construct an intermediary sparse tensor with indices
    # [<doc>, <term_index_in_doc>, <vocab_id>] and tf.ones values.
    split_indices = tf.to_int64(
        tf.split(x.indices, axis=1, num_or_size_splits=2))
    expanded_values = tf.to_int64(tf.expand_dims(x.values, 1))
    next_index = tf.concat(
        [split_indices[0], split_indices[1], expanded_values], axis=1)

    next_values = tf.ones_like(x.values)
    vocab_size_as_tensor = tf.constant([vocab_size], dtype=tf.int64)
    next_shape = tf.concat([x.dense_shape, vocab_size_as_tensor], 0)

    next_tensor = tf.SparseTensor(
        indices=tf.to_int64(next_index),
        values=next_values,
        dense_shape=next_shape)

    # Take the intermediary tensor and reduce over the term_index_in_doc
    # dimension. This produces a tensor with indices [<doc_id>, <term_id>],
    # values [count_of_term_in_doc], and shape batch x vocab_size.
    term_count_per_doc = tf.sparse_reduce_sum_sparse(next_tensor, 1)

    one_if_doc_contains_term = tf.SparseTensor(
        indices=term_count_per_doc.indices,
        values=tf.to_double(tf.greater(term_count_per_doc.values, 0)),
        dense_shape=term_count_per_doc.dense_shape)

    return one_if_doc_contains_term

  def _map_to_tfidf(x, reduced_term_freq, corpus_size):
    """Calculates the inverse document frequency of terms in the corpus.

    Args:
      x: A SparseTensor of int64 representing string indices in vocab.
      reduced_term_freq: A dense tensor of shape (vocab_size,) that
        represents the count of the number of documents with each term.
      corpus_size: A scalar count of the number of documents in the corpus.

    Returns:
      The tf*idf values.
    """
    # Add one to the reduced term frequencies to avoid dividing by zero.
    idf = tf.log(
        tf.to_double(corpus_size) / (1.0 + tf.to_double(reduced_term_freq)))

    dense_doc_sizes = tf.to_double(
        tf.sparse_reduce_sum(
            tf.SparseTensor(
                indices=x.indices,
                values=tf.ones_like(x.values),
                dense_shape=x.dense_shape), 1))

    # For every term in x, divide the idf by the doc size.
    # The two gathers both result in shape <sum_doc_sizes>.
    idf_over_doc_size = (tf.gather(idf, x.values) /
                         tf.gather(dense_doc_sizes, x.indices[:, 0]))

    return tf.SparseTensor(
        indices=x.indices,
        values=idf_over_doc_size,
        dense_shape=x.dense_shape)

  cleaned_input = api.map(_map_to_vocab_range, x)

  docs_with_terms = api.map(_map_to_doc_contains_term, cleaned_input)

  def count_docs_with_term(term_frequency):
    # Sum within the batch.
    count_of_doc_inter = tf.SparseTensor(
        indices=term_frequency.indices,
        values=tf.ones_like(term_frequency.values),
        dense_shape=term_frequency.dense_shape)
    out = tf.sparse_reduce_sum(count_of_doc_inter, axis=0)
    return tf.expand_dims(out, 0)

  count_docs_with_term_column = api.map(count_docs_with_term,
                                        docs_with_terms)

  # Expand dims to get around the min_tensor_rank checks.
  sizes = api.map(lambda y: tf.expand_dims(tf.shape(y)[0], 0), cleaned_input)

  return api.map(
      _map_to_tfidf, cleaned_input,
      analyzers.sum(count_docs_with_term_column, reduce_instance_dims=False),
      analyzers.sum(sizes))
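# A minimal usage sketch (the feature name and vocabulary size are
# placeholders; vocab_size should match the vocabulary produced by
# string_to_int): map tokenized strings to ints, then weight each term by
# (1/doc_length) * idf.
def preprocessing_fn(inputs):
  term_ids = mappers.string_to_int(inputs['terms'])
  return {
      'term_ids': term_ids,
      'tfidf': mappers.tfidf_weights(term_ids, vocab_size=1000)
  }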