Example no. 1
def map_with_saved_model(model_dir,
                         input_columns,
                         tags,
                         signature_name=None,
                         output_keys_in_signature=None):
    """Applies a SavedModel to some columns.

  Applies a SavedModel to `input_columns`. The SavedModel is specified with
  `model_dir`, `tags` and `signature_name`. Note that the SavedModel will be
  converted to an all-constants graph, so ops requiring graph collections, such
  as table lookup (which requires a table init op being added to
  TABLE_INITIALIZERS collection), are not supported.

  Args:
    model_dir: A path containing a SavedModel.
    input_columns: Input `Column`s used as model input tensors. If the model
       signature has multiple inputs, this is a dict whose keys are the input
       names from the signature and whose values are input `Column`s. If the
       signature has only one input, this is the input `Column` itself.
    tags: The tags specifying which metagraph to load from the SavedModel.
    signature_name: The name of the signature in the loaded model to apply. The
       default value None can be used if there is only one signature in the
       MetaGraphDef.
    output_keys_in_signature: A list of strings which should be a subset of
       the outputs in the signature of the SavedModel. The returned `Column`s
       will correspond to the specified output tensors, in the same order. The
       default value None can be used if there is only one output in the
       signature.

  Returns:
    Like tft.map, returns a `Column` representing the application of the
    SavedModel.

  Raises:
    ValueError: if
    `input_columns` is an invalid type, or
    `signature_name` is None but the SavedModel contains multiple signatures, or
    `input_columns` do not match the signature inputs, or
    `output_keys_in_signature` is not a subset of the signature outputs.
  """

    if isinstance(input_columns, dict):
        # Sort input columns so the pipeline is deterministic.
        input_keys_in_signature_sorted = sorted(input_columns.keys())
        input_columns_sorted = [
            input_columns[k] for k in input_keys_in_signature_sorted
        ]
    elif isinstance(input_columns, api.Column):
        input_keys_in_signature_sorted = None
        input_columns_sorted = [input_columns]
    else:
        raise ValueError(
            'Expect "input_columns" to be dict or tft.Column but got %s.' %
            type(input_columns))
    tensor_fn = impl_helper.make_tensor_func_from_saved_model(
        model_dir,
        tags,
        signature_name=signature_name,
        input_keys_in_signature=input_keys_in_signature_sorted,
        output_keys_in_signature=output_keys_in_signature)
    return api.map(tensor_fn, *input_columns_sorted)
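A minimal usage sketch (not part of the original source): the SavedModel directory, tag, and feature names are hypothetical, the signature is assumed to have a single input and a single output so that signature_name and output_keys_in_signature can stay None, and map_with_saved_model is assumed to be in scope.

def preprocessing_fn(inputs):
    # Run the frozen SavedModel over the 'image' column; with a single
    # signature input, the column is passed directly instead of a dict.
    embedded = map_with_saved_model(
        '/tmp/exported_model', inputs['image'], tags=['serve'])
    return {'embedding': embedded}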
Example no. 2
def preprocessing_fn(inputs):
    return {
        'dense_out':
        mappers.scale_to_0_1(inputs['dense_1']),
        'sparse_out':
        api.map(lambda x: tf.sparse_reshape(x, (1, 10)),
                inputs['sparse'])
    }
Example no. 3
def size(x):
    """Computes the total size of instances in a `Column`.

  Args:
    x: An input `Column` wrapping a `Tensor`.

  Returns:
    A `Statistic`.
  """
    if not isinstance(x.tensor, tf.Tensor):
        raise TypeError('Expected a Tensor, but got %r' % x.tensor)

    # Note: Calling `sum` defined in this module, not the builtin.
    return sum(api.map(tf.ones_like, x))
Example no. 4
def scale_to_0_1(x):
    """Returns a column which is the input column scaled to have range [0,1].

  Args:
    x: A `Column` representing a numeric value.

  Returns:
    A `Column` representing the input column scaled to [0, 1].
  """

    # A TITO function that scales x.
    def scale(x, min_value, max_value):
        return (x - min_value) / (max_value - min_value)

    return api.map(scale, x, analyzers.min(x), analyzers.max(x))
Example no. 5
def size(x, reduce_instance_dims=True):
    """Computes the total size of instances in a `Column`.

  Args:
    x: An input `Column` wrapping a `Tensor`.
    reduce_instance_dims: By default collapses the batch and instance dimensions
        to arrive at a single scalar output. If False, only collapses the batch
        dimension and outputs a vector of the same shape as the input.

  Returns:
    A `Statistic`.
  """
    if not isinstance(x.tensor, tf.Tensor):
        raise TypeError('Expected a Tensor, but got %r' % x.tensor)

    # Note: Calling `sum` defined in this module, not the builtin.
    return sum(api.map(tf.ones_like, x), reduce_instance_dims)
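A hedged usage sketch of the reduce_instance_dims=False variant, assuming size is exposed through the analyzers module (as sum is in Example no. 13) and that 'x' is a fixed-shape numeric feature; all names are placeholders.

def preprocessing_fn(inputs):
    # With reduce_instance_dims=False the Statistic keeps one element count
    # per position, with the same shape as a single instance of 'x'.
    counts = analyzers.size(inputs['x'], reduce_instance_dims=False)

    # A TITO function that divides values by the per-position counts.
    def _scale_by_count(x, n):
        return tf.to_double(x) / tf.to_double(n)

    return {'x_scaled': api.map(_scale_by_count, inputs['x'], counts)}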
Example no. 6
def string_to_int(x, default_value=-1, top_k=None, frequency_threshold=None):
    """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Column` representing a string value or values.
    default_value: The value to use for out-of-vocabulary values.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      frequency is >= the supplied threshold. If set to None, the full
      vocabulary is generated.

  Returns:
    A `Column` where each unique string value is mapped to a distinct integer.
    The integers are consecutive and start from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    if top_k is not None:
        top_k = long(top_k)
        if top_k < 0:
            raise ValueError('top_k must be non-negative, but got: %r' % top_k)

    if frequency_threshold is not None:
        frequency_threshold = long(frequency_threshold)
        if frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold must be non-negative, but got: %r' %
                frequency_threshold)

    def map_to_int(x, vocab):
        table = lookup.string_to_index_table_from_tensor(
            vocab, default_value=default_value)
        return table.lookup(x)

    return api.map(
        map_to_int, x,
        analyzers.uniques(x,
                          top_k=top_k,
                          frequency_threshold=frequency_threshold))
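A short usage sketch, assuming string_to_int is exposed via the mappers module (like scale_to_0_1 in Example no. 2); the feature name 'category' and the frequency cutoff are illustrative only.

def preprocessing_fn(inputs):
    # Build a vocabulary over 'category' and replace each string with its
    # integer id; values seen fewer than 5 times map to default_value (-1).
    ids = mappers.string_to_int(inputs['category'], frequency_threshold=5)
    return {'category_id': ids}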
Example no. 7
def map_with_checkpoint(input_tensor_func,
                        input_columns,
                        checkpoint,
                        include=None,
                        exclude=None):
    """Applies a tensor-in-tensor-out function with variables to some columns.

  Variable values are loaded from the given checkpoint path. Note that the
  input_tensor_func, together with the checkpoint, will be converted to an
  all-constants graph, so ops requiring graph collections, such as table lookup
  (which requires a table init op being added to TABLE_INITIALIZERS collection),
  are not supported.

  Args:
    input_tensor_func: A tensor-in-tensor-out function that may contain
       variables.
    input_columns: A list of `Column`s to apply the `input_tensor_func` to.
    checkpoint: The checkpoint path to load variables from.
    include: An optional list/tuple of scope strings for filtering which
       variables from the VARIABLES collection to include. If None, all
       variables will be included.
    exclude: An optional list/tuple of scope strings for filtering which
       variables from the VARIABLES collection to exclude. If None, no variables
       will be excluded.

  Returns:
    Like tft.map, returns a `Column` representing the output of the
    `input_tensor_func`.

  Raises:
    ValueError: If the input tensor-in-tensor-out function adds to the
       TABLE_INITIALIZERS collection.
  """

    tensor_func = impl_helper.make_tensor_func_from_checkpoint(
        input_tensor_func, checkpoint, include, exclude)

    return api.map(tensor_func, *input_columns)
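A hedged sketch of how this helper might be used: embedding_fn, the variable shape, the checkpoint path, and the integer column 'token_ids' are all hypothetical, and map_with_checkpoint is assumed to be in scope; the point is only to show a variable-bearing function being frozen against a checkpoint.

def embedding_fn(ids):
    # This variable is restored from the checkpoint and baked into the
    # all-constants graph by make_tensor_func_from_checkpoint.
    embeddings = tf.get_variable('embeddings', shape=[1000, 16])
    return tf.nn.embedding_lookup(embeddings, ids)

def preprocessing_fn(inputs):
    embedded = map_with_checkpoint(
        embedding_fn, [inputs['token_ids']], '/tmp/model.ckpt-1234')
    return {'embedded': embedded}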
Example no. 8
def scale_by_min_max(x, output_min=0.0, output_max=1.0):
    """Scale a numerical column into the range [output_min, output_max].

  Args:
    x: A `Column` representing a numeric value.
    output_min: The minimum of the range of output values.
    output_max: The maximum of the range of output values.

  Returns:
    A `Column` representing the input column scaled to [output_min, output_max].

  Raises:
    ValueError: If output_min, output_max have the wrong order.
  """
    if output_min >= output_max:
        raise ValueError('output_min must be less than output_max')

    # A TITO function that scales x.
    def _scale(x, min_x_value, max_x_value):
        return ((((x - min_x_value) * (output_max - output_min)) /
                 (max_x_value - min_x_value)) + output_min)

    return api.map(_scale, x, analyzers.min(x), analyzers.max(x))
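A minimal usage sketch in the style of Example no. 2, assuming a numeric feature named 'age' (a placeholder) and that the mapper is exposed via the mappers module.

def preprocessing_fn(inputs):
    # Rescale 'age' into [-1, 1] using the dataset-wide min and max that
    # scale_by_min_max computes via analyzers.min and analyzers.max.
    scaled = mappers.scale_by_min_max(
        inputs['age'], output_min=-1.0, output_max=1.0)
    return {'age_scaled': scaled}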
Example no. 9
def preprocessing_fn(inputs):
  return {
      'x_sum': api.map(tf.reduce_sum, inputs['x']),
  }
Example no. 10
def preprocessing_fn(inputs):
  return {
      'z': api.map(lambda x, y: x * y + x + y,
                   inputs['x'], analyzers.min(inputs['y']))
  }
Example no. 11
def preprocessing_fn(inputs):
  return {
      'z': api.map(tf.sparse_add, inputs['x'], inputs['y'])
  }
Example no. 12
def string_to_int(x, default_value=-1, top_k=None, frequency_threshold=None):
    """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Column` representing a string value or values.
    default_value: The value to use for out-of-vocabulary values.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      frequency is >= the supplied threshold. If set to None, the full
      vocabulary is generated.

  Returns:
    A `Column` where each unique string value is mapped to a distinct integer.
    The integers are consecutive and start from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    if top_k is not None:
        top_k = long(top_k)
        if top_k < 0:
            raise ValueError('top_k must be non-negative, but got: %r' % top_k)

    if frequency_threshold is not None:
        frequency_threshold = long(frequency_threshold)
        if frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold must be non-negative, but got: %r' %
                frequency_threshold)

    def map_to_int(x, vocab):
        """Maps string tensor into indexes using vocab.

    It uses a dummy vocab when the input vocab is empty.

    Args:
      x: A Tensor/SparseTensor of strings.
      vocab: A Tensor/SparseTensor containing unique string values within x.

    Returns:
      a Tensor/SparseTensor of indexes (int) of the same shape as x.
    """
        def fix_vocab_if_needed(vocab):
            num_to_add = 1 - tf.minimum(tf.size(vocab), 1)
            return tf.concat([
                vocab,
                tf.fill(tf.reshape(num_to_add,
                                   (1, )), '__dummy_value__index_zero__')
            ], 0)

        table = lookup.string_to_index_table_from_tensor(
            fix_vocab_if_needed(vocab), default_value=default_value)
        return table.lookup(x)

    return api.map(
        map_to_int, x,
        analyzers.uniques(x,
                          top_k=top_k,
                          frequency_threshold=frequency_threshold))
Example no. 13
def tfidf_weights(x, vocab_size):
    """Maps the terms in x to their (1/doc_length) * inverse document frequency.

  Args:
    x: A `Column` representing int64 values (most likely the result of calling
        string_to_int on a tokenized string).
    vocab_size: An int; the size of the vocabulary used to turn the strings
        into int64s, including any OOV buckets.

  Returns:
    A `Column` where each int value is mapped to a double equal to
    (1 if that term appears in that row, 0 otherwise / the number of terms in
    that row) * the log of (the number of rows in `x` / (1 + the number of
    rows in `x` where the term appears at least once))

  NOTE:
    This is intended to be used with the feature_column 'sum' combiner to arrive
    at the true term frequencies.
  """
    def _map_to_vocab_range(x):
        """Enforces that the vocab_ids in x are positive."""
        return tf.SparseTensor(indices=x.indices,
                               values=tf.mod(x.values, vocab_size),
                               dense_shape=x.dense_shape)

    def _map_to_doc_contains_term(x):
        """Creates a SparseTensor with 1s at every doc/term pair index.

    Args:
      x : a SparseTensor of int64 representing string indices in vocab.

    Returns:
      a SparseTensor with 1s at indices <doc_index_in_batch>,
          <term_index_in_vocab> for every term/doc pair.
    """
        # Construct intermediary sparse tensor with indices
        # [<doc>, <term_index_in_doc>, <vocab_id>] and tf.ones values.
        split_indices = tf.to_int64(
            tf.split(x.indices, axis=1, num_or_size_splits=2))
        expanded_values = tf.to_int64(tf.expand_dims(x.values, 1))
        next_index = tf.concat(
            [split_indices[0], split_indices[1], expanded_values], axis=1)

        next_values = tf.ones_like(x.values)
        vocab_size_as_tensor = tf.constant([vocab_size], dtype=tf.int64)
        next_shape = tf.concat([x.dense_shape, vocab_size_as_tensor], 0)

        next_tensor = tf.SparseTensor(indices=tf.to_int64(next_index),
                                      values=next_values,
                                      dense_shape=next_shape)

        # Take the intermediary tensor and reduce over the term_index_in_doc
        # dimension. This produces a tensor with indices [<doc_id>, <term_id>]
        # and values [count_of_term_in_doc] and shape batch x vocab_size.
        term_count_per_doc = tf.sparse_reduce_sum_sparse(next_tensor, 1)

        one_if_doc_contains_term = tf.SparseTensor(
            indices=term_count_per_doc.indices,
            values=tf.to_double(tf.greater(term_count_per_doc.values, 0)),
            dense_shape=term_count_per_doc.dense_shape)

        return one_if_doc_contains_term

    def _map_to_tfidf(x, reduced_term_freq, corpus_size):
        """Calculates the inverse document frequency of terms in the corpus.

    Args:
      x : a SparseTensor of int64 representing string indices in vocab.
      reduced_term_freq: A dense tensor of shape (vocab_size,) that represents
          the count of documents containing each term.
      corpus_size: A scalar count of the number of documents in the corpus.

    Returns:
      The tf*idf values
    """
        # Add one to the reduced term frequencies to avoid dividing by zero.
        idf = tf.log(
            tf.to_double(corpus_size) /
            (1.0 + tf.to_double(reduced_term_freq)))

        dense_doc_sizes = tf.to_double(
            tf.sparse_reduce_sum(
                tf.SparseTensor(indices=x.indices,
                                values=tf.ones_like(x.values),
                                dense_shape=x.dense_shape), 1))

        # For every term in x, divide the idf by the doc size.
        # The two gathers both result in shape <sum_doc_sizes>
        idf_over_doc_size = (tf.gather(idf, x.values) /
                             tf.gather(dense_doc_sizes, x.indices[:, 0]))

        return tf.SparseTensor(indices=x.indices,
                               values=idf_over_doc_size,
                               dense_shape=x.dense_shape)

    cleaned_input = api.map(_map_to_vocab_range, x)

    docs_with_terms = api.map(_map_to_doc_contains_term, cleaned_input)

    def count_docs_with_term(term_frequency):
        # Sum w/in batch.
        count_of_doc_inter = tf.SparseTensor(
            indices=term_frequency.indices,
            values=tf.ones_like(term_frequency.values),
            dense_shape=term_frequency.dense_shape)
        out = tf.sparse_reduce_sum(count_of_doc_inter, axis=0)
        return tf.expand_dims(out, 0)

    count_docs_with_term_column = api.map(count_docs_with_term,
                                          docs_with_terms)

    # Expand dims to get around the min_tensor_rank checks
    sizes = api.map(lambda y: tf.expand_dims(tf.shape(y)[0], 0), cleaned_input)

    return api.map(
        _map_to_tfidf, cleaned_input,
        analyzers.sum(count_docs_with_term_column, reduce_instance_dims=False),
        analyzers.sum(sizes))
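A hedged end-to-end sketch combining this mapper with string_to_int from Example no. 6, both assumed to live in the mappers module; the feature name 'terms' (assumed to be a SparseTensor column of string tokens), the fixed vocabulary size, and the out-of-vocabulary handling are illustrative simplifications.

def preprocessing_fn(inputs):
    vocab_size = 10000  # assumed cap on the vocabulary (top_k below)
    # Map tokenized strings to int64 ids, then weight each id by
    # (1 / doc_length) * inverse document frequency.
    term_ids = mappers.string_to_int(inputs['terms'], top_k=vocab_size)
    return {
        'term_ids': term_ids,
        'term_weights': mappers.tfidf_weights(term_ids, vocab_size)
    }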