Example #1
def _example_serving_receiver_fn(
        tf_transform_output: tft.TFTransformOutput, schema: schema_pb2.Schema,
        label_key: Text) -> tf.estimator.export.ServingInputReceiver:
    """Build the serving in inputs.

  Parameters
  ----------
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns
  -------
    Tensorflow graph which parses examples, applying tf-transform to them.

  """
    # Pull out the feature spec and throw out the label
    raw_feature_spec = _get_raw_feature_spec(schema)
    raw_feature_spec.pop(label_key)

    # Define the raw inputs taken from the user
    receiver_tensors = {}
    for key in raw_feature_spec:
        absl.logging.info("KEY {}".format(key))
        dtype = raw_feature_spec[key].dtype
        receiver_tensors[key] = tf.compat.v1.placeholder(dtype=dtype,
                                                         shape=[None],
                                                         name='input_' + key)

    # Define the inputs into the graph
    features = {}
    for key in receiver_tensors:
        batch_size = tf.shape(receiver_tensors[key])[0]
        indices = tf.cast(tf.expand_dims(tf.range(batch_size), -1), tf.int64)
        zeros = tf.zeros_like(indices)
        indices = tf.concat([indices, zeros], axis=1)
        features[key] = tf.SparseTensor(indices=indices,
                                        values=receiver_tensors[key],
                                        dense_shape=[batch_size, 1])

    # Transform the features.
    transformed_features = tf_transform_output.transform_raw_features(features)
    return tf.estimator.export.ServingInputReceiver(transformed_features,
                                                    receiver_tensors)
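A minimal usage sketch for the receiver above, assuming a tf.estimator-based trainer: the function is wrapped in a zero-argument closure and handed to export_saved_model. The estimator, schema, paths and the 'tips' label key below are hypothetical placeholders, not part of the example.

import tensorflow as tf
import tensorflow_transform as tft

# Hypothetical wiring; `estimator` and `schema` come from the surrounding
# trainer code, and the paths / label key are placeholders.
tf_transform_output = tft.TFTransformOutput('/tmp/tft_output')
serving_receiver_fn = lambda: _example_serving_receiver_fn(
    tf_transform_output, schema, label_key='tips')
estimator.export_saved_model('/tmp/serving_model', serving_receiver_fn)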
Example #2
def _input_fn(
        file_pattern: Text,
        tf_transform_output: tft.TFTransformOutput,
        batch_size: int = 2) -> tf.data.Dataset:
    """Builds a dataset of (features, labels) tuples from transformed examples."""
    transformed_feature_spec = (
            tf_transform_output.transformed_feature_spec().copy())

    dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern=file_pattern,
            batch_size=batch_size,
            # Shuffle must be False so the two zipped map branches below
            # iterate over the records in the same order
            shuffle=False,
            features=transformed_feature_spec,
            reader=_gzip_reader_fn)

    train_dataset = dataset.map(create_training_data)
    label_dataset = dataset.map(create_label_data)
    dataset = tf.data.Dataset.zip((train_dataset, label_dataset))

    return dataset
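The helpers create_training_data and create_label_data are not shown above. A minimal sketch of what they might look like, assuming a transformed label feature named 'label_xf' (a hypothetical name, not taken from the example):

_LABEL_XF = 'label_xf'  # assumed transformed label key

def create_training_data(features):
    # Keep every transformed feature except the label.
    return {key: value for key, value in features.items() if key != _LABEL_XF}

def create_label_data(features):
    # Return only the label tensor.
    return features[_LABEL_XF]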
Example #3
def make_serving_signatures(model,
                            tf_transform_features: tft.TFTransformOutput,
                            serving_batch_size: Optional[int] = None):
    """Returns the serving signatures.

  Args:
    model: the model function to apply to the transformed features.
    tf_transform_features: The transformation to apply to the serialized
      tf.Example.
    serving_batch_size: an optional specification for a concrete serving batch
      size.

  Returns:
    The signatures to use for saving the model. The 'serving_default' signature
    will be a concrete function that takes a serialized tf.Example, parses it,
    transforms the features and then applies the model.
  """

    model.tft_layer = tf_transform_features.transform_features_layer()

    @tf.function
    def serve_tf_examples_fn(serialized_tf_examples):
        """Returns the output to be used in the serving signature."""
        feature_spec = tf_transform_features.raw_feature_spec()
        feature_spec.pop(_LABEL_KEY)
        parsed_features = tf.io.parse_example(serialized_tf_examples,
                                              feature_spec)

        transformed_features = model.tft_layer(parsed_features)

        return model(transformed_features)

    return {
        'serving_default':
        serve_tf_examples_fn.get_concrete_function(
            tf.TensorSpec(shape=[serving_batch_size],
                          dtype=tf.string,
                          name='examples'))
    }
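A sketch of how the returned signatures are typically used when saving a Keras model in a TFX run_fn; the model object and serving directory below are placeholders:

signatures = make_serving_signatures(model, tf_transform_output,
                                     serving_batch_size=None)
model.save('/tmp/serving_model', save_format='tf', signatures=signatures)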
Example #4
    def input_fn(self, file_pattern: List[Text],
                 tf_transform_output: tft.TFTransformOutput):
        """Builds a dataset of transformed features, dropping the labels."""

        xf_feature_spec = tf_transform_output.transformed_feature_spec()

        xf_feature_spec = {
            x: xf_feature_spec[x]
            for x in xf_feature_spec if x.endswith('_xf')
        }

        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern=file_pattern,
            batch_size=self.batch_size,
            features=xf_feature_spec,
            reader=self._gzip_reader_fn,
            num_epochs=self.epochs)

        def split_inputs_labels(x):
            inputs = {}
            labels = {}
            for e in x:
                if not e.startswith('label'):
                    inputs[e] = x[e]
                else:
                    labels[e] = x[e]

            labels = {
                label[len('label_'):-len('_xf')]: labels[label]
                for label in labels.keys()
            }

            return inputs, labels

        dataset = dataset.map(split_inputs_labels)

        # Keep only the inputs; the labels are dropped for this feedforward
        # use case.
        dataset = dataset.map(lambda x, y: x)

        return dataset
Example #5
    def input_fn(self, file_pattern: List[Text],
                 tf_transform_output: tft.TFTransformOutput):
        """
        Feedforward input_fn for loading data from TFRecords saved to a
        location on disk.

        Args:
            file_pattern: File pattern matching saved TFRecords on disk.
            tf_transform_output: Output of the preceding Transform /
             Preprocessing component.

        Returns:
            dataset: tf.data.Dataset created out of the input files.
        """

        xf_feature_spec = tf_transform_output.transformed_feature_spec()

        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern=file_pattern,
            batch_size=self.batch_size,
            features=xf_feature_spec,
            reader=self._gzip_reader_fn,
            num_epochs=1,
            drop_final_batch=True)

        def split_columns(x):
            inputs = {}
            labels = {}
            for e in x:
                if not naming_utils.check_if_transformed_label(e):
                    inputs[e] = x[e]
                else:
                    labels[e] = x[e]
            return inputs, labels

        dataset = dataset.map(split_columns)

        return dataset
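naming_utils.check_if_transformed_label is project-specific and not shown here. A plausible sketch, assuming labels follow the 'label_*_xf' naming convention from example #4 (an assumption, not confirmed by the source):

def check_if_transformed_label(feature_name):
    # Hypothetical helper: treat any transformed feature whose name starts
    # with 'label' and ends with the '_xf' suffix as a label column.
    return feature_name.startswith('label') and feature_name.endswith('_xf')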
Example #6
def _input_fn(file_pattern: List[Text],
              tf_transform_output: tft.TFTransformOutput,
              label_key: str,
              batch_size: int = 200) -> tf.data.Dataset:
    """Generates features and label for tuning/training.

    Args:
        file_pattern: List of paths or patterns of input tfrecord files.
        tf_transform_output: A TFTransformOutput.
        label_key: label key.
        batch_size: the number of consecutive elements of the returned dataset
          to combine in a single batch.

    Returns:
        A dataset that contains (features, indices) tuple where features is a
        dictionary of Tensors, and indices is a single Tensor of label indices.
    """
    transformed_feature_spec = (
        tf_transform_output.transformed_feature_spec().copy())

    dataset = tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=transformed_feature_spec,
        reader=_gzip_reader_fn,
        label_key=_transformed_name(label_key))

    # If the input dataset is file-based but the number of files is less than
    # the number of workers, an error will be raised. Turn off the auto-shard
    # policy here so that the dataset is sharded by data instead of by file.
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = (
        tf.data.experimental.AutoShardPolicy.DATA)
    dataset = dataset.with_options(options)

    return dataset
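A sketch of the distributed-training setup the auto-shard override above is guarding against; `fn_args` (the TFX-provided arguments), `_build_keras_model` and the 'tips' label key are placeholders:

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = _build_keras_model()  # hypothetical model-building helper

train_dataset = _input_fn(fn_args.train_files, tf_transform_output,
                          label_key='tips', batch_size=200)
model.fit(train_dataset, steps_per_epoch=fn_args.train_steps)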
Example #7
def _input_fn(file_pattern: List[Text],
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
    """Generates features and label for tuning/training.
    Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    tf_transform_output: A TFTransformOutput.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch
    Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
    """
    transformed_feature_spec = (
        tf_transform_output.transformed_feature_spec().copy())

    dataset = tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=transformed_feature_spec,
        reader=_gzip_reader_fn,
        label_key=_transformed_name(_LABEL_KEY))

    return dataset
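Several examples above call _gzip_reader_fn and _transformed_name without showing them. In TFX taxi-style utilities they are usually defined along these lines (a sketch, not taken from these examples):

def _gzip_reader_fn(filenames):
    # Returns a record reader that can read gzipped TFRecord files.
    return tf.data.TFRecordDataset(filenames, compression_type='GZIP')

def _transformed_name(key):
    # Appends the suffix conventionally used for transformed feature names.
    return key + '_xf'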
Example #8
if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--tfrecord-file', dest='tfrecord_file', required=True)
    parser.add_argument('--tft-artifacts-dir',
                        dest='tft_artifacts_dir',
                        required=True)
    args = parser.parse_args()

    tfrecords_list = utils.list_tfrecords(args.tfrecord_file)

    tft_metadata_dir = os.path.join(args.tft_artifacts_dir,
                                    transform_fn_io.TRANSFORM_FN_DIR)
    tft_metadata = TFTransformOutput(args.tft_artifacts_dir)

    input_fn_op = input_fn(tfrecords_list, tft_metadata, 1)

    input_fn_next = input_fn_op.make_one_shot_iterator().get_next()

    data_dict = {'community_area_code': [], 'target': []}

    stop = False
    with tf.Session() as sess:
        while True:
            try:
                batch_X, batch_Y = sess.run(input_fn_next)
                for k in batch_X.keys():
                    if np.any(np.isnan(batch_X[k])):
                        logger.info("{} {}".format(k, batch_X[k]))
            except tf.errors.OutOfRangeError:
                # The one-shot iterator is exhausted: all records were checked.
                break
Example #9
def get_deep_and_wide_columns(tft_transform_dir, embedding_size=8):
    """Creates deep and wide feature_column lists.
    Args:
            tf_transform_dir: (str), directory in which the tf-transform model was written
                                     during the preprocessing step.
            embedding_size: (int), the number of dimensions used to represent categorical
                                   features when providing them as inputs to the DNN.
    Returns:
            [tf.feature_column],[tf.feature_column]: deep and wide feature_column lists.
    """

    tft_output = TFTransformOutput(tft_transform_dir)
    transformed_feature_spec = tft_output.transformed_feature_spec()

    transformed_feature_spec.pop(my_metadata.transformed_name(my_metadata.LABEL_KEY))

    deep_columns = {}
    wide_columns = {}

    for transformed_key, tensor in transformed_feature_spec.items():
        #  Separate features by deep and wide
        if transformed_key in my_metadata.transformed_names(my_metadata.VOCAB_FEATURE_KEYS):
            if transformed_key not in my_metadata.transformed_names(my_metadata.CATEGORICAL_FEATURE_KEYS_TO_BE_REMOVED):
                wide_columns[transformed_key] = tf.feature_column.categorical_column_with_identity(
                    key=transformed_key,
                    num_buckets=tft_output.vocabulary_size_by_name(transformed_key) + my_metadata.OOV_SIZE
                )

        elif transformed_key in my_metadata.transformed_names(my_metadata.HASH_STRING_FEATURE_KEYS):
            if transformed_key not in my_metadata.transformed_names(my_metadata.CATEGORICAL_FEATURE_KEYS_TO_BE_REMOVED):
                wide_columns[transformed_key] = tf.feature_column.categorical_column_with_identity(
                    key=transformed_key,
                    num_buckets=my_metadata.HASH_STRING_FEATURE_KEYS[my_metadata.original_name(transformed_key)]
                )

        elif transformed_key in my_metadata.transformed_names(my_metadata.NUMERIC_FEATURE_KEYS):
            if transformed_key not in my_metadata.transformed_names(my_metadata.NUMERIC_FEATURE_KEYS_TO_BE_REMOVED):
                deep_columns[transformed_key] = tf.feature_column.numeric_column(transformed_key)

        elif (
                (transformed_key.endswith(my_metadata.transformed_name('_bucketized'))
                    and transformed_key.replace(
                            my_metadata.transformed_name('_bucketized'), '') in my_metadata.TO_BE_BUCKETIZED_FEATURE)):
            wide_columns[transformed_key] = tf.feature_column.categorical_column_with_identity(
                key=transformed_key,
                num_buckets=tft_output.num_buckets_for_transformed_feature(transformed_key)
            )

        else:
            raise LookupError(
                'The pair (%s, %s) is not consistent with utils.my_metadata' %
                (transformed_key, tensor))

    # Create new categorical features by crossing bucketized columns
    wide_columns.update(
        {
        'pickup_latitude_bucketized_xf_x_pickup_longitude_bucketized_xf' : tf.feature_column.crossed_column(
            ['pickup_latitude_bucketized_xf', 'pickup_longitude_bucketized_xf'],
            hash_bucket_size=int(1e3)),
        }
    )
    # Create new dense features from the categorical (wide) columns
    deep_columns.update(
        {
            # Use indicator columns for low dimensional vocabularies
            'trip_start_day_xf_indicator': tf.feature_column.indicator_column(wide_columns['trip_start_day_xf']),

            # Use embedding columns for high dimensional vocabularies
            'company_xf_embedding':  tf.feature_column.embedding_column(
                wide_columns['company_xf'], dimension=embedding_size)
        }
    )

    return deep_columns.values(), wide_columns.values()
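A sketch of how the two column lists might feed a canned wide-and-deep estimator; the transform directory and hidden-unit sizes below are placeholders:

deep_columns, wide_columns = get_deep_and_wide_columns('/tmp/tft_output',
                                                       embedding_size=8)
estimator = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[128, 64])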