def _input_fn():
  """Builds a (features, label) pair from the ELWC data file.

  Parses context/example feature columns (plus the label column and, when
  example weights are enabled, the weight column) into parse specs, reads
  the data file as a batched ranking dataset, and splits off the label.

  Returns:
    A `(features, label)` tuple where `features` is a dict of batched
    tensors and `label` is the squeezed label tensor.
  """
  context_fc, example_fc, _ = _get_feature_columns()
  context_feature_spec = tf.feature_column.make_parse_example_spec(
      list(context_fc.values()))
  label_fc = tf.feature_column.numeric_column(
      _LABEL_FEATURE, dtype=tf.float32, default_value=_PADDING_LABEL)
  # Only parse the example-weight feature when this run is configured to
  # weight examples.
  if weights_feature_name == _EXAMPLE_WEIGHT_FEATURE:
    weight_fc = _get_example_weight_feature_column()
  else:
    weight_fc = None
  per_example_columns = list(example_fc.values())
  per_example_columns.append(label_fc)
  if weight_fc:
    per_example_columns.append(weight_fc)
  example_feature_spec = tf.feature_column.make_parse_example_spec(
      per_example_columns)
  dataset = data.build_ranking_dataset(
      file_pattern=self._data_file,
      data_format=data.ELWC,
      batch_size=10,
      context_feature_spec=context_feature_spec,
      example_feature_spec=example_feature_spec,
      list_size=2,
      reader=tf.data.TFRecordDataset,
      size_feature_name=_SIZE)
  features = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
  # The label was parsed as a rank-3 per-example feature; drop the
  # trailing singleton dimension.
  label = tf.squeeze(features.pop(_LABEL_FEATURE), axis=2)
  return features, label
def _make_dataset(self,
                  batch_size,
                  list_size,
                  input_pattern,
                  randomize_input=True,
                  num_epochs=None):
  """Builds a dataset for the TF-Ranking model.

  Args:
    batch_size: (int) The number of input examples to process per batch. Use
      params['batch_size'] for TPUEstimator, and `batch_size` for Estimator.
    list_size: (int) The list size for an ELWC example.
    input_pattern: (str) File pattern for the input data.
    randomize_input: (bool) If true, randomize input example order. It should
      almost always be true except for unittest/debug purposes.
    num_epochs: (int) The number of times the input dataset must be repeated.
      None to repeat the data indefinitely.

  Returns:
    A `tf.data.Dataset` yielding (feature tensors, label tensor) tuples.
  """
  # Parse spec for context-level features.
  context_feature_spec = tf.feature_column.make_parse_example_spec(
      self._context_feature_columns.values())
  # The label rides along as an additional per-example feature; padded
  # entries get _PADDING_LABEL.
  label_fc = tf.feature_column.numeric_column(
      self._label_feature_name,
      dtype=self._label_feature_type,
      default_value=_PADDING_LABEL)
  per_example_columns = list(self._example_feature_columns.values())
  per_example_columns.append(label_fc)
  example_feature_spec = tf.feature_column.make_parse_example_spec(
      per_example_columns)
  dataset = tfr_data.build_ranking_dataset(
      file_pattern=input_pattern,
      data_format=tfr_data.ELWC,
      batch_size=batch_size,
      list_size=list_size,
      context_feature_spec=context_feature_spec,
      example_feature_spec=example_feature_spec,
      reader=self._dataset_reader,
      reader_args=None,
      num_epochs=num_epochs,
      shuffle=randomize_input,
      shuffle_buffer_size=1000,
      shuffle_seed=None,
      prefetch_buffer_size=10000,
      reader_num_threads=64,
      sloppy_ordering=True,
      drop_final_batch=False,
      num_parser_threads=None,
      size_feature_name=self._size_feature_name)
  # Split the parsed feature dict into (features, labels) for the model.
  return dataset.map(self._features_and_labels)
def test_build_ranking_dataset_reader_num_threads(self, reader_num_threads):
  """Verifies dataset parsing is unchanged under multi-threaded reading."""
  with tf.Graph().as_default():
    # Write a small TFRecord file of serialized EIE protos into a temp dir,
    # replacing any file left over from a previous run.
    serialized_eies = [
        _example_in_example(CONTEXT_1, EXAMPLES_1).SerializeToString(),
        _example_in_example(CONTEXT_2, EXAMPLES_2).SerializeToString(),
    ] * 5
    tmp_dir = tf.compat.v1.test.get_temp_dir()
    record_path = os.path.join(tmp_dir, "test_ranking_data.tfrecord")
    if tf.io.gfile.exists(record_path):
      tf.io.gfile.remove(record_path)
    with tf.io.TFRecordWriter(record_path) as writer:
      for record in serialized_eies:
        writer.write(record)

    dataset = data_lib.build_ranking_dataset(
        file_pattern=record_path,
        data_format=data_lib.EIE,
        batch_size=2,
        list_size=2,
        context_feature_spec=CONTEXT_FEATURE_SPEC,
        example_feature_spec=EXAMPLE_FEATURE_SPEC,
        reader=tf.data.TFRecordDataset,
        shuffle=False,
        reader_num_threads=reader_num_threads)
    features = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()

    # Static shapes: [batch, 1] for context, [batch, list_size, 1] for
    # per-example features.
    self.assertAllEqual([2, 1],
                        features["query_length"].get_shape().as_list())
    self.assertAllEqual([2, 2, 1],
                        features["utility"].get_shape().as_list())
    self.assertAllEqual(sorted(features.keys()),
                        ["query_length", "unigrams", "utility"])

    with tf.compat.v1.Session() as sess:
      sess.run(tf.compat.v1.local_variables_initializer())
      features = sess.run(features)
      # Sparse feature: check dense_shape, indices and values explicitly.
      self.assertAllEqual(features["unigrams"].dense_shape, [2, 2, 3])
      self.assertAllEqual(
          features["unigrams"].indices,
          [[0, 0, 0], [0, 1, 0], [0, 1, 1], [0, 1, 2], [1, 0, 0]])
      self.assertAllEqual(
          features["unigrams"].values,
          [b"tensorflow", b"learning", b"to", b"rank", b"gbdt"])
      # For Tensors with dense values, values can be directly checked.
      self.assertAllEqual(features["query_length"], [[3], [2]])
      self.assertAllEqual(features["utility"],
                          [[[0.], [1.0]], [[0.], [-1.]]])
def _build_dataset(self,
                   file_pattern: str,
                   batch_size: int,
                   list_size: Optional[int] = None,
                   randomize_input: bool = True,
                   num_epochs: Optional[int] = None) -> tf.data.Dataset:
  """Returns `tf.data.Dataset` for training or validating the model.

  Args:
    file_pattern: File pattern for input data.
    batch_size: Number of input examples to process per batch.
    list_size: The list size for an ELWC example.
    randomize_input: If true, randomize input example order. It should almost
      always be true except for unittest/debug purposes.
    num_epochs: Number of times the input dataset must be repeated. None to
      repeat the data indefinitely.

  Returns:
    A `tf.data.Dataset`.
  """
  # TODO: Remove defaults common in Estimator pipeline and here.
  # Training-only specs (e.g. labels/weights) are merged into the base
  # serving-time specs so they are parsed during training.
  merged_context_spec = {
      **self._context_feature_spec,
      **self._training_only_context_spec,
  }
  merged_example_spec = {
      **self._example_feature_spec,
      **self._training_only_example_spec,
  }
  dataset = data.build_ranking_dataset(
      file_pattern=file_pattern,
      data_format=data.ELWC,
      batch_size=batch_size,
      list_size=list_size,
      context_feature_spec=merged_context_spec,
      example_feature_spec=merged_example_spec,
      mask_feature_name=self._mask_feature_name,
      reader=self._hparams.dataset_reader,
      reader_args=None,
      num_epochs=num_epochs,
      shuffle=randomize_input,
      shuffle_buffer_size=1000,
      shuffle_seed=None,
      prefetch_buffer_size=10000,
      reader_num_threads=64,
      sloppy_ordering=True,
      drop_final_batch=False,
      shuffle_examples=False)
  return dataset.map(
      self._features_and_labels,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
def _inner_input_fn():
  """Reads ELWC records from DATA_FILE and yields a (features, label) pair."""
  context_feature_spec = tf.feature_column.make_parse_example_spec(
      list(context_feature_column().values()))
  # The label is parsed as one more per-example numeric feature; padded
  # entries get _PADDING_LABEL.
  label_fc = tf.feature_column.numeric_column(
      _LABEL_FEATURE, default_value=_PADDING_LABEL)
  per_example_columns = list(example_feature_columns().values())
  per_example_columns.append(label_fc)
  example_feature_spec = tf.feature_column.make_parse_example_spec(
      per_example_columns)
  dataset = data.build_ranking_dataset(
      file_pattern=DATA_FILE,
      data_format=data.ELWC,
      batch_size=10,
      context_feature_spec=context_feature_spec,
      example_feature_spec=example_feature_spec,
      list_size=2,
      reader=tf.data.TFRecordDataset)
  features = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
  # Drop the trailing singleton dimension of the parsed label.
  label = tf.squeeze(features.pop(_LABEL_FEATURE), axis=2)
  return features, label