def train_and_evaluate(output_dir):
  review_column = feature_column.sparse_column_with_integerized_feature(
      const.REVIEW_COLUMN, bucket_size=vocab_size + 1, combiner='sum')
  weighted_reviews = feature_column.weighted_sparse_column(
      review_column, const.REVIEW_WEIGHT)

  estimator = learn.LinearClassifier(
      feature_columns=[weighted_reviews],
      n_classes=2,
      model_dir=output_dir,
      config=tf.contrib.learn.RunConfig(save_checkpoints_secs=30))

  transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  raw_metadata = metadata_io.read_metadata(raw_metadata_dir)

  train_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_train_file_pattern,
      training_batch_size=train_batch_size,
      label_keys=[const.LABEL_COLUMN])

  eval_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_test_file_pattern,
      training_batch_size=1,
      label_keys=[const.LABEL_COLUMN])

  serving_input_fn = input_fn_maker.build_default_transforming_serving_input_fn(
      raw_metadata=raw_metadata,
      transform_savedmodel_dir=output_dir + '/transform_fn',
      raw_label_keys=[],
      raw_feature_keys=[const.REVIEW_COLUMN])

  export_strategy = saved_model_export_utils.make_export_strategy(
      serving_input_fn,
      exports_to_keep=5,
      default_output_alternative_key=None)

  return tf.contrib.learn.Experiment(
      estimator=estimator,
      train_steps=train_num_epochs * num_train_instances / train_batch_size,
      eval_steps=num_test_instances,
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      export_strategies=export_strategy,
      min_eval_frequency=500)
def transformed_metadata(self):
  """A DatasetMetadata."""
  if self._transformed_metadata is None:
    self._transformed_metadata = metadata_io.read_metadata(
        os.path.join(self._transform_output_dir,
                     self.TRANSFORMED_METADATA_DIR))
  return self._transformed_metadata
def make_input_function(working_dir, filebase, num_epochs=None, shuffle=True,
                        batch_size=200):
  transformed_metadata = metadata_io.read_metadata(
      os.path.join(working_dir, transform_fn_io.TRANSFORMED_METADATA_DIR))
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  def parse_tf_record(example_proto):
    parsed_features = tf.parse_single_example(example_proto,
                                              transformed_feature_spec)
    return parsed_features

  def input_func():
    file_pattern = os.path.join(working_dir, filebase + '-*')
    file_names = tf.data.TFRecordDataset.list_files(file_pattern)
    dataset = file_names.flat_map(
        lambda x: tf.data.TFRecordDataset(x)).map(parse_tf_record)
    if shuffle:
      dataset = dataset.shuffle(buffer_size=batch_size * 10)
    dataset = dataset.repeat(num_epochs).batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    # Popping the label mutates the same dict that is returned as features,
    # so the label is not also fed to the model as a feature.
    return features, features.pop(LABEL_KEY)

  return input_func
def is_classification(transformed_data_dir, target):
  """Whether the scenario is classification (vs regression).

  Returns:
    The number of classes if the target represents a classification
    problem, or None if it does not.
  """
  transformed_metadata = metadata_io.read_metadata(
      os.path.join(transformed_data_dir,
                   transform_fn_io.TRANSFORMED_METADATA_DIR))
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()
  if target not in transformed_feature_spec:
    raise ValueError('Cannot find target "%s" in transformed data.' % target)

  feature = transformed_feature_spec[target]
  if (not isinstance(feature, tf.FixedLenFeature) or feature.shape != [] or
      feature.dtype not in TARGET_TYPES):
    raise ValueError('target "%s" is of invalid type.' % target)

  if feature.dtype in CLASSIFICATION_TARGET_TYPES:
    if feature.dtype == tf.bool:
      return 2
    return get_vocab_size(transformed_data_dir, target)

  return None
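A hedged usage sketch for the snippet above: the return value can drive the choice between a classifier and a regressor head. The estimator arguments (`my_feature_columns`, the hidden-unit sizes) are illustrative assumptions, not part of the original example.

import tensorflow as tf

# Illustrative only: pick a classifier or regressor based on is_classification.
# `my_feature_columns`, `transformed_data_dir` and `target` are assumed to be
# defined elsewhere.
n_classes = is_classification(transformed_data_dir, target)
if n_classes is not None:
  estimator = tf.estimator.DNNClassifier(
      hidden_units=[64, 32],
      feature_columns=my_feature_columns,
      n_classes=n_classes)
else:
  estimator = tf.estimator.DNNRegressor(
      hidden_units=[64, 32],
      feature_columns=my_feature_columns)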
def testWriteTransformFn(self):
  path = os.path.join(self.get_temp_dir(), 'output')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    metadata = beam_metadata_io.BeamDatasetMetadata(
        _TEST_METADATA_WITH_FUTURES,
        {
            'a': pipeline | 'CreateA' >> beam.Create([3]),
        })

    _ = ((saved_model_dir_pcoll, metadata)
         | transform_fn_io.WriteTransformFn(path))

  transformed_metadata_dir = os.path.join(
      path, transform_fn_io.TRANSFORMED_METADATA_DIR)
  metadata = metadata_io.read_metadata(transformed_metadata_dir)
  self.assertEqual(metadata, _TEST_METADATA)

  transform_fn_dir = os.path.join(path, transform_fn_io.TRANSFORM_FN_DIR)
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))
def input_fn(filenames, tf_transform_dir, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of CSV files to read data from.
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    batch_size: int First dimension size of the Tensors returned by input_fn

  Returns:
    A (features, indices) tuple where features is a dictionary of
      Tensors, and indices is a single Tensor of label indices.
  """
  metadata_dir = os.path.join(tf_transform_dir,
                              transform_fn_io.TRANSFORMED_METADATA_DIR)
  transformed_metadata = metadata_io.read_metadata(metadata_dir)
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  transformed_features = tf.contrib.learn.io.read_batch_features(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(taxi.LABEL_KEY)
def _input_fn(filenames, transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of CSV files to read data from.
    transform_output: directory in which the tf-transform model was written
      during the preprocessing step.
    batch_size: int First dimension size of the Tensors returned by input_fn

  Returns:
    A (features, indices) tuple where features is a dictionary of
      Tensors, and indices is a single Tensor of label indices.
  """
  metadata_dir = os.path.join(transform_output,
                              transform_fn_io.TRANSFORMED_METADATA_DIR)
  transformed_metadata = metadata_io.read_metadata(metadata_dir)
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  transformed_features = tf.contrib.learn.io.read_batch_features(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      _transformed_name(_LABEL_KEY))
def _make_training_input_fn(working_dir, filebase, batch_size):
  """Creates an input function reading from transformed data.

  Args:
    working_dir: Directory to read transformed data and metadata from and to
      write exported model to.
    filebase: Base filename (relative to `working_dir`) of examples.
    batch_size: Batch size.

  Returns:
    The input function for training or eval.
  """
  transformed_metadata = metadata_io.read_metadata(
      os.path.join(working_dir, tft.TRANSFORMED_METADATA_DIR))
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  def input_fn():
    """Input function for training and eval."""
    transformed_features = tf.contrib.learn.io.read_batch_features(
        os.path.join(working_dir, filebase + '*'),
        batch_size, transformed_feature_spec, tf.TFRecordReader)

    # Extract features and label from the transformed tensors.
    transformed_labels = transformed_features.pop(LABEL_KEY)
    return transformed_features, transformed_labels

  return input_fn
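A hedged usage sketch for the factory above, wiring the returned input function into a contrib.learn estimator. The estimator, `my_feature_columns`, `working_dir`, and the 'train_transformed' file base are assumptions for illustration only.

import tensorflow as tf

# Illustrative only: train a linear model on the transformed examples.
train_input_fn = _make_training_input_fn(
    working_dir, 'train_transformed', batch_size=128)
estimator = tf.contrib.learn.LinearClassifier(
    feature_columns=my_feature_columns)
estimator.fit(input_fn=train_input_fn, steps=1000)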
def make_serving_input_fn_for_base64_json(args):
  raw_metadata = metadata_io.read_metadata(
      os.path.join(args['metadata_path'], 'rawdata_metadata'))
  transform_savedmodel_dir = (
      os.path.join(args['metadata_path'], 'transform_fn'))
  return input_fn_maker.build_parsing_transforming_serving_input_receiver_fn(
      raw_metadata,
      transform_savedmodel_dir,
      exclude_raw_keys=[LABEL_COL])
def make_training_input_fn(transformed_data_dir, mode, batch_size, target_name,
                           num_epochs=None):
  """Creates an input function reading from transformed data.

  Args:
    transformed_data_dir: Directory to read transformed data and metadata from.
    mode: 'train' or 'eval'.
    batch_size: Batch size.
    target_name: name of the target column.
    num_epochs: number of training data epochs.

  Returns:
    The input function for training or eval.
  """
  transformed_metadata = metadata_io.read_metadata(
      os.path.join(transformed_data_dir,
                   transform_fn_io.TRANSFORMED_METADATA_DIR))
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  def _input_fn():
    """Input function for training and eval."""
    epochs = 1 if mode == 'eval' else num_epochs
    transformed_features = tf.contrib.learn.io.read_batch_features(
        os.path.join(transformed_data_dir, mode + '-*'),
        batch_size,
        transformed_feature_spec,
        tf.TFRecordReader,
        num_epochs=epochs)

    # Extract features and label from the transformed tensors.
    transformed_labels = transformed_features.pop(target_name)
    return transformed_features, transformed_labels

  return _input_fn
def test_read_features(self):
  # TODO(b/123241798): use TEST_TMPDIR
  basedir = tempfile.mkdtemp()
  schema_no_sparse_features = """
  {
    "feature": [{
      "name": "my_key",
      "fixedShape": {
        "axis": [{
          "size": 2
        }]
      },
      "type": "INT",
      "domain": {
        "ints": {}
      },
      "parsingOptions": {
        "tfOptions": {
          "fixedLenFeature": {}
        }
      }
    }]
  }
  """
  self._write_schema_to_disk(basedir, schema_no_sparse_features)
  _ = metadata_io.read_metadata(basedir)
def run(args):
  # config = tf.estimator.RunConfig(save_checkpoints_steps=10)
  feature_spec = metadata_io.read_metadata(
      posixpath.join(
          args.input_dir,
          constants.TRANSFORMED_METADATA_DIR)).schema.as_feature_spec()

  train_input_fn = get_input_fn(
      "{}*".format(
          posixpath.join(args.input_dir,
                         constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX)),
      feature_spec,
      num_epochs=args.num_epochs,
      batch_size=args.batch_size)
  train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                      max_steps=args.train_steps)

  eval_input_fn = get_input_fn(
      posixpath.join(args.input_dir,
                     constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
      feature_spec,
      num_epochs=1,
      batch_size=args.batch_size)
  eval_spec = tf.estimator.EvalSpec(
      input_fn=eval_input_fn
      # exporters=tf.estimator.FinalExporter(
      #     name='export',
      #     serving_input_receiver_fn=get_serving_input_fn(args.input_dir)
      # )
  )

  linear_regressor = tf.estimator.LinearRegressor(
      feature_columns=_get_feature_columns(), model_dir=args.model_dir)

  tf.estimator.train_and_evaluate(linear_regressor, train_spec, eval_spec)
def testWriteTransformFn(self):
  path = os.path.join(self.get_temp_dir(), 'output')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    metadata = _TEST_METADATA
    deferred_metadata = (
        pipeline | 'CreateEmptyProperties' >> beam.Create([_FUTURES_DICT]))

    _ = ((saved_model_dir_pcoll, (metadata, deferred_metadata))
         | transform_fn_io.WriteTransformFn(path))

  transformed_metadata_dir = os.path.join(path, 'transformed_metadata')
  metadata = metadata_io.read_metadata(transformed_metadata_dir)
  self.assertEqual(metadata, _TEST_METADATA)

  transform_fn_dir = os.path.join(path, 'transform_fn')
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))
def testWriteMetadataIsRetryable(self):
  tft_test_case.skip_if_external_environment(
      'Retries are currently not available on this environment.')

  original_write_metadata = beam_metadata_io.metadata_io.write_metadata
  write_metadata_called_list = []

  def mock_write_metadata(metadata, path):
    """Mocks metadata_io.write_metadata to fail the first time it is called
    by this test, thus forcing a retry which should succeed."""
    if not write_metadata_called_list:
      write_metadata_called_list.append(True)
      original_write_metadata(metadata, path)
      raise ArithmeticError('Some error')
    return original_write_metadata(metadata, path)

  # Write metadata to disk using WriteMetadata PTransform.
  with mock.patch(
      'tensorflow_transform.tf_metadata.metadata_io.write_metadata',
      mock_write_metadata):
    with self._makeTestPipeline() as pipeline:
      path = self.get_temp_dir()
      _ = (test_metadata.COMPLETE_METADATA
           | beam_metadata_io.WriteMetadata(path, pipeline))

    # Load from disk and check that it is as expected.
    metadata = metadata_io.read_metadata(path)
    self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
def make_experiment(output_dir):
  """Function that creates an experiment http://goo.gl/HcKHlT.

  Args:
    output_dir: The directory where the training output should be written.

  Returns:
    A `tf.contrib.learn.Experiment`.
  """
  estimator = tf.contrib.learn.Estimator(
      model_fn=model_builder(hparams=args), model_dir=output_dir)

  train_input_fn = make_input_fn(
      mode=tf.contrib.learn.ModeKeys.TRAIN,
      eval_type=args.eval_type,
      data_file_pattern=args.train_data_paths,
      randomize_input=args.randomize_input,
      batch_size=args.batch_size,
      queue_capacity=4 * args.batch_size)

  eval_input_fn = make_input_fn(
      mode=tf.contrib.learn.ModeKeys.EVAL,
      eval_type=args.eval_type,
      data_file_pattern=args.eval_data_paths,
      batch_size=args.eval_batch_size,
      queue_capacity=4 * args.eval_batch_size)

  raw_metadata = metadata_io.read_metadata(args.raw_metadata_path)

  # Neither ratings nor candidate features are needed for serving.
  raw_label_keys = [LABEL_RATING_SCORE]
  # For serving, we only need query features.
  raw_feature_keys = [
      QUERY_RATED_MOVIE_IDS,
      QUERY_RATED_MOVIE_SCORES,
      QUERY_RATED_GENRE_IDS,
      QUERY_RATED_GENRE_FREQS,
      QUERY_RATED_GENRE_AVG_SCORES
  ]

  serving_input_fn = (
      input_fn_maker.build_parsing_transforming_serving_input_fn(
          raw_metadata,
          args.transform_savedmodel,
          raw_label_keys=raw_label_keys,
          raw_feature_keys=raw_feature_keys))

  export_strategy = tf.contrib.learn.utils.make_export_strategy(
      serving_input_fn,
      default_output_alternative_key=DEFAULT_OUTPUT_ALTERNATIVE)

  return tf.contrib.learn.Experiment(
      estimator=estimator,
      train_steps=(args.train_steps or
                   args.num_epochs * args.train_set_size // args.batch_size),
      eval_steps=args.eval_steps,
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      eval_metrics=create_evaluation_metrics(args.eval_type),
      export_strategies=[export_strategy],
      # Do not remove; this is needed until b/36498507 is fixed.
      min_eval_frequency=1000)
def make_serving_input_fn_for_base64_json(args):
  raw_metadata = metadata_io.read_metadata(
      os.path.join(args['metadata_path'], 'rawdata_metadata'))
  transform_savedmodel_dir = (
      os.path.join(args['metadata_path'], 'transform_fn'))
  return input_fn_maker.build_parsing_transforming_serving_input_receiver_fn(
      raw_metadata,
      transform_savedmodel_dir,
      exclude_raw_keys=[LABEL_COLUMN])
def testWriteMetadataNonDeferred(self):
  # Write properties as metadata to disk.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    _ = (_TEST_METADATA_COMPLETE
         | beam_metadata_io.WriteMetadata(path, pipeline))

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertMetadataEqual(metadata, _TEST_METADATA_COMPLETE)
def parse_tf_example(tf_example):
  transformed_metadata = metadata_io.read_metadata(
      os.path.join(params.Params.TRANSFORM_ARTIFACTS_DIR,
                   "transformed_metadata"))
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()
  parsed_features = tf.parse_example(serialized=tf_example,
                                     features=transformed_feature_spec)
  target = parsed_features.pop(metadata.TARGET_FEATURE_NAME)
  return parsed_features, target
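A hedged usage sketch for the parser above: since tf.parse_example expects a batch of serialized tf.Example protos, the dataset is batched before the map. The `tfrecord_files` list of transformed TFRecord paths is an assumption for illustration.

import tensorflow as tf

# Illustrative only: parse batches of serialized examples from transformed
# TFRecord files using the feature spec read by parse_tf_example.
dataset = (tf.data.TFRecordDataset(tfrecord_files)
           .batch(64)
           .map(parse_tf_example))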
def _build_estimator(transform_output, config, hidden_units=None,
                     warm_start_from=None):
  """Build an estimator for predicting the tipping behavior of taxi riders.

  Args:
    transform_output: directory in which the tf-transform model was written
      during the preprocessing step.
    config: tf.contrib.learn.RunConfig defining the runtime environment for the
      estimator (including model_dir).
    hidden_units: [int], the layer sizes of the DNN (input layer first)
    warm_start_from: Optional directory to warm start from.

  Returns:
    The DNNLinearCombinedClassifier that will be used for training and eval.
  """
  metadata_dir = os.path.join(transform_output,
                              transform_fn_io.TRANSFORMED_METADATA_DIR)
  transformed_metadata = metadata_io.read_metadata(metadata_dir)
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  transformed_feature_spec.pop(_transformed_name(_LABEL_KEY))

  real_valued_columns = [
      tf.feature_column.numeric_column(key, shape=())
      for key in _transformed_names(_DENSE_FLOAT_FEATURE_KEYS)
  ]
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=_VOCAB_SIZE + _OOV_SIZE, default_value=0)
      for key in _transformed_names(_VOCAB_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=_FEATURE_BUCKET_COUNT, default_value=0)
      for key in _transformed_names(_BUCKET_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(  # pylint: disable=g-complex-comprehension
          key, num_buckets=num_buckets, default_value=0)
      for key, num_buckets in zip(
          _transformed_names(_CATEGORICAL_FEATURE_KEYS),
          _MAX_CATEGORICAL_FEATURE_VALUES)
  ]
  return tf.estimator.DNNLinearCombinedClassifier(
      config=config,
      linear_feature_columns=categorical_columns,
      dnn_feature_columns=real_valued_columns,
      dnn_hidden_units=hidden_units or [100, 70, 50, 25],
      warm_start_from=warm_start_from)
def testWriteMetadataNonDeferred(self):
  # Write metadata to disk using WriteMetadata PTransform.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    _ = (test_metadata.COMPLETE_METADATA
         | beam_metadata_io.WriteMetadata(path, pipeline))

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
def test_write_and_read(self):
  # TODO(b/123241798): use TEST_TMPDIR
  basedir = tempfile.mkdtemp()
  original = dataset_metadata.DatasetMetadata(
      schema=test_common.get_test_schema())

  metadata_io.write_metadata(original, basedir)
  reloaded = metadata_io.read_metadata(basedir)

  self.assertEqual(original, reloaded)
def testWriteMetadataDeferredProperties(self):
  # Write deferred properties as metadata to disk.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    deferred_metadata = pipeline | beam.Create([_FUTURES_DICT])
    _ = ((_TEST_METADATA_WITH_FUTURES, deferred_metadata)
         | beam_metadata_io.WriteMetadata(path, pipeline))

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertMetadataEqual(metadata, _TEST_METADATA)
def testWriteMetadataNonDeferredEmptyDict(self):
  # Write properties as metadata to disk.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    property_pcoll = pipeline | beam.Create([{}])
    _ = ((_TEST_METADATA, property_pcoll)
         | beam_metadata_io.WriteMetadata(path, pipeline))

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertMetadataEqual(metadata, _TEST_METADATA)
def expand(self, pvalue):
  # Read metadata in non-deferred manner. Note that since this reads the
  # whole metadata in a non-deferred manner, typically the roundtrip
  #
  #   done = metadata | WriteMetadata(path)
  #   metadata = p | ReadMetadata(path).must_follow(done)
  #
  # will fail as the metadata on disk will not be complete when the read is
  # done.
  return metadata_io.read_metadata(self._path)
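Given the pitfall described in the comment above, here is a minimal hedged sketch of the safe ordering: run the writing pipeline to completion before doing the non-deferred read. `my_metadata` and `path` are assumed names, and the import paths are assumptions based on how the snippets above reference these modules.

import apache_beam as beam
from tensorflow_transform.beam.tft_beam_io import beam_metadata_io
from tensorflow_transform.tf_metadata import metadata_io

# Illustrative only: the `with` block runs the pipeline and waits for it to
# finish, so the non-deferred read below sees complete metadata on disk.
with beam.Pipeline() as pipeline:
  _ = my_metadata | beam_metadata_io.WriteMetadata(path, pipeline)

reloaded = metadata_io.read_metadata(path)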
def test_write_and_read(self):
  basedir = tempfile.mkdtemp()
  original = dataset_metadata.DatasetMetadata(
      schema=test_common.get_test_schema())

  metadata_io.write_metadata(original, basedir, versions=_test_versions)
  reloaded = metadata_io.read_metadata(basedir, versions=_test_versions)

  generated_feature_spec = reloaded.schema.as_feature_spec()
  self.assertEqual(test_common.test_feature_spec, generated_feature_spec)
def expand(self, pvalue):
  transform_fn_path = os.path.join(self._path, TRANSFORM_FN_DIR)
  saved_model_dir_pcoll = (
      pvalue.pipeline
      | 'CreateTransformFnPath' >> beam.Create([transform_fn_path]))

  metadata = metadata_io.read_metadata(
      os.path.join(self._path, TRANSFORMED_METADATA_DIR))

  return saved_model_dir_pcoll, metadata
def test_write_and_read(self):
  # TODO(b/123241798): use TEST_TMPDIR
  basedir = tempfile.mkdtemp()
  original = dataset_metadata.DatasetMetadata(
      schema=test_common.get_test_schema())

  metadata_io.write_metadata(original, basedir)
  reloaded = metadata_io.read_metadata(basedir)

  generated_feature_spec = reloaded.schema.as_feature_spec()
  self.assertEqual(test_common.test_feature_spec, generated_feature_spec)
def test_write_and_read(self):
  basedir = tempfile.mkdtemp()
  original_schema = schema_io_vtest.TestSchema(
      {'test_feature_1': 'bogus 1', 'test_feature_2': 'bogus 2'})
  original = dataset_metadata.DatasetMetadata(schema=original_schema)

  metadata_io.write_metadata(original, basedir, versions=_test_versions)
  reloaded = metadata_io.read_metadata(basedir, versions=_test_versions)

  self.assertTrue('test_feature_1' in reloaded.schema.column_schemas)
  self.assertTrue('test_feature_2' in reloaded.schema.column_schemas)
  self.assertEqual(2, len(reloaded.schema.column_schemas))
def test_read_with_invalid_keys(self):
  basedir = tempfile.mkdtemp()
  version_basedir = os.path.join(basedir, 'v1-json')

  # Write a proto by hand to disk.
  file_io.recursive_create_dir(version_basedir)
  file_io.write_string_to_file(
      os.path.join(version_basedir, 'schema.json'), _SCHEMA_WITH_INVALID_KEYS)

  with self.assertRaisesRegexp(
      ValueError, 'Keys of dense and sparse features overlapped.*'):
    _ = metadata_io.read_metadata(basedir, versions=_test_versions)
def expand(self, pvalue):
  transform_fn_path = os.path.join(self._path, 'transform_fn')
  saved_model_dir_pcoll = (
      pvalue.pipeline
      | 'CreateTransformFnPath' >> beam.Create([transform_fn_path]))

  metadata = metadata_io.read_metadata(
      os.path.join(self._path, 'transformed_metadata'))
  deferred_metadata = (
      pvalue.pipeline
      | 'CreateEmptyDeferredMetadata' >> beam.Create([{}]))

  return saved_model_dir_pcoll, (metadata, deferred_metadata)
def train_and_evaluate(transformed_train_filepattern,
                       transformed_test_filepattern,
                       transformed_metadata_dir,
                       num_train_instances=NUM_TRAIN_INSTANCES,
                       num_test_instances=NUM_TEST_INSTANCES):
  """Train the model on training data and evaluate on test data.

  Args:
    transformed_train_filepattern: File pattern for transformed training data
      shards
    transformed_test_filepattern: File pattern for transformed test data shards
    transformed_metadata_dir: Directory containing transformed data metadata
    num_train_instances: Number of instances in train set
    num_test_instances: Number of instances in test set

  Returns:
    The results from the estimator's 'evaluate' method
  """
  # Wrap scalars as real valued columns.
  real_valued_columns = [feature_column.real_valued_column(key)
                         for key in NUMERIC_COLUMNS]

  # Wrap categorical columns. Note the combiner is irrelevant since the input
  # only has one value set per feature per instance.
  one_hot_columns = [
      feature_column.sparse_column_with_integerized_feature(
          key, bucket_size=bucket_size, combiner='sum')
      for key, bucket_size in zip(CATEGORICAL_COLUMNS, BUCKET_SIZES)]

  estimator = learn.LinearClassifier(real_valued_columns + one_hot_columns)

  transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  train_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_train_filepattern,
      training_batch_size=TRAIN_BATCH_SIZE,
      label_keys=[LABEL_COLUMN])

  # Estimate the model using the default optimizer.
  estimator.fit(
      input_fn=train_input_fn,
      max_steps=TRAIN_NUM_EPOCHS * num_train_instances / TRAIN_BATCH_SIZE)

  # Evaluate model on test dataset.
  eval_input_fn = input_fn_maker.build_training_input_fn(
      transformed_metadata,
      transformed_test_filepattern,
      training_batch_size=1,
      label_keys=[LABEL_COLUMN])

  return estimator.evaluate(input_fn=eval_input_fn, steps=num_test_instances)
def raw_metadata(self):
  """A DatasetMetadata.

  Note: raw_metadata is not guaranteed to exist in the output of tf.transform,
  so using this property can fail if raw_metadata is not present in
  TFTransformOutput.

  Returns:
    A DatasetMetadata
  """
  if self._raw_metadata is None:
    self._raw_metadata = metadata_io.read_metadata(
        os.path.join(self._transform_output_dir, self.RAW_METADATA_DIR))
  return self._raw_metadata
def build_estimator(tf_transform_dir, config, hidden_units=None):
  """Build an estimator for predicting the tipping behavior of taxi riders.

  Args:
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    config: tf.contrib.learn.RunConfig defining the runtime environment for the
      estimator (including model_dir).
    hidden_units: [int], the layer sizes of the DNN (input layer first)

  Returns:
    Resulting DNNLinearCombinedClassifier.
  """
  metadata_dir = os.path.join(tf_transform_dir,
                              transform_fn_io.TRANSFORMED_METADATA_DIR)
  transformed_metadata = metadata_io.read_metadata(metadata_dir)
  transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

  transformed_feature_spec.pop(taxi.transformed_name(taxi.LABEL_KEY))

  real_valued_columns = [
      tf.feature_column.numeric_column(key, shape=())
      for key in taxi.transformed_names(taxi.DENSE_FLOAT_FEATURE_KEYS)
  ]
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=taxi.VOCAB_SIZE + taxi.OOV_SIZE, default_value=0)
      for key in taxi.transformed_names(taxi.VOCAB_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=taxi.FEATURE_BUCKET_COUNT, default_value=0)
      for key in taxi.transformed_names(taxi.BUCKET_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(
          key, num_buckets=num_buckets, default_value=0)
      for key, num_buckets in zip(
          taxi.transformed_names(taxi.CATEGORICAL_FEATURE_KEYS),
          taxi.MAX_CATEGORICAL_FEATURE_VALUES)
  ]
  return tf.estimator.DNNLinearCombinedClassifier(
      config=config,
      linear_feature_columns=categorical_columns,
      dnn_feature_columns=real_valued_columns,
      dnn_hidden_units=hidden_units or [100, 70, 50, 25])
def read_dataset(args, mode):
  batch_size = args['train_batch_size']
  if mode == tf.estimator.ModeKeys.TRAIN:
    input_paths = args['train_data_paths']
  else:
    input_paths = args['eval_data_paths']

  transformed_metadata = metadata_io.read_metadata(
      os.path.join(args['metadata_path'], 'transformed_metadata'))

  return input_fn_maker.build_training_input_fn(
      metadata=transformed_metadata,
      file_pattern=(input_paths[0] if len(input_paths) == 1 else input_paths),
      training_batch_size=batch_size,
      label_keys=[LABEL_COLUMN],
      reader=gzip_reader_fn,
      key_feature_name=KEY_FEATURE_COLUMN,
      randomize_input=(mode != tf.estimator.ModeKeys.EVAL),
      num_epochs=(1 if mode == tf.estimator.ModeKeys.EVAL else None))
def get_experiment(output_dir):
  """Function that creates an experiment http://goo.gl/HcKHlT.

  Args:
    output_dir: The directory where the training output should be written.

  Returns:
    A `tf.contrib.learn.Experiment`.
  """
  columns = feature_columns(args.model_type, vocab_sizes, use_crosses)

  runconfig = tf.contrib.learn.RunConfig()
  cluster = runconfig.cluster_spec
  num_table_shards = max(1, runconfig.num_ps_replicas * 3)
  num_partitions = max(1, 1 + cluster.num_tasks('worker')
                       if cluster and 'worker' in cluster.jobs else 0)

  model_dir = os.path.join(output_dir, MODEL_DIR)
  if args.model_type == LINEAR:
    estimator = tf.contrib.learn.LinearRegressor(
        model_dir=model_dir,
        feature_columns=columns,
        optimizer=tf.contrib.linear_optimizer.SDCAOptimizer(
            example_id_column=KEY_FEATURE_COLUMN,
            symmetric_l2_regularization=args.l2_regularization,
            num_loss_partitions=num_partitions,  # workers
            num_table_shards=num_table_shards))  # ps
  elif args.model_type == DEEP:
    estimator = tf.contrib.learn.DNNRegressor(
        hidden_units=args.hidden_units,
        feature_columns=columns,
        model_dir=model_dir)

  transformed_metadata = metadata_io.read_metadata(
      args.transformed_metadata_path)
  raw_metadata = metadata_io.read_metadata(args.raw_metadata_path)

  serving_input_fn = (
      input_fn_maker.build_parsing_transforming_serving_input_fn(
          raw_metadata,
          args.transform_savedmodel,
          raw_label_keys=[TARGET_FEATURE_COLUMN]))

  export_strategy = tf.contrib.learn.utils.make_export_strategy(
      serving_input_fn,
      exports_to_keep=5,
      default_output_alternative_key=None)

  train_input_fn = get_transformed_reader_input_fn(
      transformed_metadata, args.train_data_paths, args.batch_size,
      tf.contrib.learn.ModeKeys.TRAIN)
  eval_input_fn = get_transformed_reader_input_fn(
      transformed_metadata, args.eval_data_paths, args.batch_size,
      tf.contrib.learn.ModeKeys.EVAL)

  return tf.contrib.learn.Experiment(
      estimator=estimator,
      train_steps=(args.train_steps or
                   args.num_epochs * args.train_set_size // args.batch_size),
      eval_steps=args.eval_steps,
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      export_strategies=export_strategy,
      min_eval_frequency=500)