def _run_batch_prediction(self, output_dir, use_target):
  reglinear.batch_predict(
      training_dir=self._train_output,
      prediction_input_file=(self._csv_eval_filename if use_target
                             else self._csv_predict_filename),
      output_dir=output_dir,
      mode='evaluation' if use_target else 'prediction',
      batch_size=4,
      output_format='csv')

  # check errors file is empty
  errors = file_io.get_matching_files(os.path.join(output_dir, 'errors*'))
  self.assertEqual(len(errors), 1)
  self.assertEqual(os.path.getsize(errors[0]), 0)

  # check predictions files are not empty
  predictions = file_io.get_matching_files(
      os.path.join(output_dir, 'predictions*'))
  self.assertGreater(os.path.getsize(predictions[0]), 0)

  # check the schema is correct
  schema_file = os.path.join(output_dir, 'csv_schema.json')
  self.assertTrue(os.path.isfile(schema_file))
  schema = json.loads(file_io.read_file_to_string(schema_file))
  self.assertEqual(schema[0]['name'], 'key')
  self.assertEqual(schema[1]['name'], 'predicted')
  if use_target:
    self.assertEqual(schema[2]['name'], 'target')
    self.assertEqual(len(schema), 3)
  else:
    self.assertEqual(len(schema), 2)
def stereo_stream(dir):
  print("loading files...")
  left_files = sorted(file_io.get_matching_files("{}/left/*.png".format(dir)))
  right_files = sorted(file_io.get_matching_files("{}/right/*.png".format(dir)))
  norm_files = sorted(file_io.get_matching_files("{}/norms/*.png".format(dir)))
  envmap_files = sorted(file_io.get_matching_files("{}/envmaps/*.hdr".format(dir)))
  bg_files = sorted(file_io.get_matching_files("{}/bg/*.png".format(dir)))
  print("loaded files")

  left_shape = get_input_size(left_files[0])
  right_shape = get_input_size(right_files[0])
  norms_shape = get_input_size(norm_files[0])
  envmap_shape = get_input_size(envmap_files[0])
  bg_shape = get_input_size(bg_files[0])

  assert len(left_files) == len(right_files) == len(envmap_files) == len(norm_files) == len(bg_files)
  assert left_shape == right_shape

  left = tf.data.Dataset.from_tensor_slices(
      tf.convert_to_tensor(left_files, dtype=tf.string))
  right = tf.data.Dataset.from_tensor_slices(
      tf.convert_to_tensor(right_files, dtype=tf.string))
  envmaps = tf.data.Dataset.from_tensor_slices(
      tf.convert_to_tensor(envmap_files, dtype=tf.string))
  norms = tf.data.Dataset.from_tensor_slices(
      tf.convert_to_tensor(norm_files, dtype=tf.string))
  bgs = tf.data.Dataset.from_tensor_slices(
      tf.convert_to_tensor(bg_files, dtype=tf.string))

  print("prepared data size: {}".format(len(left_files)))
  return ((left, left_shape), (right, right_shape), (envmaps, envmap_shape),
          (norms, norms_shape), (bgs, bg_shape))
def latest_checkpoint(checkpoint_dir, latest_filename=None):
  """Finds the filename of latest saved checkpoint file.

  Args:
    checkpoint_dir: Directory where the variables were saved.
    latest_filename: Optional name for the protocol buffer file that
      contains the list of most recent checkpoint filenames.
      See the corresponding argument to `Saver.save()`.

  Returns:
    The full path to the latest checkpoint or `None` if no checkpoint was
    found.
  """
  # Pick the latest checkpoint based on checkpoint state.
  ckpt = get_checkpoint_state(checkpoint_dir, latest_filename)
  if ckpt and ckpt.model_checkpoint_path:
    # Look for either a V2 path or a V1 path, with priority for V2.
    v2_path = _prefix_to_checkpoint_path(ckpt.model_checkpoint_path,
                                         saver_pb2.SaverDef.V2)
    v1_path = _prefix_to_checkpoint_path(ckpt.model_checkpoint_path,
                                         saver_pb2.SaverDef.V1)
    if file_io.get_matching_files(v2_path) or file_io.get_matching_files(
        v1_path):
      return ckpt.model_checkpoint_path
    else:
      logging.error("Couldn't match files for checkpoint %s",
                    ckpt.model_checkpoint_path)
  return None
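A minimal sketch of what the V1/V2 probing above amounts to, using a hypothetical prefix; the `.index` suffix reflects the usual V2 checkpoint naming convention, while V1 is matched on the bare prefix.

# Hypothetical prefix; in the V2 format the .index file marks the checkpoint,
# while the V1 format is matched on the prefix itself.
prefix = '/tmp/train/model.ckpt-1000'
v2_glob = prefix + '.index'
v1_glob = prefix
exists = bool(file_io.get_matching_files(v2_glob) or
              file_io.get_matching_files(v1_glob))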
def run_experiment(args):
  if args.restart_training:
    shutil.rmtree(args.job_dir, ignore_errors=True)

  content_size = style_size = (args.image_size, args.image_size)

  num_content_samples = sum(1 for f in file_io.get_matching_files(args.content_files)
                            for _ in tf.python_io.tf_record_iterator(f))
  num_style_samples = sum(1 for f in file_io.get_matching_files(args.style_files)
                          for _ in tf.python_io.tf_record_iterator(f))

  print("Number of training content samples: " + str(num_content_samples))
  print("Number of training style samples: " + str(num_style_samples))

  model_fn = create_model_fn(data_format=args.data_format)
  loss_model_fn = create_loss_model_fn(weights_path=args.vgg_path,
                                       data_format=args.data_format)
  loss_fn = create_loss_fn(data_format=args.data_format)
  estimator_fn = create_estimator_fn(
      model_fn=model_fn,
      loss_model_fn=loss_model_fn,
      loss_fn=loss_fn,
      data_format=args.data_format)

  config = tf.estimator.RunConfig(
      tf_random_seed=42,
      save_summary_steps=args.log_iter,
      save_checkpoints_steps=args.checkpoint_iter,
      log_step_count_steps=args.log_iter,
      model_dir=args.job_dir)

  params = tf_training.HParams(
      learning_rate=args.learning_rate,
      content_features=args.content_features,
      style_features=args.style_features,
      content_weight=args.content_weight,
      style_weight=args.style_weight)

  estimator = tf.estimator.Estimator(
      model_fn=estimator_fn,
      params=params,
      config=config)

  style_epochs = args.num_epochs * num_content_samples // args.batch_size

  train_inputs_fn = create_inputs_fn(
      content_tfrecords=args.content_files,
      style_tfrecords=args.style_files,
      content_size=content_size,
      style_size=style_size,
      content_epochs=args.num_epochs,
      style_epochs=style_epochs,
      batch_size=args.batch_size,
      shuffle_buffer_size=args.shuffle_buffer_size,
      scope="train_inputs")

  estimator.train(input_fn=train_inputs_fn)
def get_train_eval_files(input_dir):
  """Get preprocessed training and eval files."""
  data_dir = _get_latest_data_dir(input_dir)
  train_pattern = os.path.join(data_dir, 'train*.tfrecord.gz')
  eval_pattern = os.path.join(data_dir, 'eval*.tfrecord.gz')
  train_files = file_io.get_matching_files(train_pattern)
  eval_files = file_io.get_matching_files(eval_pattern)
  return train_files, eval_files
def delete_backup(self):
  """Delete the backup directories.

  Delete the backup directories which should not exist after `fit()`
  successfully finishes.
  """
  for pathname in file_io.get_matching_files(
      self.write_checkpoint_manager._prefix + '*'):
    _delete_file_or_dir(pathname)
  for pathname in file_io.get_matching_files(
      os.path.join(self.write_checkpoint_manager.directory, 'checkpoint')):
    _delete_file_or_dir(pathname)
def train_eval(traindir, evaldir, batchsize, bucket, epochs, outputdir,
               hidden_units, feat_eng_cols, job_dir, learn_rate, dropout,
               **kwargs):
  # define classifier config
  classifier_config = tf.estimator.RunConfig(save_checkpoints_steps=10)
  hidden_units = hidden_units.split(',')
  real_feature_columns, all_feature_columns = get_features(feat_eng_cols)

  optimizer = tf.train.ProximalAdagradOptimizer(
      learning_rate=float(learn_rate),
      l1_regularization_strength=0.1,
      l2_regularization_strength=0.01)

  # define classifier
  classifier = tf.estimator.DNNLinearCombinedClassifier(
      linear_feature_columns=all_feature_columns,
      dnn_feature_columns=real_feature_columns,
      dnn_hidden_units=hidden_units,
      n_classes=len(class_labels),
      label_vocabulary=class_labels,
      model_dir=job_dir,
      config=classifier_config,
      dnn_dropout=float(dropout),
      # dnn_optimizer=optimizer
  )

  # load training and eval files
  traindata = [f for f in file_io.get_matching_files(traindir + '/trajectories.csv*')]
  evaldata = [f for f in file_io.get_matching_files(evaldir + '/trajectories.csv*')]

  # define training and eval input functions
  train_input = lambda: my_input_fn(
      traindata,
      batch_size=batchsize,
      epochs=epochs,
      perform_shuffle=True)
  eval_input = lambda: my_input_fn(
      evaldata,
      perform_shuffle=False,
      epochs=1)

  # define training and eval specs for train_and_evaluate, including the exporter
  train_spec = tf.estimator.TrainSpec(train_input, max_steps=1000)
  exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
  eval_spec = tf.estimator.EvalSpec(eval_input,
                                    name='trajectory-eval',
                                    exporters=[exporter])

  # run training and evaluation
  tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
def delete_backup(self):
  """Delete the backup directories.

  Delete the backup directories which should not exist after `fit()`
  successfully finishes.
  """
  # pylint: disable=protected-access
  for pathname in file_io.get_matching_files(
      self.write_checkpoint_manager._prefix + '*'):
    file_io.delete_recursively(pathname)
  for pathname in file_io.get_matching_files(
      os.path.join(self.write_checkpoint_manager.directory, 'checkpoint')):
    file_io.delete_recursively(pathname)
def _get_training_validation_testing_dataset(bottleneck_dir, label_table,
                                             testing_dataset_size):
  """Gets the dataset for training, validation and testing.

  Args:
    bottleneck_dir: Directory containing the bottleneck TFRecords.
    label_table: A tensorflow Table of all labels found in dataset.
    testing_dataset_size: The number of images in the testing dataset.

  Returns:
    (training_dataset, validation_dataset, testing_dataset) tuple.
      training_dataset is a Dataset containing training images.
      validation_dataset is a Dataset containing validation images.
      testing_dataset is a Dataset containing testing images.
  """

  def _full_tfrecord_parser(serialized_example):
    """Parses a tf.Example into (image, label index, bottleneck) Tensors."""
    features = {
        'image_path': tf.FixedLenFeature((), tf.string),
        'label': tf.FixedLenFeature((), tf.string),
        'bottleneck': tf.FixedLenFeature([INCEPTION_V3_BOTTLENECK_SIZE],
                                         tf.float32),
    }
    example = tf.parse_single_example(serialized_example, features=features)
    label_index = label_table.lookup(example['label'])
    return example['image_path'], label_index, example['bottleneck']

  training_bottleneck_files = file_io.get_matching_files(
      os.path.join(bottleneck_dir, constants.TRAINING_DATASET + '*'))
  training_dataset = tf.data.TFRecordDataset(training_bottleneck_files).map(
      _full_tfrecord_parser).repeat().batch(FLAGS.train_batch_size)

  validation_bottleneck_files = file_io.get_matching_files(
      os.path.join(bottleneck_dir, constants.VALIDATION_DATASET + '*'))
  validation_dataset = tf.data.TFRecordDataset(
      validation_bottleneck_files).map(_full_tfrecord_parser).repeat().batch(
          FLAGS.validation_batch_size)

  testing_bottleneck_files = file_io.get_matching_files(
      os.path.join(bottleneck_dir, constants.TESTING_DATASET + '*'))
  testing_dataset = tf.data.TFRecordDataset(testing_bottleneck_files).map(
      _full_tfrecord_parser).batch(testing_dataset_size)

  return training_dataset, validation_dataset, testing_dataset
def begin(self):
  """Restore parameters if a pre-trained model is available and we haven't
  trained previously.
  """
  if not self.initialized:
    # checkpoint = tf.train.latest_checkpoint(self.model_path)
    all_checkpoints = file_io.get_matching_files(
        os.path.join(self.model_path, 'model.ckpt-*.index'))
    if not all_checkpoints:
      raise ValueError('No checkpoint files found matching %s.' %
                       (self.model_path + '*'))
    all_checkpoints = [x.replace('.index', '') for x in all_checkpoints]
    all_checkpoints = sorted(all_checkpoints,
                             key=lambda x: int(x.split('-')[-1]))
    checkpoint = all_checkpoints[-1]

    if checkpoint is None:
      logging.info('No pre-trained model is available at %s, '
                   'training from scratch.' % self.model_path)
    else:
      logging.info('Pre-trained model {0} found in {1} - warmstarting.'.format(
          checkpoint, self.model_path))
      tf.train.warm_start(checkpoint)
    self.initialized = True
def _batch_predict(args, cell):
  if args['cloud_config'] and not args['cloud']:
    raise ValueError('"cloud_config" is provided but no "--cloud". '
                     'Do you want local run or cloud run?')

  if args['cloud']:
    parts = args['model'].split('.')
    if len(parts) != 2:
      raise ValueError('Invalid model name for cloud prediction. Use "model.version".')

    version_name = ('projects/%s/models/%s/versions/%s' %
                    (Context.default().project_id, parts[0], parts[1]))

    cloud_config = args['cloud_config'] or {}
    job_id = cloud_config.pop('job_id', None)
    job_request = {
        'version_name': version_name,
        'data_format': 'TEXT',
        'input_paths': file_io.get_matching_files(args['prediction_data']['csv']),
        'output_path': args['output'],
    }
    job_request.update(cloud_config)
    job = datalab_ml.Job.submit_batch_prediction(job_request, job_id)
    _show_job_link(job)
  else:
    print('local prediction...')
    _local_predict.local_batch_predict(args['model'],
                                       args['prediction_data']['csv'],
                                       args['output'],
                                       args['format'],
                                       args['batch_size'])
    print('done.')
def create_object_test():
  """Verifies file_io's object manipulation methods."""
  starttime = int(round(time.time() * 1000))
  dir_name = "%s/tf_gcs_test_%s" % (FLAGS.gcs_bucket_url, starttime)
  print("Creating dir %s." % dir_name)
  file_io.create_dir(dir_name)

  # Create a file in this directory.
  file_name = "%s/test_file.txt" % dir_name
  print("Creating file %s." % file_name)
  file_io.write_string_to_file(file_name, "test file creation.")

  list_files_pattern = "%s/test_file*.txt" % dir_name
  print("Getting files matching pattern %s." % list_files_pattern)
  files_list = file_io.get_matching_files(list_files_pattern)
  print(files_list)

  assert len(files_list) == 1
  assert files_list[0] == file_name

  # Cleanup test files.
  print("Deleting file %s." % file_name)
  file_io.delete_file(file_name)

  # Delete directory.
  print("Deleting directory %s." % dir_name)
  file_io.delete_recursively(dir_name)
def _run_batch_prediction(self):
  """Run batch prediction using the cloudml engine prediction service.

  There is no local version of this step as it's the last step.
  """
  job_name = 'test_mltoolbox_batchprediction_%s' % uuid.uuid4().hex
  cmd = ['gcloud ml-engine jobs submit prediction ' + job_name,
         '--data-format=TEXT',
         '--input-paths=' + self._csv_predict_filename,
         '--output-path=' + self._prediction_output,
         '--model-dir=' + os.path.join(self._train_output, 'model'),
         '--runtime-version=1.0',
         '--region=us-central1']
  self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
  subprocess.check_call(' '.join(cmd), shell=True)  # async call.
  subprocess.check_call('gcloud ml-engine jobs stream-logs ' + job_name,
                        shell=True)

  # Check that there were no errors.
  error_files = file_io.get_matching_files(
      os.path.join(self._prediction_output, 'prediction.errors_stats*'))
  self.assertEqual(1, len(error_files))
  error_str = file_io.read_file_to_string(error_files[0])
  self.assertEqual('', error_str)
def input_fn(files, num_epochs=None, shuffle=False, shared_name=None):
  # get file names
  if file_io.is_directory(files[0]):
    file_names = file_io.get_matching_files(files[0] + '/*tfrecord')
  else:
    file_names = files

  # shuffle if required
  if shuffle:
    shuffle_fn(file_names)

  # queue with the file names that can be shared amongst workers during training
  filename_queue = tf.FIFOQueue(100, tf.string, shared_name=shared_name)
  enque_op = filename_queue.enqueue_many(
      [tf.train.limit_epochs(file_names, num_epochs)])
  close_op = filename_queue.close(cancel_pending_enqueues=True)

  # create queue runner and add it to queue runners
  qr = tf.train.QueueRunner(
      filename_queue, [enque_op], close_op,
      queue_closed_exception_types=(tf.errors.OutOfRangeError,
                                    tf.errors.CancelledError))
  tf.train.add_queue_runner(qr)

  # read example from file
  reader = tf.TFRecordReader()
  _, example = reader.read(filename_queue)

  # parse example
  image, ground_truth, example_name = parse_example(example)

  return image, ground_truth, example_name
def get_checkpoint(self, last_global_step_val):
  if FLAGS.start_eval_from_ckpt:
    files = file_io.get_matching_files(
        join(self.train_dir, 'model.ckpt-*.index'))
    # No files
    if not files:
      return None, None
    files = sorted(files, key=self._get_global_step_from_ckpt)

    start_at = FLAGS.start_eval_from_ckpt
    if str(start_at).isdigit():
      start_at = int(start_at)
      files = list(
          filter(lambda x: self._get_global_step_from_ckpt(x) > start_at,
                 files))

    for filename in files:
      filename_global_step = self._get_global_step_from_ckpt(filename)
      if last_global_step_val < filename_global_step:
        return filename[:-6], filename_global_step
    return None, None
  else:
    latest_checkpoint = tf.train.latest_checkpoint(self.train_dir)
    if latest_checkpoint is None:
      return None, None
    global_step = self._get_global_step_from_ckpt(latest_checkpoint)
    return latest_checkpoint, global_step
def __init__(self, mode, batch_size, img_size, preprocessfunc=None):
  self.mode = mode
  self.batch_size = batch_size
  self.img_size = img_size
  self.preprocessfunc = preprocessfunc
  self.data = []

  self.file_location = join(GCP_paths['data'], mode)
  print('Loading data from {}'.format(self.file_location))

  for fpath in file_io.get_matching_files(
      join(self.file_location, '*.jpg')):
    fname = fpath.split('/')[-1]
    if self.mode != 'test':
      split = fname[0:-4].split('_')
      labels = split[3][1:-1].split(',')
      labels = list(map(int, labels))
      self.data.append({
          'id': int(split[1]),
          'labels': labels,
          'file': fpath
      })
    else:
      self.data.append({'id': fname[0:-4], 'file': fpath})

  self.data = pd.DataFrame(self.data, columns=['id', 'labels', 'file'])
  self.n_samples = len(self.data)
  self.n_batches = int(np.ceil(self.n_samples / self.batch_size))
  print('SequenceFromGCP <{}>: {} samples'.format(self.mode, self.n_samples))
def input_fn(self, mode: tf.estimator.ModeKeys, params, data_dir):
  file_paths = file_io.get_matching_files(
      os.path.join(data_dir, "part-r-*"))  # returns a list of matching paths
  data_set = tf.data.TFRecordDataset(file_paths, buffer_size=800 * 1024)
  batch_size = params["batch_size"]

  def parse(raw):
    context_dic, seq_dic = tf.parse_single_sequence_example(
        serialized=raw,
        context_features=self.conttext_spec,
        sequence_features=self.sequence_spec)
    label = context_dic.pop(params["item_relevance"])
    label = label * 3
    # seq_dic[self.params["user_car_serial"]] = context_dic[self.params["user_car_serial"]]
    for k, v in six.iteritems(seq_dic):
      context_dic[k] = v
    return context_dic, label

  if mode == tf.estimator.ModeKeys.TRAIN:
    data_set = data_set.repeat(None).map(parse).batch(
        batch_size)  # .prefetch(buffer_size=None)
  elif mode == tf.estimator.ModeKeys.EVAL:
    data_set = data_set.repeat(None) \
        .take(3000).map(parse).batch(60)  # .prefetch(buffer_size=None)

  it = data_set.make_one_shot_iterator()
  feature, label = it.get_next()
  label = tf.sparse_tensor_to_dense(label, default_value=-1)
  return feature, label
def main(argv):
  if FLAGS.output_type not in VALID_OUTPUT_TYPES:
    raise ValueError('output_type "%s" not in allowed types: %s' %
                     (FLAGS.output_type, VALID_OUTPUT_TYPES))

  # Exclude argv[0], which is the current binary.
  patterns = argv[1:]
  if not patterns:
    raise ValueError('PNG file glob(s) must be specified')
  input_paths = []
  for pattern in patterns:
    pattern_paths = file_io.get_matching_files(pattern)
    if not pattern_paths:
      raise ValueError('Pattern "%s" failed to match any files' % pattern)
    input_paths.extend(pattern_paths)

  start = time.time()
  output = run(
      input_paths,
      FLAGS.glyphs_saved_model,
      output_notesequence=FLAGS.output_type == 'NoteSequence')
  end = time.time()
  sys.stderr.write('OMR elapsed time: %.2f\n' % (end - start))

  if FLAGS.output_type == 'MusicXML':
    output_bytes = conversions.score_to_musicxml(output)
  else:
    if FLAGS.text_format:
      output_bytes = text_format.MessageToString(output).encode('utf-8')
    else:
      output_bytes = output.SerializeToString()

  file_io.write_string_to_file(FLAGS.output, output_bytes)
def testAPIBackwardsCompatibility(self):
  # Extract all API stuff.
  visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

  public_api_visitor = public_api.PublicAPIVisitor(visitor)
  public_api_visitor.do_not_descend_map[''].append('contrib')
  traverse.traverse(tf, public_api_visitor)

  proto_dict = visitor.GetProtos()

  # Read all golden files.
  expression = os.path.join(
      resource_loader.get_root_dir_with_all_resources(),
      _KeyToFilePath('*'))
  golden_file_list = file_io.get_matching_files(expression)

  def _ReadFileToProto(filename):
    """Read a filename, create a protobuf from its contents."""
    ret_val = api_objects_pb2.TFAPIObject()
    text_format.Merge(file_io.read_file_to_string(filename), ret_val)
    return ret_val

  golden_proto_dict = {
      _FileNameToKey(filename): _ReadFileToProto(filename)
      for filename in golden_file_list
  }

  # Diff them. Do not fail if called with update.
  # If the test is run to update goldens, only report diffs but do not fail.
  self._AssertProtoDictEquals(
      golden_proto_dict,
      proto_dict,
      verbose=FLAGS.verbose_diffs,
      update_goldens=FLAGS.update_goldens)
def _get_list_checkpoint(self, n_export, model_dir):
  """Get the checkpoints that we want to export.

  Args:
    n_export: Number of models to export.
    model_dir: Directory containing the checkpoints.

  Returns:
    List of checkpoint paths. If n_export == 1, we take only the last
    checkpoint. Otherwise, we consider the list of steps for which we have a
    checkpoint, and choose n_export checkpoints such that their steps are as
    equidistant as possible.
  """
  checkpoints = file_io.get_matching_files(
      os.path.join(model_dir, 'model.ckpt-*.index'))
  checkpoints = [x.replace('.index', '') for x in checkpoints]
  checkpoints = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))

  if n_export == 1:
    return [checkpoints[-1]]

  # We want to cover a distance of (len(checkpoints) - 1): for 3 points, we
  # have a distance of 2, with a number of points of (n_export - 1), because
  # 1 point is set at the end.
  step = float(len(checkpoints) - 1) / (n_export - 1)
  if step <= 1:  # Fewer checkpoints available than the desired number.
    return checkpoints

  checkpoints_to_export = [
      checkpoints[int(i * step)] for i in range(n_export - 1)
  ]
  checkpoints_to_export.append(checkpoints[-1])

  return checkpoints_to_export
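A self-contained sketch of the selection arithmetic above on made-up checkpoint names: with 7 checkpoints and n_export=3, step is 6 / 2 = 3.0, so indices 0 and 3 are taken and the last checkpoint is appended.

# Toy data only; mirrors the index arithmetic in _get_list_checkpoint above.
checkpoints = ['ckpt-%d' % s for s in (10, 20, 30, 40, 50, 60, 70)]
n_export = 3
step = float(len(checkpoints) - 1) / (n_export - 1)
selected = [checkpoints[int(i * step)] for i in range(n_export - 1)]
selected.append(checkpoints[-1])
assert selected == ['ckpt-10', 'ckpt-40', 'ckpt-70']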
def _load_tf_custom_op(model_path):
  """Loads a custom TF OP (in .so format) from /assets.extra directory."""
  assets_dir = os.path.join(model_path, _CUSTOM_OP_DIRECTORY_NAME)
  if file_io.is_directory(assets_dir):
    custom_ops_pattern = os.path.join(assets_dir, _CUSTOM_OP_SUFFIX)
    for custom_op_path_original in file_io.get_matching_files(
        custom_ops_pattern):
      logging.info("Found custom op file: %s", custom_op_path_original)

      if custom_op_path_original.startswith("gs://"):
        if not os.path.isdir(_CUSTOM_OP_LOCAL_DIR):
          os.makedirs(_CUSTOM_OP_LOCAL_DIR)
        custom_op_path_local = os.path.join(
            _CUSTOM_OP_LOCAL_DIR, os.path.basename(custom_op_path_original))
        logging.info("Copying custom op from: %s to: %s",
                     custom_op_path_original, custom_op_path_local)
        file_io.copy(custom_op_path_original, custom_op_path_local, True)
      else:
        custom_op_path_local = custom_op_path_original

      try:
        import tensorflow as tf  # pylint: disable=g-import-not-at-top
        logging.info("Loading custom op: %s", custom_op_path_local)
        logging.info("TF Version: %s", tf.__version__)
        tf.load_op_library(custom_op_path_local)
      except RuntimeError as e:
        logging.exception(
            "Failed to load custom op: %s with error: %s. Prediction "
            "will likely fail due to missing operations.",
            custom_op_path_local, e)
def _get_image_label_info(bottleneck_dir):
  # type: (str) -> (Dict[str, int], List[str])
  """Calculates the per-dataset image counts and unique labels in the dataset.

  This function parses the TFRecords found in bottleneck_dir and reads only
  the label and dataset features.

  Args:
    bottleneck_dir: Directory containing the bottleneck TFRecords.

  Returns:
    dataset_to_image_count: Mapping from dataset name to its image count.
    label_list: List of labels found in dataset.
  """
  labels = OrderedDict()
  dataset_to_image_count = defaultdict(int)
  bottleneck_files = file_io.get_matching_files(
      os.path.join(bottleneck_dir, '*'))
  for bottleneck_file in bottleneck_files:
    for it in tf.compat.v1.io.tf_record_iterator(bottleneck_file):
      example = tf.train.Example()
      example.ParseFromString(it)
      label = example.features.feature['label'].bytes_list.value[0]
      labels[label] = True
      dataset = example.features.feature['dataset'].bytes_list.value[0]
      dataset_to_image_count[dataset] += 1
  label_list = []
  for key in labels:
    label_list.append(key)
  return dataset_to_image_count, label_list
def _GetBaseApiMap(self):
  """Get a map from graph op name to its base ApiDef.

  Returns:
    Dictionary mapping graph op name to corresponding ApiDef.
  """
  # Convert base ApiDef in Multiline format to Proto format.
  converted_base_api_dir = os.path.join(
      test.get_temp_dir(), 'temp_base_api_defs')
  subprocess.check_call(
      [os.path.join(resource_loader.get_root_dir_with_all_resources(),
                    _CONVERT_FROM_MULTILINE_SCRIPT),
       _BASE_API_DIR, converted_base_api_dir])

  name_to_base_api_def = {}
  base_api_files = file_io.get_matching_files(
      os.path.join(converted_base_api_dir, 'api_def_*.pbtxt'))
  for base_api_file in base_api_files:
    if file_io.file_exists(base_api_file):
      api_defs = api_def_pb2.ApiDefs()
      text_format.Merge(
          file_io.read_file_to_string(base_api_file), api_defs)
      for api_def in api_defs.op:
        name_to_base_api_def[api_def.graph_op_name] = api_def
  return name_to_base_api_def
def _load_metadata_files(self):
  """Load and parse metadata files in the dump root.

  Check that all metadata files have a common tfdbg_run_id, and raise
  a ValueError if their tfdbg_run_ids differ.

  Returns:
    A list of metadata file paths in ascending order of their starting
      wall_time timestamp.
  """
  metadata_paths = file_io.get_matching_files(
      os.path.join(self._dump_root, "*%s" % self._METADATA_SUFFIX))
  if not metadata_paths:
    raise ValueError("Cannot find any tfdbg metadata file in directory: %s" %
                     self._dump_root)
  wall_times = []
  run_ids = []
  tensorflow_versions = []
  file_versions = []
  for metadata_path in metadata_paths:
    reader = tf_record.tf_record_random_reader(metadata_path)
    try:
      record = reader.read(0)[0]
      debug_event = debug_event_pb2.DebugEvent.FromString(record)
      wall_times.append(debug_event.wall_time)
      run_ids.append(debug_event.debug_metadata.tfdbg_run_id)
      tensorflow_versions.append(
          debug_event.debug_metadata.tensorflow_version)
      file_versions.append(debug_event.debug_metadata.file_version)
    finally:
      reader.close()
  self._starting_wall_time = wall_times[0]
  self._tfdbg_run_id = run_ids[0]
  self._tensorflow_version = tensorflow_versions[0]
  self._file_version = file_versions[0]
  if len(metadata_paths) == 1:
    # Fast path for a common case (only one DebugEvent file set.)
    return metadata_paths

  num_no_id = len([run_id for run_id in run_ids if not run_id])
  if num_no_id:
    paths_without_run_id = [
        metadata_path
        for metadata_path, run_id in zip(metadata_paths, run_ids)
        if not run_id
    ]
    raise ValueError(
        "Found %d tfdbg metadata files and %d of them do not "
        "have tfdbg run ids. The metadata files without run ids are: %s" %
        (len(run_ids), num_no_id, paths_without_run_id))
  elif len(set(run_ids)) != 1:
    raise ValueError(
        "Unexpected: Found multiple (%d) tfdbg2 runs in directory %s" %
        (len(set(run_ids)), self._dump_root))
  # Return the metadata files in ascending order of their timestamps.
  paths_and_timestamps = sorted(
      zip(metadata_paths, wall_times), key=lambda t: t[1])
  self._starting_wall_time = paths_and_timestamps[0][1]
  return [path[0] for path in paths_and_timestamps]
def get_all_checkpoints(output_dir):
  """Returns all checkpoint paths in `output_dir` whose files still exist."""
  ckpt = cm.get_checkpoint_state(output_dir, None)
  res = []
  if not ckpt:
    return None
  for path in ckpt.all_model_checkpoint_paths:
    # Look for either a V2 path or a V1 path, with priority for V2.
    v2_path = cm._prefix_to_checkpoint_path(path, saver_pb2.SaverDef.V2)
    v1_path = cm._prefix_to_checkpoint_path(path, saver_pb2.SaverDef.V1)
    if file_io.get_matching_files(v2_path) or file_io.get_matching_files(
        v1_path):
      res.append(path)
    else:
      tf.logging.error("Couldn't match files for checkpoint %s", path)
  return res
def raw_training_input_fn():
  """Training input function that reads raw data and applies transforms."""

  if isinstance(raw_data_file_pattern, six.string_types):
    filepath_list = [raw_data_file_pattern]
  else:
    filepath_list = raw_data_file_pattern

  files = []
  for path in filepath_list:
    files.extend(file_io.get_matching_files(path))

  filename_queue = tf.train.string_input_producer(
      files, num_epochs=num_epochs, shuffle=randomize_input)

  csv_id, csv_lines = tf.TextLineReader().read_up_to(filename_queue,
                                                     training_batch_size)

  queue_capacity = (reader_num_threads + 3) * training_batch_size + min_after_dequeue
  if randomize_input:
    _, batch_csv_lines = tf.train.shuffle_batch(
        tensors=[csv_id, csv_lines],
        batch_size=training_batch_size,
        capacity=queue_capacity,
        min_after_dequeue=min_after_dequeue,
        enqueue_many=True,
        num_threads=reader_num_threads,
        allow_smaller_final_batch=allow_smaller_final_batch)
  else:
    _, batch_csv_lines = tf.train.batch(
        tensors=[csv_id, csv_lines],
        batch_size=training_batch_size,
        capacity=queue_capacity,
        enqueue_many=True,
        num_threads=reader_num_threads,
        allow_smaller_final_batch=allow_smaller_final_batch)

  csv_header, record_defaults = csv_header_and_defaults(features, schema, stats,
                                                        keep_target=True)
  parsed_tensors = tf.decode_csv(batch_csv_lines, record_defaults,
                                 name='csv_to_tensors')
  raw_features = dict(zip(csv_header, parsed_tensors))

  transform_fn = make_preprocessing_fn(analysis_output_dir, features,
                                       keep_target=True)
  transformed_tensors = transform_fn(raw_features)

  # Expand the dims of non-sparse tensors. This is needed by tf.learn.
  transformed_features = {}
  for k, v in six.iteritems(transformed_tensors):
    if isinstance(v, tf.Tensor) and v.get_shape().ndims == 1:
      transformed_features[k] = tf.expand_dims(v, -1)
    else:
      transformed_features[k] = v

  # Remove the target tensor, and return it directly.
  target_name = get_target_name(features)
  if not target_name or target_name not in transformed_features:
    raise ValueError('Cannot find target transform in features')

  transformed_target = transformed_features.pop(target_name)

  return transformed_features, transformed_target
def __init__(self, dump_root):
  if not file_io.is_directory(dump_root):
    raise ValueError("Specified dump_root is not a directory: %s" % dump_root)
  metadata_paths = file_io.get_matching_files(
      os.path.join(dump_root, "*.metadata"))
  if not metadata_paths:
    raise ValueError("Cannot find any metadata file in directory: %s" %
                     dump_root)
  elif len(metadata_paths) > 1:
    raise ValueError(
        "Unexpected: Found multiple (%d) metadata in directory: %s" %
        (len(metadata_paths), dump_root))
  self._metadata_path = compat.as_bytes(metadata_paths[0])
  self._metadata_reader = None

  prefix = metadata_paths[0][:-len(".metadata")]
  self._source_files_path = compat.as_bytes("%s.source_files" % prefix)
  self._stack_frames_path = compat.as_bytes("%s.stack_frames" % prefix)
  self._graphs_path = compat.as_bytes("%s.graphs" % prefix)
  self._execution_path = compat.as_bytes("%s.execution" % prefix)
  self._graph_execution_traces_path = compat.as_bytes(
      "%s.graph_execution_traces" % prefix)
  self._readers = dict()  # A map from file path to reader.
  # A map from file path to current reading offset.
  self._reader_offsets = dict()
  # Lock for reader creation.
  self._readers_lock = threading.Lock()
  # Locks for read operation on individual readers.
  self._reader_read_locks = dict()

  self._offsets = dict()
def checkBackwardsCompatibility(self, root, golden_file_pattern, api_version):
  # Extract all API stuff.
  visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

  public_api_visitor = public_api.PublicAPIVisitor(visitor)
  public_api_visitor.do_not_descend_map['tf'].append('contrib')
  public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
  traverse.traverse(root, public_api_visitor)

  proto_dict = visitor.GetProtos()

  # Read all golden files.
  golden_file_list = file_io.get_matching_files(golden_file_pattern)

  def _ReadFileToProto(filename):
    """Read a filename, create a protobuf from its contents."""
    ret_val = api_objects_pb2.TFAPIObject()
    text_format.Merge(file_io.read_file_to_string(filename), ret_val)
    return ret_val

  golden_proto_dict = {
      _FileNameToKey(filename): _ReadFileToProto(filename)
      for filename in golden_file_list
  }

  # Diff them. Do not fail if called with update.
  # If the test is run to update goldens, only report diffs but do not fail.
  self._AssertProtoDictEquals(
      golden_proto_dict,
      proto_dict,
      verbose=FLAGS.verbose_diffs,
      update_goldens=FLAGS.update_goldens,
      api_version=api_version)
def testAPIBackwardsCompatibility(self):
  # Extract all API stuff.
  visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

  public_api_visitor = public_api.PublicAPIVisitor(visitor)
  public_api_visitor.do_not_descend_map['tf'].append('contrib')
  public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
  traverse.traverse(tf, public_api_visitor)

  proto_dict = visitor.GetProtos()

  # Read all golden files.
  expression = os.path.join(
      resource_loader.get_root_dir_with_all_resources(),
      _KeyToFilePath('*'))
  golden_file_list = file_io.get_matching_files(expression)

  def _ReadFileToProto(filename):
    """Read a filename, create a protobuf from its contents."""
    ret_val = api_objects_pb2.TFAPIObject()
    text_format.Merge(file_io.read_file_to_string(filename), ret_val)
    return ret_val

  golden_proto_dict = {
      _FileNameToKey(filename): _ReadFileToProto(filename)
      for filename in golden_file_list
  }

  # Diff them. Do not fail if called with update.
  # If the test is run to update goldens, only report diffs but do not fail.
  self._AssertProtoDictEquals(
      golden_proto_dict,
      proto_dict,
      verbose=FLAGS.verbose_diffs,
      update_goldens=FLAGS.update_goldens)
def _checkBackwardsCompatibility(self, root, golden_file_patterns, api_version,
                                 additional_private_map=None,
                                 omit_golden_symbols_map=None):
  # Extract all API stuff.
  visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

  public_api_visitor = public_api.PublicAPIVisitor(visitor)
  public_api_visitor.private_map['tf'].append('contrib')
  if api_version == 2:
    public_api_visitor.private_map['tf'].append('enable_v2_behavior')

  public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
  # Do not descend into these numpy classes because their signatures may be
  # different between internal and OSS.
  public_api_visitor.do_not_descend_map['tf.experimental.numpy'] = [
      'bool_', 'complex_', 'complex128', 'complex64', 'float_', 'float16',
      'float32', 'float64', 'inexact', 'int_', 'int16', 'int32', 'int64',
      'int8', 'object_', 'string_', 'uint16', 'uint32', 'uint64', 'uint8',
      'unicode_', 'iinfo'
  ]
  if FLAGS.only_test_core_api:
    public_api_visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
  if additional_private_map:
    public_api_visitor.private_map.update(additional_private_map)

  traverse.traverse(root, public_api_visitor)
  proto_dict = visitor.GetProtos()

  # Read all golden files.
  golden_file_list = file_io.get_matching_files(golden_file_patterns)
  if FLAGS.only_test_core_api:
    golden_file_list = _FilterNonCoreGoldenFiles(golden_file_list)

  def _ReadFileToProto(filename):
    """Read a filename, create a protobuf from its contents."""
    ret_val = api_objects_pb2.TFAPIObject()
    text_format.Merge(file_io.read_file_to_string(filename), ret_val)
    return ret_val

  golden_proto_dict = {
      _FileNameToKey(filename): _ReadFileToProto(filename)
      for filename in golden_file_list
  }
  golden_proto_dict = _FilterGoldenProtoDict(golden_proto_dict,
                                             omit_golden_symbols_map)

  # Diff them. Do not fail if called with update.
  # If the test is run to update goldens, only report diffs but do not fail.
  self._AssertProtoDictEquals(
      golden_proto_dict,
      proto_dict,
      verbose=FLAGS.verbose_diffs,
      update_goldens=FLAGS.update_goldens,
      api_version=api_version)
def _run_transform(self):
  """Runs DataFlow for making tf.example files.

  Only the train file uses DataFlow; the eval file runs beam locally to save
  time.
  """
  cloud = True
  extra_args = []
  if cloud:
    extra_args = ['--cloud',
                  '--job-name=test-mltoolbox-df-%s' % uuid.uuid4().hex,
                  '--project-id=%s' % self._get_default_project_id(),
                  '--num-workers=3']

  cmd = ['python %s' % os.path.join(CODE_PATH, 'transform.py'),
         '--csv-file-pattern=' + self._csv_train_filename,
         '--output-dir-from-analysis-step=' + self._analysis_output,
         '--output-filename-prefix=features_train',
         '--output-dir=' + self._transform_output,
         '--shuffle'] + extra_args

  self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
  subprocess.check_call(' '.join(cmd), shell=True)

  # Don't waste time running a 2nd DF job, run it locally.
  cmd = ['python %s' % os.path.join(CODE_PATH, 'transform.py'),
         '--csv-file-pattern=' + self._csv_eval_filename,
         '--output-dir-from-analysis-step=' + self._analysis_output,
         '--output-filename-prefix=features_eval',
         '--output-dir=' + self._transform_output]

  self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
  subprocess.check_call(' '.join(cmd), shell=True)

  # Check the files were made.
  train_files = file_io.get_matching_files(
      os.path.join(self._transform_output, 'features_train*'))
  eval_files = file_io.get_matching_files(
      os.path.join(self._transform_output, 'features_eval*'))
  self.assertNotEqual([], train_files)
  self.assertNotEqual([], eval_files)
def load_dataset(directory):
  files = gcsfile.get_matching_files(directory + "/*")
  labels = list(
      map(lambda filename: int(os.path.basename(filename)[0:1] == '1'), files))
  boxes = tf.zeros(shape=[len(files), 4])
  return tf.contrib.data.Dataset.from_tensor_slices(
      (tf.constant(files), tf.constant(labels), boxes)), len(files)
def testGetMatchingFiles(self):
  dir_path = os.path.join(self._base_dir, "temp_dir")
  file_io.create_dir(dir_path)
  files = ["file1.txt", "file2.txt", "file3.txt"]
  for name in files:
    file_path = os.path.join(dir_path, name)
    file_io.FileIO(file_path, mode="w").write("testing")
  expected_match = [os.path.join(dir_path, name) for name in files]
  self.assertItemsEqual(
      file_io.get_matching_files(os.path.join(dir_path, "file*.txt")),
      expected_match)
  self.assertItemsEqual(file_io.get_matching_files(tuple()), [])
  files_subset = [
      os.path.join(dir_path, files[0]), os.path.join(dir_path, files[2])
  ]
  self.assertItemsEqual(
      file_io.get_matching_files(files_subset), files_subset)
  file_io.delete_recursively(dir_path)
  self.assertFalse(file_io.file_exists(os.path.join(dir_path, "file3.txt")))
def _run_transform(self):
  """Runs DataFlow for making tf.example files.

  Only the train file uses DataFlow; the eval file runs beam locally to save
  time.
  """
  cloud = True
  extra_args = []
  if cloud:
    extra_args = ['--cloud',
                  '--job-name=test-mltoolbox-df-%s' % uuid.uuid4().hex,
                  '--project-id=%s' % self._get_default_project_id(),
                  '--num-workers=3']

  cmd = ['python %s' % os.path.join(CODE_PATH, 'transform.py'),
         '--csv=' + self._csv_train_filename,
         '--analysis=' + self._analysis_output,
         '--prefix=features_train',
         '--output=' + self._transform_output,
         '--shuffle'] + extra_args

  self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
  subprocess.check_call(' '.join(cmd), shell=True)

  # Don't waste time running a 2nd DF job, run it locally.
  cmd = ['python %s' % os.path.join(CODE_PATH, 'transform.py'),
         '--csv=' + self._csv_eval_filename,
         '--analysis=' + self._analysis_output,
         '--prefix=features_eval',
         '--output=' + self._transform_output]

  self._logger.debug('Running subprocess: %s \n\n' % ' '.join(cmd))
  subprocess.check_call(' '.join(cmd), shell=True)

  # Check the files were made.
  train_files = file_io.get_matching_files(
      os.path.join(self._transform_output, 'features_train*'))
  eval_files = file_io.get_matching_files(
      os.path.join(self._transform_output, 'features_eval*'))
  self.assertNotEqual([], train_files)
  self.assertNotEqual([], eval_files)
def testGetMatchingFiles(self):
  dir_path = os.path.join(self._base_dir, "temp_dir")
  file_io.create_dir(dir_path)
  files = ["file1.txt", "file2.txt", "file3.txt"]
  for name in files:
    file_path = os.path.join(dir_path, name)
    file_io.write_string_to_file(file_path, "testing")
  expected_match = [os.path.join(dir_path, name) for name in files]
  self.assertItemsEqual(
      file_io.get_matching_files(os.path.join(dir_path, "file*.txt")),
      expected_match)
  file_io.delete_recursively(dir_path)
  self.assertFalse(file_io.file_exists(os.path.join(dir_path, "file3.txt")))
def checkpoint_exists(checkpoint_prefix):
  """Checks whether a V1 or V2 checkpoint exists with the specified prefix.

  This is the recommended way to check if a checkpoint exists, since it takes
  into account the naming difference between V1 and V2 formats.

  Args:
    checkpoint_prefix: the prefix of a V1 or V2 checkpoint, with V2 taking
      priority.  Typically the result of `Saver.save()` or that of
      `tf.train.latest_checkpoint()`, regardless of sharded/non-sharded or
      V1/V2.

  Returns:
    A bool, true iff a checkpoint referred to by `checkpoint_prefix` exists.
  """
  pathname = _prefix_to_checkpoint_path(checkpoint_prefix,
                                        saver_pb2.SaverDef.V2)
  if file_io.get_matching_files(pathname):
    return True
  elif file_io.get_matching_files(checkpoint_prefix):
    return True
  else:
    return False
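A hedged usage sketch; the directory is a placeholder, and `saver` and `sess` are assumed to already exist in the caller.

# Illustrative only: probe the most recent checkpoint prefix before restoring.
prefix = tf.train.latest_checkpoint('/tmp/my_train_dir')  # placeholder path
if prefix and checkpoint_exists(prefix):
  saver.restore(sess, prefix)  # assumes a tf.train.Saver and a live session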
def testNewAPIBackwardsCompatibility(self):
  # Extract all API stuff.
  visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

  public_api_visitor = public_api.PublicAPIVisitor(visitor)
  public_api_visitor.do_not_descend_map['tf'].append('contrib')
  public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
  # TODO(annarev): Make slide_dataset available in API.
  public_api_visitor.private_map['tf'] = ['slide_dataset']
  traverse.traverse(api, public_api_visitor)

  proto_dict = visitor.GetProtos()

  # Read all golden files.
  expression = os.path.join(
      resource_loader.get_root_dir_with_all_resources(),
      _KeyToFilePath('*'))
  golden_file_list = file_io.get_matching_files(expression)

  def _ReadFileToProto(filename):
    """Read a filename, create a protobuf from its contents."""
    ret_val = api_objects_pb2.TFAPIObject()
    text_format.Merge(file_io.read_file_to_string(filename), ret_val)
    return ret_val

  golden_proto_dict = {
      _FileNameToKey(filename): _ReadFileToProto(filename)
      for filename in golden_file_list
  }

  # user_ops is an empty module. It is currently available in TensorFlow API
  # but we don't keep empty modules in the new API.
  # We delete user_ops from golden_proto_dict to make sure assert passes
  # when diffing new API against goldens.
  # TODO(annarev): remove user_ops from goldens once we switch to new API.
  tf_module = golden_proto_dict['tensorflow'].tf_module
  for i in range(len(tf_module.member)):
    if tf_module.member[i].name == 'user_ops':
      del tf_module.member[i]
      break

  # Diff them. Do not fail if called with update.
  # If the test is run to update goldens, only report diffs but do not fail.
  self._AssertProtoDictEquals(
      golden_proto_dict,
      proto_dict,
      verbose=FLAGS.verbose_diffs,
      update_goldens=False,
      additional_missing_object_message=
      'Check if tf_export decorator/call is missing for this symbol.')
def _checkBackwardsCompatibility(self, root, golden_file_pattern, api_version,
                                 additional_private_map=None,
                                 omit_golden_symbols_map=None):
  # Extract all API stuff.
  visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()

  public_api_visitor = public_api.PublicAPIVisitor(visitor)
  public_api_visitor.private_map['tf'] = ['contrib']
  if api_version == 2:
    public_api_visitor.private_map['tf'].append('enable_v2_behavior')

  public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
  if FLAGS.only_test_core_api:
    public_api_visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
  if additional_private_map:
    public_api_visitor.private_map.update(additional_private_map)

  traverse.traverse(root, public_api_visitor)
  proto_dict = visitor.GetProtos()

  # Read all golden files.
  golden_file_list = file_io.get_matching_files(golden_file_pattern)
  if FLAGS.only_test_core_api:
    golden_file_list = _FilterNonCoreGoldenFiles(golden_file_list)

  def _ReadFileToProto(filename):
    """Read a filename, create a protobuf from its contents."""
    ret_val = api_objects_pb2.TFAPIObject()
    text_format.Merge(file_io.read_file_to_string(filename), ret_val)
    return ret_val

  golden_proto_dict = {
      _FileNameToKey(filename): _ReadFileToProto(filename)
      for filename in golden_file_list
  }
  golden_proto_dict = _FilterGoldenProtoDict(golden_proto_dict,
                                             omit_golden_symbols_map)

  # Diff them. Do not fail if called with update.
  # If the test is run to update goldens, only report diffs but do not fail.
  self._AssertProtoDictEquals(
      golden_proto_dict,
      proto_dict,
      verbose=FLAGS.verbose_diffs,
      update_goldens=FLAGS.update_goldens,
      api_version=api_version)
def get_latest_checkpoint():
  index_files = file_io.get_matching_files(
      os.path.join(FLAGS.train_dir, 'model.ckpt-*.index'))

  # No files
  if not index_files:
    return None

  # Index file path with the maximum step size.
  latest_index_file = sorted(
      [(int(os.path.basename(f).split("-")[-1].split(".")[0]), f)
       for f in index_files])[-1][1]

  # Chop off .index suffix and return
  return latest_index_file[:-6]
def copy_data_to_tmp(input_files):
  """Copies data to /tmp/ and returns glob matching the files."""
  files = []
  for e in input_files:
    for path in e.split(','):
      files.extend(file_io.get_matching_files(path))

  for path in files:
    if not path.startswith('gs://'):
      return input_files

  tmp_path = os.path.join('/tmp/', str(uuid.uuid4()))
  os.makedirs(tmp_path)
  subprocess.check_call(['gsutil', '-m', '-q', 'cp', '-r'] + files + [tmp_path])
  return [os.path.join(tmp_path, '*')]
def read_examples(input_files, batch_size, shuffle, num_epochs=None):
  """Creates readers and queues for reading example protos."""
  files = []
  for e in input_files:
    for path in e.split(','):
      files.extend(file_io.get_matching_files(path))
  thread_count = multiprocessing.cpu_count()

  # The minimum number of instances in a queue from which examples are drawn
  # randomly. The larger this number, the more randomness at the expense of
  # higher memory requirements.
  min_after_dequeue = 1000

  # When batching data, the queue's capacity will be larger than the batch_size
  # by some factor. The recommended formula is (num_threads + a small safety
  # margin). For now, we use a single thread for reading, so this can be small.
  queue_size_multiplier = thread_count + 3

  # Convert num_epochs == 0 -> num_epochs is None, if necessary
  num_epochs = num_epochs or None

  # Build a queue of the filenames to be read.
  filename_queue = tf.train.string_input_producer(files, num_epochs, shuffle)

  options = tf.python_io.TFRecordOptions(
      compression_type=tf.python_io.TFRecordCompressionType.GZIP)
  example_id, encoded_example = tf.TFRecordReader(options=options).read_up_to(
      filename_queue, batch_size)

  if shuffle:
    capacity = min_after_dequeue + queue_size_multiplier * batch_size
    return tf.train.shuffle_batch(
        [example_id, encoded_example],
        batch_size,
        capacity,
        min_after_dequeue,
        enqueue_many=True,
        num_threads=thread_count)
  else:
    capacity = queue_size_multiplier * batch_size
    return tf.train.batch(
        [example_id, encoded_example],
        batch_size,
        capacity=capacity,
        enqueue_many=True,
        num_threads=thread_count)
def local_batch_predict(model_dir, csv_file_pattern, output_dir, output_format,
                        batch_size=100):
  """Batch Predict with a specified model.

  It does batch prediction, saves results to output files and also creates an
  output schema file. The output file names are input file names prepended by
  'predict_results_'.

  Args:
    model_dir: The model directory containing a SavedModel (usually
      saved_model.pb).
    csv_file_pattern: a pattern of csv files as batch prediction source.
    output_dir: the path of the output directory.
    output_format: csv or json.
    batch_size: Larger batch_size improves performance but may cause more
      memory usage.
  """

  file_io.recursive_create_dir(output_dir)
  csv_files = file_io.get_matching_files(csv_file_pattern)
  if len(csv_files) == 0:
    raise ValueError('No files found given ' + csv_file_pattern)

  with tf.Graph().as_default(), tf.Session() as sess:
    input_alias_map, output_alias_map = _tf_load_model(sess, model_dir)
    csv_tensor_name = list(input_alias_map.values())[0]
    output_schema = _get_output_schema(sess, output_alias_map)
    for csv_file in csv_files:
      output_file = os.path.join(
          output_dir,
          'predict_results_' +
          os.path.splitext(os.path.basename(csv_file))[0] + '.' +
          output_format)
      with file_io.FileIO(output_file, 'w') as f:
        prediction_source = _batch_csv_reader(csv_file, batch_size)
        for batch in prediction_source:
          batch = [l.rstrip() for l in batch if l]
          predict_results = sess.run(fetches=output_alias_map,
                                     feed_dict={csv_tensor_name: batch})
          formatted_results = _format_results(output_format, output_schema,
                                              predict_results)
          f.write('\n'.join(formatted_results) + '\n')

  file_io.write_string_to_file(
      os.path.join(output_dir, 'predict_results_schema.json'),
      json.dumps(output_schema, indent=2))
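A hypothetical invocation of local_batch_predict; all paths are placeholders and assume a SavedModel under ./model and CSV shards under ./data/.

local_batch_predict(model_dir='./model',
                    csv_file_pattern='./data/input*.csv',
                    output_dir='./predictions',
                    output_format='csv',
                    batch_size=200)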
def testMatchingFilesPermission(self):
  # Create top level directory test_dir.
  dir_path = os.path.join(self._base_dir, "test_dir")
  file_io.create_dir(dir_path)
  # Create second level directories `noread` and `any`.
  noread_path = os.path.join(dir_path, "noread")
  file_io.create_dir(noread_path)
  any_path = os.path.join(dir_path, "any")
  file_io.create_dir(any_path)
  files = ["file1.txt", "file2.txt", "file3.txt"]
  for name in files:
    file_path = os.path.join(any_path, name)
    file_io.FileIO(file_path, mode="w").write("testing")
  file_path = os.path.join(noread_path, "file4.txt")
  file_io.FileIO(file_path, mode="w").write("testing")
  # Remove read access from the `noread` directory.
  os.chmod(noread_path, 0)
  expected_match = [os.path.join(any_path, name) for name in files]
  self.assertItemsEqual(
      file_io.get_matching_files(os.path.join(dir_path, "*", "file*.txt")),
      expected_match)
  # Change noread back so that it could be cleaned during tearDown.
  os.chmod(noread_path, 0o777)
def run_local_analysis(output_dir, csv_file_pattern, schema, inverted_features):
  """Use pandas to analyze csv files.

  Produces a stats file and vocab files.

  Args:
    output_dir: output folder
    csv_file_pattern: list of csv file paths, may contain wildcards
    schema: BQ schema list
    inverted_features: inverted_features dict

  Raises:
    ValueError: on unknown transforms/schemas
  """
  sys.stdout.write('Expanding any file patterns...\n')
  sys.stdout.flush()
  header = [column['name'] for column in schema]
  input_files = []
  for file_pattern in csv_file_pattern:
    input_files.extend(file_io.get_matching_files(file_pattern))
  sys.stdout.write('file list computed.\n')
  sys.stdout.flush()

  # Make a copy of inverted_features and update the target transform to be
  # identity or one hot depending on the schema.
  inverted_features_target = copy.deepcopy(inverted_features)
  for name, transform_set in six.iteritems(inverted_features_target):
    if transform_set == set([constant.TARGET_TRANSFORM]):
      target_schema = next(col['type'].lower() for col in schema
                           if col['name'] == name)
      if target_schema in constant.NUMERIC_SCHEMA:
        inverted_features_target[name] = {constant.IDENTITY_TRANSFORM}
      else:
        inverted_features_target[name] = {constant.ONE_HOT_TRANSFORM}

  # initialize the results
  def _init_numerical_results():
    return {'min': float('inf'),
            'max': float('-inf'),
            'count': 0,
            'sum': 0.0}
  numerical_results = collections.defaultdict(_init_numerical_results)
  vocabs = collections.defaultdict(lambda: collections.defaultdict(int))

  num_examples = 0
  # for each file, update the numerical stats from that file, and update the
  # set of unique labels.
  for input_file in input_files:
    sys.stdout.write('Analyzing file %s...\n' % input_file)
    sys.stdout.flush()
    with file_io.FileIO(input_file, 'r') as f:
      for line in csv.reader(f):
        if len(header) != len(line):
          raise ValueError('Schema has %d columns but a csv line only has %d columns.' %
                           (len(header), len(line)))
        parsed_line = dict(zip(header, line))
        num_examples += 1

        for col_name, transform_set in six.iteritems(inverted_features_target):
          # All transforms in transform_set require the same analysis. So look
          # at the first transform.
          transform_name = next(iter(transform_set))
          if transform_name in constant.TEXT_TRANSFORMS:
            split_strings = parsed_line[col_name].split(' ')

            # If a label is in the row N times, increase its vocab count by 1.
            # This is needed for TFIDF, but it's also an interesting stat.
            for one_label in set(split_strings):
              # Filter out empty strings
              if one_label:
                vocabs[col_name][one_label] += 1
          elif transform_name in constant.CATEGORICAL_TRANSFORMS:
            if parsed_line[col_name]:
              vocabs[col_name][parsed_line[col_name]] += 1
          elif transform_name in constant.NUMERIC_TRANSFORMS:
            if not parsed_line[col_name].strip():
              continue

            numerical_results[col_name]['min'] = (
                min(numerical_results[col_name]['min'],
                    float(parsed_line[col_name])))
            numerical_results[col_name]['max'] = (
                max(numerical_results[col_name]['max'],
                    float(parsed_line[col_name])))
            numerical_results[col_name]['count'] += 1
            numerical_results[col_name]['sum'] += float(parsed_line[col_name])

    sys.stdout.write('file %s analyzed.\n' % input_file)
    sys.stdout.flush()

  # Write the vocab files. Each label is on its own line.
  vocab_sizes = {}
  for name, label_count in six.iteritems(vocabs):
    # df is now:
    # label1,count
    # label2,count
    # ...
    # where label1 is the most frequent label, and label2 is the 2nd most, etc.
    df = pd.DataFrame([{'label': label, 'count': count}
                       for label, count in sorted(six.iteritems(label_count),
                                                  key=lambda x: x[1],
                                                  reverse=True)],
                      columns=['label', 'count'])
    csv_string = df.to_csv(index=False, header=False)

    file_io.write_string_to_file(
        os.path.join(output_dir, constant.VOCAB_ANALYSIS_FILE % name),
        csv_string)

    vocab_sizes[name] = {'vocab_size': len(label_count)}

  # Update numerical_results to just have min/max/mean
  for col_name in numerical_results:
    if float(numerical_results[col_name]['count']) == 0:
      raise ValueError('Column %s has a zero count' % col_name)
    mean = (numerical_results[col_name]['sum'] /
            float(numerical_results[col_name]['count']))
    del numerical_results[col_name]['sum']
    del numerical_results[col_name]['count']
    numerical_results[col_name]['mean'] = mean

  # Write the stats file.
  numerical_results.update(vocab_sizes)
  stats = {'column_stats': numerical_results, 'num_examples': num_examples}
  file_io.write_string_to_file(
      os.path.join(output_dir, constant.STATS_FILE),
      json.dumps(stats, indent=2, separators=(',', ': ')))
def load_session_bundle_from_path(export_dir,
                                  target="",
                                  config=None,
                                  meta_graph_def=None):
  """Load session bundle from the given path.

  The function reads input from the export_dir, constructs the graph data to
  the default graph and restores the parameters for the session created.

  Args:
    export_dir: the directory that contains files exported by exporter.
    target: The execution engine to connect to. See target in
      tf.compat.v1.Session()
    config: A ConfigProto proto with configuration options. See config in
      tf.compat.v1.Session()
    meta_graph_def: optional object of type MetaGraphDef. If this object is
      present, then it is used instead of parsing MetaGraphDef from export_dir.

  Returns:
    session: a tensorflow session created from the variable files.
    meta_graph: a meta graph proto saved in the exporter directory.

  Raises:
    RuntimeError: if the required files are missing or contain unrecognizable
      fields, i.e. the exported model is invalid.
  """
  if not meta_graph_def:
    meta_graph_filename = os.path.join(export_dir,
                                       constants.META_GRAPH_DEF_FILENAME)
    if not file_io.file_exists(meta_graph_filename):
      raise RuntimeError("Expected meta graph file missing %s" %
                         meta_graph_filename)
    # Reads meta graph file.
    meta_graph_def = meta_graph_pb2.MetaGraphDef()
    meta_graph_def.ParseFromString(
        file_io.read_file_to_string(meta_graph_filename, binary_mode=True))

  variables_filename = ""
  variables_filename_list = []
  checkpoint_sharded = False

  variables_index_filename = os.path.join(
      export_dir, constants.VARIABLES_INDEX_FILENAME_V2)
  checkpoint_v2 = file_io.file_exists(variables_index_filename)

  # Find matching checkpoint files.
  if checkpoint_v2:
    # The checkpoint is in v2 format.
    variables_filename_pattern = os.path.join(
        export_dir, constants.VARIABLES_FILENAME_PATTERN_V2)
    variables_filename_list = file_io.get_matching_files(
        variables_filename_pattern)
    checkpoint_sharded = True
  else:
    variables_filename = os.path.join(export_dir, constants.VARIABLES_FILENAME)
    if file_io.file_exists(variables_filename):
      variables_filename_list = [variables_filename]
    else:
      variables_filename = os.path.join(export_dir,
                                        constants.VARIABLES_FILENAME_PATTERN)
      variables_filename_list = file_io.get_matching_files(variables_filename)
      checkpoint_sharded = True

  # Prepare the files to restore a session.
  if not variables_filename_list:
    restore_files = ""
  elif checkpoint_v2 or not checkpoint_sharded:
    # For checkpoint v2 or v1 with non-sharded files, use "export" to restore
    # the session.
    restore_files = constants.VARIABLES_FILENAME
  else:
    restore_files = constants.VARIABLES_FILENAME_PATTERN

  assets_dir = os.path.join(export_dir, constants.ASSETS_DIRECTORY)

  collection_def = meta_graph_def.collection_def
  graph_def = graph_pb2.GraphDef()
  if constants.GRAPH_KEY in collection_def:
    # Use serving graph_def in MetaGraphDef collection_def if exists
    graph_def_any = collection_def[constants.GRAPH_KEY].any_list.value
    if len(graph_def_any) != 1:
      raise RuntimeError("Expected exactly one serving GraphDef in : %s" %
                         meta_graph_def)
    else:
      graph_def_any[0].Unpack(graph_def)
      # Replace the graph def in meta graph proto.
      meta_graph_def.graph_def.CopyFrom(graph_def)

  ops.reset_default_graph()
  sess = session.Session(target, graph=None, config=config)
  # Import the graph.
  saver = saver_lib.import_meta_graph(meta_graph_def)
  # Restore the session.
if restore_files: saver.restore(sess, os.path.join(export_dir, restore_files)) init_op_tensor = None if constants.INIT_OP_KEY in collection_def: init_ops = collection_def[constants.INIT_OP_KEY].node_list.value if len(init_ops) != 1: raise RuntimeError("Expected exactly one serving init op in : %s" % meta_graph_def) init_op_tensor = ops.get_collection(constants.INIT_OP_KEY)[0] # Create asset input tensor list. asset_tensor_dict = {} if constants.ASSETS_KEY in collection_def: assets_any = collection_def[constants.ASSETS_KEY].any_list.value for asset in assets_any: asset_pb = manifest_pb2.AssetFile() asset.Unpack(asset_pb) asset_tensor_dict[asset_pb.tensor_binding.tensor_name] = os.path.join( assets_dir, asset_pb.filename) if init_op_tensor: # Run the init op. sess.run(fetches=[init_op_tensor], feed_dict=asset_tensor_dict) return sess, meta_graph_def
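A minimal usage sketch of the loader above: the export path and the tensor names ('input:0', 'output:0') are hypothetical placeholders, not something the function itself defines.

# Illustrative only: load a bundle and run one fetch from the restored graph.
sess, meta_graph = load_session_bundle_from_path('/tmp/exported_model')
try:
  result = sess.run('output:0', feed_dict={'input:0': [[1.0, 2.0]]})
  print(result)
finally:
  sess.close()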
def _GetGoldenApiDefs():
  old_api_def_files = file_io.get_matching_files(_GetApiDefFilePath('*'))
  return {file_path: file_io.read_file_to_string(file_path)
          for file_path in old_api_def_files}
def create_object_test():
  """Verifies file_io's object manipulation methods."""
  starttime_ms = int(round(time.time() * 1000))
  dir_name = "%s/tf_gcs_test_%s" % (FLAGS.gcs_bucket_url, starttime_ms)
  print("Creating dir %s." % dir_name)
  file_io.create_dir(dir_name)

  num_files = 5
  # Create files of 2 different patterns in this directory.
  files_pattern_1 = ["%s/test_file_%d.txt" % (dir_name, n)
                     for n in range(num_files)]
  files_pattern_2 = ["%s/testfile%d.txt" % (dir_name, n)
                     for n in range(num_files)]

  starttime_ms = int(round(time.time() * 1000))
  files_to_create = files_pattern_1 + files_pattern_2
  for file_name in files_to_create:
    print("Creating file %s." % file_name)
    file_io.write_string_to_file(file_name, "test file creation.")
  elapsed_ms = int(round(time.time() * 1000)) - starttime_ms
  print("Created %d files in %s milliseconds" %
        (len(files_to_create), elapsed_ms))

  # Listing files of pattern1.
  list_files_pattern = "%s/test_file*.txt" % dir_name
  print("Getting files matching pattern %s." % list_files_pattern)
  starttime_ms = int(round(time.time() * 1000))
  files_list = file_io.get_matching_files(list_files_pattern)
  elapsed_ms = int(round(time.time() * 1000)) - starttime_ms
  print("Listed files in %s milliseconds" % elapsed_ms)
  print(files_list)
  assert set(files_list) == set(files_pattern_1)

  # Listing files of pattern2.
  list_files_pattern = "%s/testfile*.txt" % dir_name
  print("Getting files matching pattern %s." % list_files_pattern)
  starttime_ms = int(round(time.time() * 1000))
  files_list = file_io.get_matching_files(list_files_pattern)
  elapsed_ms = int(round(time.time() * 1000)) - starttime_ms
  print("Listed files in %s milliseconds" % elapsed_ms)
  print(files_list)
  assert set(files_list) == set(files_pattern_2)

  # Test renaming file.
  file_to_rename = "%s/oldname.txt" % dir_name
  file_new_name = "%s/newname.txt" % dir_name
  file_io.write_string_to_file(file_to_rename, "test file.")
  assert file_io.file_exists(file_to_rename)
  assert not file_io.file_exists(file_new_name)
  print("Will try renaming file %s to %s" % (file_to_rename, file_new_name))
  starttime_ms = int(round(time.time() * 1000))
  file_io.rename(file_to_rename, file_new_name)
  elapsed_ms = int(round(time.time() * 1000)) - starttime_ms
  print("File %s renamed to %s in %s milliseconds" %
        (file_to_rename, file_new_name, elapsed_ms))
  assert not file_io.file_exists(file_to_rename)
  assert file_io.file_exists(file_new_name)

  # Delete directory.
  print("Deleting directory %s." % dir_name)
  file_io.delete_recursively(dir_name)
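One possible way to wire up the smoke test above: the flag name matches the FLAGS.gcs_bucket_url attribute the function reads, but the argparse scaffolding here is an illustrative assumption, not the original harness.

# Hypothetical entry point for the GCS smoke test above.
import argparse

if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument("--gcs_bucket_url", required=True,
                      help="GCS location to exercise, e.g. gs://my-bucket/tests")
  FLAGS = parser.parse_args()
  create_object_test()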
def load_session_bundle_from_path(export_dir, target="", config=None): """Load session bundle from the given path. The function reads input from the export_dir, constructs the graph data to the default graph and restores the parameters for the session created. Args: export_dir: the directory that contains files exported by exporter. target: The execution engine to connect to. See target in tf.Session() config: A ConfigProto proto with configuration options. See config in tf.Session() Returns: session: a tensorflow session created from the variable files. meta_graph: a meta graph proto saved in the exporter directory. Raises: RuntimeError: if the required files are missing or contain unrecognizable fields, i.e. the exported model is invalid. """ meta_graph_filename = os.path.join(export_dir, constants.META_GRAPH_DEF_FILENAME) if not file_io.file_exists(meta_graph_filename): raise RuntimeError("Expected meta graph file missing %s" % meta_graph_filename) variables_filename = os.path.join(export_dir, constants.VARIABLES_FILENAME) if not file_io.file_exists(variables_filename): variables_filename = os.path.join( export_dir, constants.VARIABLES_FILENAME_PATTERN) if not file_io.get_matching_files(variables_filename): # If graph_util.convert_variables_to_constants() is called on a model # it won't have any variables, and that's OK. # # TODO(yxshi): verify that the graph_def in fact does not have any # reachable variables. variables_filename = None assets_dir = os.path.join(export_dir, constants.ASSETS_DIRECTORY) # Reads meta graph file. meta_graph_def = meta_graph_pb2.MetaGraphDef() meta_graph_def.ParseFromString(file_io.read_file_to_string( meta_graph_filename)) collection_def = meta_graph_def.collection_def graph_def = tf.GraphDef() if constants.GRAPH_KEY in collection_def: # Use serving graph_def in MetaGraphDef collection_def if exists graph_def_any = collection_def[constants.GRAPH_KEY].any_list.value if len(graph_def_any) != 1: raise RuntimeError( "Expected exactly one serving GraphDef in : %s" % meta_graph_def) else: graph_def_any[0].Unpack(graph_def) # Replace the graph def in meta graph proto. meta_graph_def.graph_def.CopyFrom(graph_def) tf.reset_default_graph() sess = tf.Session(target, graph=None, config=config) # Import the graph. saver = tf.train.import_meta_graph(meta_graph_def) # Restore the session. if variables_filename: saver.restore(sess, variables_filename) init_op_tensor = None if constants.INIT_OP_KEY in collection_def: init_ops = collection_def[constants.INIT_OP_KEY].node_list.value if len(init_ops) != 1: raise RuntimeError( "Expected exactly one serving init op in : %s" % meta_graph_def) init_op_tensor = tf.get_collection(constants.INIT_OP_KEY)[0] # Create asset input tensor list. asset_tensor_dict = {} if constants.ASSETS_KEY in collection_def: assets_any = collection_def[constants.ASSETS_KEY].any_list.value for asset in assets_any: asset_pb = manifest_pb2.AssetFile() asset.Unpack(asset_pb) asset_tensor_dict[asset_pb.tensor_binding.tensor_name] = os.path.join( assets_dir, asset_pb.filename) if init_op_tensor: # Run the init op. sess.run(fetches=[init_op_tensor], feed_dict=asset_tensor_dict) return sess, meta_graph_def
def transformed_training_input_fn():
  """Training input function that reads transformed data."""
  if isinstance(raw_data_file_pattern, six.string_types):
    filepath_list = [raw_data_file_pattern]
  else:
    filepath_list = raw_data_file_pattern

  files = []
  for path in filepath_list:
    files.extend(file_io.get_matching_files(path))

  filename_queue = tf.train.string_input_producer(
      files, num_epochs=num_epochs, shuffle=randomize_input)

  options = tf.python_io.TFRecordOptions(
      compression_type=tf.python_io.TFRecordCompressionType.GZIP)
  ex_id, ex_str = tf.TFRecordReader(options=options).read_up_to(
      filename_queue, training_batch_size)

  queue_capacity = ((reader_num_threads + 3) * training_batch_size +
                    min_after_dequeue)
  if randomize_input:
    _, batch_ex_str = tf.train.shuffle_batch(
        tensors=[ex_id, ex_str],
        batch_size=training_batch_size,
        capacity=queue_capacity,
        min_after_dequeue=min_after_dequeue,
        enqueue_many=True,
        num_threads=reader_num_threads,
        allow_smaller_final_batch=allow_smaller_final_batch)
  else:
    _, batch_ex_str = tf.train.batch(
        tensors=[ex_id, ex_str],
        batch_size=training_batch_size,
        capacity=queue_capacity,
        enqueue_many=True,
        num_threads=reader_num_threads,
        allow_smaller_final_batch=allow_smaller_final_batch)

  feature_spec = {}
  feature_info = get_transformed_feature_info(features, schema)
  for name, info in six.iteritems(feature_info):
    if info['size'] is None:
      feature_spec[name] = tf.VarLenFeature(dtype=info['dtype'])
    else:
      feature_spec[name] = tf.FixedLenFeature(shape=[info['size']],
                                              dtype=info['dtype'])

  parsed_tensors = tf.parse_example(batch_ex_str, feature_spec)

  # Expand the dims of non-sparse tensors. This is needed by tf.learn.
  transformed_features = {}
  for k, v in six.iteritems(parsed_tensors):
    if isinstance(v, tf.Tensor) and v.get_shape().ndims == 1:
      transformed_features[k] = tf.expand_dims(v, -1)
    else:
      # Sparse tensor.
      transformed_features[k] = v

  transformed_features = image_feature_engineering(
      features=features,
      feature_tensors_dict=transformed_features)

  # Remove the target tensor, and return it directly.
  target_name = get_target_name(features)
  if not target_name or target_name not in transformed_features:
    raise ValueError('Cannot find target transform in features')

  transformed_target = transformed_features.pop(target_name)

  return transformed_features, transformed_target
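A minimal sketch of the parsing pattern used above, taken out of the closure: build a feature spec with fixed-length and variable-length entries, then parse a batch of serialized tf.Examples. The feature names and sizes here are made up for illustration.

# Illustrative feature-spec / parse_example pattern (TF 1.x APIs).
import tensorflow as tf

serialized = tf.placeholder(tf.string, shape=[None])
feature_spec = {
    'num': tf.FixedLenFeature(shape=[1], dtype=tf.int64),   # dense column
    'text_ids': tf.VarLenFeature(dtype=tf.int64),           # sparse column
}
parsed = tf.parse_example(serialized, feature_spec)
dense_num = parsed['num']        # Tensor with shape [batch_size, 1]
sparse_ids = parsed['text_ids']  # tf.SparseTensor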
def match_maybe_append(pathname):
  fnames = file_io.get_matching_files(pathname)
  if fnames:
    mtimes.append(file_io.stat(fnames[0]).mtime_nsec / 1e9)
    return True
  return False
def _delete_file_if_exists(filespec):
  """Deletes files matching `filespec`."""
  for pathname in file_io.get_matching_files(filespec):
    file_io.delete_file(pathname)
def run_numerical_categorical_analysis(args, schema_list):
  """Makes the numerical and categorical analysis files.

  Args:
    args: the command line args
    schema_list: python object of the schema json file.

  Raises:
    ValueError: if schema contains unknown column types.
  """
  header = [column['name'] for column in schema_list]
  input_files = file_io.get_matching_files(args.input_file_pattern)

  # Check the schema is valid.
  for col_schema in schema_list:
    col_type = col_schema['type'].lower()
    if col_type != 'string' and col_type != 'integer' and col_type != 'float':
      raise ValueError('Schema contains an unsupported type %s.' % col_type)

  # Initialize the results.
  def _init_numerical_results():
    return {'min': float('inf'),
            'max': float('-inf'),
            'count': 0,
            'sum': 0.0}
  numerical_results = collections.defaultdict(_init_numerical_results)
  categorical_results = collections.defaultdict(set)

  # For each file, update the numerical stats from that file, and update the
  # set of unique labels.
  for input_file in input_files:
    with file_io.FileIO(input_file, 'r') as f:
      for line in f:
        parsed_line = dict(zip(header, line.strip().split(',')))
        for col_schema in schema_list:
          col_name = col_schema['name']
          col_type = col_schema['type']
          if col_type.lower() == 'string':
            categorical_results[col_name].update([parsed_line[col_name]])
          else:
            # Numerical column; skip empty values.
            if not parsed_line[col_name].strip():
              continue
            numerical_results[col_name]['min'] = (
                min(numerical_results[col_name]['min'],
                    float(parsed_line[col_name])))
            numerical_results[col_name]['max'] = (
                max(numerical_results[col_name]['max'],
                    float(parsed_line[col_name])))
            numerical_results[col_name]['count'] += 1
            numerical_results[col_name]['sum'] += float(parsed_line[col_name])

  # Update numerical_results to just have min/max/mean.
  for col_schema in schema_list:
    if col_schema['type'].lower() != 'string':
      col_name = col_schema['name']
      mean = (numerical_results[col_name]['sum'] /
              numerical_results[col_name]['count'])
      del numerical_results[col_name]['sum']
      del numerical_results[col_name]['count']
      numerical_results[col_name]['mean'] = mean

  # Write the numerical_results to a json file.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, NUMERICAL_ANALYSIS_FILE),
      json.dumps(numerical_results, indent=2, separators=(',', ': ')))

  # Write the vocab files. Each label is on its own line.
  for name, unique_labels in six.iteritems(categorical_results):
    labels = '\n'.join(list(unique_labels))
    file_io.write_string_to_file(
        os.path.join(args.output_dir, CATEGORICAL_ANALYSIS_FILE % name),
        labels)
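A hypothetical invocation of the analysis above; the Namespace mirrors the only attributes the function reads (input_file_pattern, output_dir), and the paths and schema values are placeholders, not part of the original tool.

# Illustrative call, assuming CSV files already exist under the input pattern.
import argparse

args = argparse.Namespace(
    input_file_pattern='/tmp/analysis_input/*.csv',
    output_dir='/tmp/analysis_output')
schema_list = [
    {'name': 'age', 'type': 'integer'},
    {'name': 'city', 'type': 'string'}]
run_numerical_categorical_analysis(args, schema_list)
# Afterwards the output dir holds the numerical analysis JSON
# (min/max/mean per numerical column) and one vocab file per string column.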
def testMultipleColumnsTransformed(self):
  """Test training starting from tf.example."""
  output_dir = tempfile.mkdtemp()
  try:
    features = {
        'num': {'transform': 'identity'},
        'num2': {'transform': 'key', 'source_column': 'num'},
        'target': {'transform': 'target'},
        'text': {'transform': 'bag_of_words'},
        'text2': {'transform': 'tfidf', 'source_column': 'text'},
        'text3': {'transform': 'key', 'source_column': 'text'}}
    schema = [
        {'name': 'num', 'type': 'integer'},
        {'name': 'target', 'type': 'float'},
        {'name': 'text', 'type': 'string'}]
    data = ['1,2,hello world\n',
            '4,8,bye moon\n',
            '5,10,hello moon\n',
            '11,22,moon moon\n']
    file_io.recursive_create_dir(output_dir)
    file_io.write_string_to_file(os.path.join(output_dir, 'schema.json'),
                                 json.dumps(schema, indent=2))
    file_io.write_string_to_file(os.path.join(output_dir, 'features.json'),
                                 json.dumps(features, indent=2))
    file_io.write_string_to_file(os.path.join(output_dir, 'data.csv'),
                                 ''.join(data))

    cmd = ['python %s' % os.path.join(CODE_PATH, 'analyze.py'),
           '--output=' + os.path.join(output_dir, 'analysis'),
           '--csv=' + os.path.join(output_dir, 'data.csv'),
           '--schema=' + os.path.join(output_dir, 'schema.json'),
           '--features=' + os.path.join(output_dir, 'features.json')]
    subprocess.check_call(' '.join(cmd), shell=True)

    cmd = ['python %s' % os.path.join(CODE_PATH, 'transform.py'),
           '--output=' + os.path.join(output_dir, 'transform'),
           '--csv=' + os.path.join(output_dir, 'data.csv'),
           '--analysis=' + os.path.join(output_dir, 'analysis'),
           '--prefix=features']
    subprocess.check_call(' '.join(cmd), shell=True)

    # Check tf.example file has the expected features.
    file_list = file_io.get_matching_files(
        os.path.join(output_dir, 'transform', 'features*'))
    options = tf.python_io.TFRecordOptions(
        compression_type=tf.python_io.TFRecordCompressionType.GZIP)
    record_iter = tf.python_io.tf_record_iterator(path=file_list[0],
                                                  options=options)
    tf_example = tf.train.Example()
    tf_example.ParseFromString(next(record_iter))

    self.assertEqual(1, len(tf_example.features.feature['num'].int64_list.value))
    self.assertEqual(1, len(tf_example.features.feature['num2'].int64_list.value))
    self.assertEqual(1, len(tf_example.features.feature['target'].float_list.value))
    self.assertEqual(2, len(tf_example.features.feature['text_ids'].int64_list.value))
    self.assertEqual(2, len(tf_example.features.feature['text_weights'].float_list.value))
    self.assertEqual(2, len(tf_example.features.feature['text2_ids'].int64_list.value))
    self.assertEqual(2, len(tf_example.features.feature['text2_weights'].float_list.value))
    self.assertEqual(1, len(tf_example.features.feature['text3'].bytes_list.value))

    cmd = ['cd %s && ' % CODE_PATH,
           'python -m trainer.task',
           '--train=' + os.path.join(output_dir, 'data.csv'),
           '--eval=' + os.path.join(output_dir, 'data.csv'),
           '--job-dir=' + os.path.join(output_dir, 'training'),
           '--analysis=' + os.path.join(output_dir, 'analysis'),
           '--model=linear_regression',
           '--train-batch-size=4',
           '--eval-batch-size=4',
           '--max-steps=200',
           '--learning-rate=0.1',
           '--transform']
    subprocess.check_call(' '.join(cmd), shell=True)

    result = run_exported_model(
        model_path=os.path.join(output_dir, 'training', 'model'),
        csv_data=['20,hello moon'])

    # Check keys were made.
    self.assertEqual(20, result['num2'])
    self.assertEqual('hello moon', result['text3'])
  finally:
    shutil.rmtree(output_dir)
def _files(pattern):
  """Converts a file pattern to a list of files."""
  files = file_io.get_matching_files(pattern)
  if not files:
    raise IOError('Unable to find input files.')
  return files
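Because _files raises rather than returning an empty list, callers can rely on a non-empty result; a hypothetical call, with the bucket path as a placeholder:

# Illustrative usage of the glob helper above.
train_files = _files('gs://my-bucket/training_data/part-*.tfrecord')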