def _call_exception_handlers(exception):
  """Dispatches `exception` to every installed handler that wants it.

  Handlers are consulted in `EXCEPTION_HANDLERS` order. A handler is invoked
  only if its `wants(exception)` returns a truthy value. A failure raised by
  one handler is logged and never prevents the remaining handlers from
  running.

  Args:
    exception: The exception instance to offer to the handlers.
  """
  for candidate in EXCEPTION_HANDLERS:
    try:
      if not candidate.wants(exception):
        continue
      candidate.handle(exception)
    except:  # pylint: disable=bare-except
      # A misbehaving handler must not stop the others, but its failure
      # should not be hidden either, so record the traceback.
      try:
        logging.error(traceback.format_exc())
      except:  # pylint: disable=bare-except
        # Even the logging statement failed; there is nothing left to do.
        pass
def main(_):
  """Runs `parse_and_run`, retrying on retryable TensorFlow errors.

  `parse_and_run` is attempted up to `FLAGS.num_retries + 1` times. An attempt
  is retried when it raises `tf.errors.UnavailableError` or
  `tf.errors.InternalError`; any other exception propagates immediately.

  If every attempt fails with a retryable error, the failure is logged and the
  function returns normally (it does not re-raise), matching the historical
  behaviour of this entry point.

  Args:
    _: Unused positional command-line arguments.
  """
  proto_utils.uses_fast_cpp_protos_or_die()

  logging_level.set_from_flag()

  num_attempts = FLAGS.num_retries + 1
  for _ in range(num_attempts):
    try:
      parse_and_run()
      return
    except tf.errors.UnavailableError as e:
      # An UnavailableError indicates a gRPC error, typically this is
      # retryable.
      logging.error('Caught UnavailableError %s; will retry.', e)
    except tf.errors.InternalError as e:
      # Retry on an InternalError.
      logging.error('Caught InternalError %s; will retry.', e)

  # Reaching this point means no attempt succeeded. Say so explicitly instead
  # of leaving a misleading "will retry" as the last log line.
  logging.error('parse_and_run did not succeed after %d attempt(s); giving up.',
                num_attempts)
def main(_):
  """Runs the evaluation loop configured by command-line flags.

  Args:
    _: Unused positional command-line arguments.

  Returns:
    1 if the required `--dataset_config_pbtxt` flag is missing (so the process
    exits with a failure status); otherwise None after `eval_loop` returns.
  """
  proto_utils.uses_fast_cpp_protos_or_die()

  if not FLAGS.dataset_config_pbtxt:
    logging.error('Need to specify --dataset_config_pbtxt')
    # Without a dataset config there is nothing to evaluate; stop here rather
    # than handing an empty value to eval_loop.
    return 1
  logging_level.set_from_flag()
  eval_loop(
      master=FLAGS.master,
      dataset_config_pbtxt=FLAGS.dataset_config_pbtxt,
      checkpoint_dir=FLAGS.checkpoint_dir,
      model_name=FLAGS.model_name,
      batch_size=FLAGS.batch_size,
      moving_average_decay=FLAGS.moving_average_decay,
      max_examples=FLAGS.max_examples,
      eval_dir=FLAGS.eval_dir,
      max_evaluations=FLAGS.max_evaluations,
  )
def evaluate_tfhub_module(module_spec, eval_tasks, use_tpu,
                          num_averaging_runs, update_bn_accumulators=True,
                          use_tags=True):
  """Evaluate the generator stored in a TF hub module.

  Samples `num_averaging_runs` fake data sets from the generator, computes
  Inception features for them and for the real data, runs every task on each
  fake data set and aggregates the per-run scores.

  Args:
    module_spec: string, path to a TF hub module.
    eval_tasks: List of objects that inherit from EvalTask.
    use_tpu: Whether to use TPUs.
    num_averaging_runs: Determines how many times each metric is computed.
    update_bn_accumulators: If True, `_update_bn_accumulators` is run on the
      generator output (with 204800 examples). When that call returns a truthy
      value, the session is saved to `model-with-accu.ckpt` under
      `module_spec`.
    use_tags: If True, the module is loaded with the tags
      `{"gen", "bs<batch_size>"}`; otherwise with `tags=None`.

  Returns:
    A dict with, for every key reported by a task, the entries
    `<key>_mean`, `<key>_std` (floats) and `<key>_list` (the individual scores
    joined by "_" as a string). Returns None if `eval_tasks` is empty.

  Raises:
    NanFoundError: If generator output has any NaNs. (Not raised directly in
      this function; presumably raised by the sampling helpers.)
  """
  # Make sure that the same latent variables are used for each evaluation.
  np.random.seed(42)
  dataset = datasets.get_dataset()
  num_test_examples = dataset.eval_test_samples

  batch_size = 64
  # Round up so that at least `num_test_examples` samples are generated.
  num_batches = int(np.ceil(num_test_examples / batch_size))

  # Load and update the generator.
  result_dict = {}
  fake_dsets = []
  with tf.Graph().as_default():
    tf.set_random_seed(42)
    with tf.Session() as sess:
      if use_tpu:
        sess.run(tf.contrib.tpu.initialize_system())
      def sample_from_generator():
        """Create graph for sampling images."""
        generator = hub.Module(
            module_spec,
            name="gen_module",
            tags={"gen", "bs{}".format(batch_size)} if use_tags else None)
        logging.info("Generator inputs: %s", generator.get_input_info_dict())
        # The latent dimension is read from the module's declared "z" input.
        z_dim = generator.get_input_info_dict()["z"].get_shape()[1].value
        z = z_generator(shape=[batch_size, z_dim])
        if "labels" in generator.get_input_info_dict():
          # Conditional GAN.
          assert dataset.num_classes
          labels = tf.random.uniform(
              [batch_size], maxval=dataset.num_classes, dtype=tf.int32)
          inputs = dict(z=z, labels=labels)
        else:
          # Unconditional GAN.
          assert "labels" not in generator.get_input_info_dict()
          inputs = dict(z=z)
        return generator(inputs=inputs, as_dict=True)["generated"]
      if use_tpu:
        generated = tf.contrib.tpu.rewrite(sample_from_generator)
      else:
        generated = sample_from_generator()

      tf.global_variables_initializer().run()

      # The checkpoint is only exported when the accumulator update reports a
      # truthy result.
      if update_bn_accumulators and _update_bn_accumulators(
          sess, generated, num_accu_examples=204800):
        saver = tf.train.Saver()
        save_path = os.path.join(module_spec, "model-with-accu.ckpt")
        checkpoint_path = saver.save(
            sess,
            save_path=save_path)
        logging.info("Exported generator with accumulated batch stats to "
                     "%s.", checkpoint_path)
      # Note: this check happens after the accumulator update/export above, so
      # an empty task list still performs that work. Returns None.
      if not eval_tasks:
        logging.error("Task list is empty, returning.")
        return

      for i in range(num_averaging_runs):
        logging.info("Generating fake data set %d/%d.", i+1, num_averaging_runs)
        fake_dset = eval_utils.EvalDataSample(
            eval_utils.sample_fake_dataset(sess, generated, num_batches,
                                           batch_size))
        fake_dsets.append(fake_dset)

        logging.info("Computing inception features for generated data %d/%d.",
                     i+1, num_averaging_runs)
        activations, logits = eval_utils.inception_transform_np(
            fake_dset.images, batch_size)
        fake_dset.set_inception_features(
            activations=activations, logits=logits)
        fake_dset.set_num_examples(num_test_examples)
        # Free up some memory by releasing additional fake data samples.
        # For ImageNet128 50k images are ~9 GiB. This will blow up metrics
        # (such as fractal dimension) if num_averaging_runs > 1.
        # NOTE(review): images are discarded for every run, including the
        # first, although the comment above speaks of "additional" samples --
        # confirm that no task needs the raw fake images.
        fake_dset.discard_images()

  real_dset = eval_utils.EvalDataSample(
      eval_utils.get_real_images(
          dataset=dataset, num_examples=num_test_examples))
  logging.info("Getting Inception features for real images.")
  # Only the activations of the real images are kept; the logits are dropped.
  real_dset.activations, _ = eval_utils.inception_transform_np(
      real_dset.images, batch_size)
  real_dset.set_num_examples(num_test_examples)

  # Run all the tasks and update the result dictionary with the task statistics.
  result_dict = {}
  for task in eval_tasks:
    # One result dict per fake data set (i.e. per averaging run).
    task_results_dicts = [
        task.run_after_session(fake_dset, real_dset)
        for fake_dset in fake_dsets
    ]
    # Average the score for each key.
    result_statistics = {}
    # The keys of the first run are taken as the set of reported metrics.
    for key in task_results_dicts[0].keys():
      scores_for_key = np.array([d[key] for d in task_results_dicts])
      mean, std = np.mean(scores_for_key), np.std(scores_for_key)
      scores_as_string = "_".join([str(x) for x in scores_for_key])
      result_statistics[key + "_mean"] = mean
      result_statistics[key + "_std"] = std
      result_statistics[key + "_list"] = scores_as_string
    logging.info("Computed results for task %s: %s", task, result_statistics)

    # Later tasks overwrite earlier ones if they report the same key.
    result_dict.update(result_statistics)
  return result_dict
# Copyright 2017 The TensorFlow Authors All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Loads icp op.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from absl import logging import tensorflow as tf try: icp_op_module = tf.load_op_library('./ops/icp_op.so') icp = icp_op_module.icp except Exception: # pylint: disable=broad-except logging.error('Could not load object file for ICP op.') icp = None
def get_dataset_feature_statistics(builder, split):
  """Calculate statistics for the specified split.

  Iterates once over the split, counting in how many examples each top-level
  feature appears and tracking the min/max of numeric (and boolean) features.
  Then fills a schema entry and a statistics entry per feature.

  Args:
    builder: Dataset builder; only `builder.as_dataset(split=split)` is used.
    split: The split to compute statistics for.

  Returns:
    A tuple `(statistics, schema)` where `statistics` is a
    `statistics_pb2.DatasetFeatureStatistics` and `schema` is a
    `schema_pb2.Schema`.
  """
  statistics = statistics_pb2.DatasetFeatureStatistics()

  # Make this to the best of our abilities.
  schema = schema_pb2.Schema()

  dataset = builder.as_dataset(split=split)

  # Just computing the number of examples for now.
  statistics.num_examples = 0

  # Feature dictionaries.
  feature_to_num_examples = collections.defaultdict(int)
  feature_to_min = {}
  feature_to_max = {}

  np_dataset = dataset_utils.dataset_as_numpy(dataset)
  for example in tqdm.tqdm(np_dataset, unit=" examples"):
    statistics.num_examples += 1

    assert isinstance(example, dict)

    for feature_name in sorted(example):
      # Update the number of examples this feature appears in.
      feature_to_num_examples[feature_name] += 1

      feature_np = example[feature_name]

      # For compatibility in graph and eager mode, we can get PODs here and
      # everything may not be neatly wrapped up in numpy's ndarray.
      feature_dtype = type(feature_np)
      if isinstance(feature_np, np.ndarray):
        feature_dtype = feature_np.dtype.type

      is_numeric = (np.issubdtype(feature_dtype, np.number) or
                    feature_dtype == np.bool_)
      # np.min/np.max raise ValueError on zero-size arrays, so an empty value
      # contributes to the example count but not to the min/max.
      if not is_numeric or np.size(feature_np) == 0:
        continue

      feature_min = np.min(feature_np)
      feature_max = np.max(feature_np)

      # TODO(afrozm): What if shapes don't match? Populate ValueCount? Add
      # logic for that.

      # Set or update the min, max.
      if ((feature_name not in feature_to_min) or
          (feature_to_min[feature_name] > feature_min)):
        feature_to_min[feature_name] = feature_min

      if ((feature_name not in feature_to_max) or
          (feature_to_max[feature_name] < feature_max)):
        feature_to_max[feature_name] = feature_max

  # Start here, we've processed all examples.

  output_shapes_dict = dataset.output_shapes
  output_types_dict = dataset.output_types

  for feature_name in sorted(feature_to_num_examples):
    # Try to fill in the schema.
    feature = schema.feature.add()
    feature.name = feature_name

    # TODO(afrozm): Make this work with nested structures, currently the Schema
    # proto has no support for it.
    maybe_feature_shape = output_shapes_dict[feature_name]
    if not isinstance(maybe_feature_shape, tf.TensorShape):
      logging.error(
          "Statistics generation doesn't work for nested structures yet")
      continue

    for dim in maybe_feature_shape.as_list():
      # We denote `None`s as -1 in the shape proto. Test against None
      # explicitly so that a genuine size-0 dimension is preserved.
      feature.shape.dim.add().size = dim if dim is not None else -1
    feature_type = output_types_dict[feature_name]
    feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

    common_statistics = statistics_pb2.CommonStatistics()
    common_statistics.num_non_missing = feature_to_num_examples[feature_name]
    common_statistics.num_missing = (
        statistics.num_examples - common_statistics.num_non_missing)

    feature_name_statistics = statistics.features.add()
    feature_name_statistics.name = feature_name

    # TODO(afrozm): This can be skipped, since type information was added to
    # the Schema.
    feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
        feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

    if feature.type == schema_pb2.INT or feature.type == schema_pb2.FLOAT:
      numeric_statistics = statistics_pb2.NumericStatistics()
      # min and max are always recorded together. They are absent when no
      # value was observed for this feature (e.g. every example held an empty
      # array), in which case the proto fields keep their defaults.
      if feature_name in feature_to_min:
        numeric_statistics.min = feature_to_min[feature_name]
        numeric_statistics.max = feature_to_max[feature_name]
      numeric_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
    else:
      # Let's shove it into BytesStatistics for now.
      bytes_statistics = statistics_pb2.BytesStatistics()
      bytes_statistics.common_stats.CopyFrom(common_statistics)
      feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

  return statistics, schema
def _load_csv_files(manual_dir, dictionary_of_csv_files):
  """Builds the ground-truth dictionary from a set of CSV files.

  Every CSV row describes one abnormality. Rows whose mask cannot be matched
  to the full image size are logged and skipped; each remaining row is turned
  into an example and handed to `_append_example_to_data`.

  Args:
    manual_dir: Path of the directory containing the images.
    dictionary_of_csv_files: Dictionary containing the key and filepath of each
      CSV file to load.

  Returns:
    A dictionary containing the ground-truth loaded from the CSV files.

  Raises:
    ValueError: If an image referenced by a row does not exist, or if a row has
      neither the calcification nor the mass columns.
  """
  image_columns = ('image file path', 'ROI mask file path',
                   'cropped image file path')
  # Data maps patients -> examples -> list of abnormalities
  data = {}
  for csv_key, csv_path in sorted(dictionary_of_csv_files.items()):
    with tf.io.gfile.GFile(csv_path, 'r') as csv_file:
      # Rows are numbered from 2 for the error messages below.
      for line_number, raw_row in enumerate(csv.DictReader(csv_file), 2):
        # Strip all cells.
        row = {column: cell.strip() for column, cell in raw_row.items()}
        # Construct example ID from the study and series IDs.
        example_id = _DCIM_REGEX.sub(r'\g<study>/\g<series>',
                                     row['image file path'])
        # Get the local path to each image: the CSV refers to .dcm files, the
        # files under manual_dir are expected to be .png.
        for column in image_columns:
          png_relative_path = row[column].replace('.dcm', '.png')
          local_path = os.path.join(manual_dir, *png_relative_path.split('/'))
          row[column] = local_path
          if not tf.io.gfile.exists(local_path):
            raise ValueError(
                'Error processing line %d from csv file %s: '
                'Image %r does not exist!' % (line_number, csv_path,
                                              local_path))

        mask_file_path = row['ROI mask file path']
        crop_file_path = row['cropped image file path']
        full_image = _read_image(row['image file path'])
        mask_image = _read_image(mask_file_path)
        crop_image = _read_image(crop_file_path)
        if full_image.shape == crop_image.shape:
          # TODO(jpuigcerver): THIS ASSUMES THAT THE CROP/MASK COLUMNS ARE JUST
          # REVERSED. I've checked that this is the case for a couple of rows,
          # but this issue happens a lot across all CSV files. Contact the
          # owners of the dataset to ask about this problem.
          mask_file_path, crop_file_path = crop_file_path, mask_file_path
        elif full_image.shape != mask_image.shape:
          # TODO(jpuigcerver): Contact the owners of the dataset to ask about
          # this problem.
          logging.error(
              'Error processing line %d from csv file %s: No suitable mask for '
              'the given image (expected size: %r, candidate sizes: %r). '
              'This abnormality will NOT be included in the dataset.',
              line_number, csv_path, full_image.shape,
              [mask_image.shape, crop_image.shape])
          continue

        abnormality = {
            'id': int(row['abnormality id']),
            'mask': mask_file_path,
            'assessment': row['assessment'],
            'pathology': row['pathology'],
            'subtlety': row['subtlety'],
        }
        # The abnormality kind is inferred from which columns the CSV has.
        has_calc_columns = all(
            column in row for column in ('calc type', 'calc distribution'))
        has_mass_columns = all(
            column in row for column in ('mass shape', 'mass margins'))
        if has_calc_columns:
          abnormality.update(
              type='calc',
              calc_type=row['calc type'],
              calc_distribution=row['calc distribution'])
        elif has_mass_columns:
          abnormality.update(
              type='mass',
              mass_shape=row['mass shape'],
              mass_margins=row['mass margins'])
        else:
          raise ValueError('CSV file is missing required columns.')

        _append_example_to_data(
            data,
            {
                'id': example_id,
                'breast': row['left or right breast'],
                'patient': row['patient_id'],
                'image': row['image file path'],
                'view': row['image view'],
                'abnormalities': [abnormality],
                # Note: Useful to know whether the example is from train or
                # test.
                'csv_key': csv_key,
            })

  return data
def create_optimizer_from_flags(
    prefix: Text,
    overrides: Optional[Mapping[Text, Union[Text, float, int, bool]]] = None
) -> tf.keras.optimizers.Optimizer:
  """Returns an optimizer based on prefixed flags.

  This method is intended to be paired with `define_optimizer_flags` using the
  same `prefix`, to allow Python binaries to construct TensorFlow optimizers
  parameterized by commandline flags.

  This method expects at least two flags to have been defined:
    *  `--<prefix>_optimizer=<optimizer name>`
    *  `--<prefix>_learning_rate`

  In addition to suites of flags for each optimizer:
    * `--<prefix>_<optimizer name>_<constructor_argument>`

  For example, if `prefix='client'` this method first reads the flags:
    * `--client_optimizer`
    * `--client_learning_rate`

  If the optimizer flag is `'sgd'`, then a `tf.keras.optimizer.SGD` optimizer is
  constructed using the values in the flags prefixed with `--client_sgd_`.

  NOTE: `kwargs` can be set using the `overrides` parameter.

  Precedence for each constructor argument, highest first: a flag value set by
  the user, then an entry in `overrides`, then the flag's default value.

  Args:
    prefix: The same string prefix passed to `define_optimizer_flags`.
    overrides: A mapping of `(string, value)` pairs that should override default
      flag values (but not user specified values from the commandline).

  Returns:
    A `tf.keras.optimizers.Optimizer`.

  Raises:
    TypeError: If `overrides` is given but is not a mapping.
    ValueError: If the optimizer flag has no value, names an unsupported
      optimizer, or if flags belonging to a different optimizer were set.
  """
  # `collections.Mapping` was removed in Python 3.10; the ABC lives in
  # `collections.abc`. Imported locally so this function does not depend on
  # the module having imported the submodule.
  from collections import abc as collections_abc  # pylint: disable=g-import-not-at-top

  if overrides is None:
    overrides = {}
  elif not isinstance(overrides, collections_abc.Mapping):
    raise TypeError(
        '`overrides` must be a value of type `collections.Mapping`, '
        'found type: {!s}'.format(type(overrides)))

  def prefixed(basename):
    return '{}_{}'.format(prefix, basename) if prefix else basename

  optimizer_flag_name = prefixed('optimizer')
  # `flags.FLAGS[name]` is the Flag object (never None); what has to be checked
  # is whether the flag carries a value.
  optimizer_name = flags.FLAGS[optimizer_flag_name].value
  if optimizer_name is None:
    raise ValueError('Must specify flag --{!s}'.format(optimizer_flag_name))
  optimizer_cls = _SUPPORTED_OPTIMIZERS.get(optimizer_name)
  if optimizer_cls is None:
    # To support additional optimizers, implement it as a
    # `tf.keras.optimizers.Optimizer` and add to the `_SUPPORTED_OPTIMIZERS`
    # dict.
    logging.error(
        'Unknown optimizer [%s], known optimziers are [%s]. To add '
        'support for an optimizer, add the optimzier class to the '
        'utils_impl._SUPPORTED_OPTIMIZERS list.', optimizer_name,
        list(_SUPPORTED_OPTIMIZERS.keys()))
    raise ValueError(
        '`{!s}` is not a valid optimizer for flag --{!s}, must be '
        'one of {!s}. See error log for details.'.format(
            optimizer_name, optimizer_flag_name,
            list(_SUPPORTED_OPTIMIZERS.keys())))

  def _has_user_value(flag):
    """Check if a commandline flag has a user set value."""
    return flag.present or flag.value != flag.default

  # Validate that the optimizers that weren't picked don't have flag values set.
  # Settings that won't be used likely means there is an expectation gap between
  # the user and the system and we should notify them.
  #
  # The trailing underscore keeps optimizers whose name is a prefix of another
  # optimizer's name (e.g. `adam` / `adamax`) from matching each other's flags.
  unused_flag_prefixes = tuple(
      prefixed(name) + '_'
      for name in _SUPPORTED_OPTIMIZERS
      if name != optimizer_name)
  mistakenly_set_flags = [
      flag_name for flag_name in flags.FLAGS
      if flag_name.startswith(unused_flag_prefixes) and
      _has_user_value(flags.FLAGS[flag_name])
  ]
  if mistakenly_set_flags:
    raise ValueError('Commandline flags for optimizers other than [{!s}] '
                     '(value of --{!s}) are set. These would be ignored, '
                     'were the flags set by mistake? Flags: {!s}'.format(
                         optimizer_name, optimizer_flag_name,
                         mistakenly_set_flags))

  flag_prefix = prefixed(optimizer_name) + '_'
  kwargs = dict(overrides)
  learning_rate_flag = flags.FLAGS[prefixed('learning_rate')]
  if _has_user_value(learning_rate_flag):
    kwargs['learning_rate'] = learning_rate_flag.value
  for flag_name in flags.FLAGS:
    if not flag_name.startswith(flag_prefix):
      continue
    arg_name = flag_name[len(flag_prefix):]
    flag = flags.FLAGS[flag_name]
    if arg_name in overrides and not _has_user_value(flag):
      # The flag still has its default value, so the override takes
      # precedence, as documented.
      continue
    kwargs[arg_name] = flag.value
  return optimizer_cls(**kwargs)
def train(self,
          train_input_fn: Callable[[params_dict.ParamsDict], tf.data.Dataset],
          eval_input_fn: Callable[[params_dict.ParamsDict],
                                  tf.data.Dataset] = None,
          model_dir: Text = None,
          total_steps: int = 1,
          iterations_per_loop: int = 1,
          train_metric_fn: Callable[[], Any] = None,
          eval_metric_fn: Callable[[], Any] = None,
          summary_writer_fn: Callable[[Text, Text],
                                      SummaryWriter] = SummaryWriter,
          init_checkpoint: Callable[[tf.keras.Model], Any] = None,
          custom_callbacks: List[tf.keras.callbacks.Callback] = None,
          save_config: bool = True):
  """Runs distributed training.

  Builds the model inside the distribution strategy scope, restores from the
  latest checkpoint in `model_dir` (or from `init_checkpoint`), then runs
  training in loops of up to `iterations_per_loop` steps until `total_steps` is
  reached, writing summaries, saving checkpoints and evaluating along the way.

  Args:
    train_input_fn: (params: dict) -> tf.data.Dataset training data input
      function.
    eval_input_fn: (Optional) same type as train_input_fn. If not None, will
      trigger evaluating metric on eval data. If None, will not run eval step.
    model_dir: the folder path for model checkpoints.
    total_steps: total training steps.
    iterations_per_loop: train steps per loop. After each loop, this job will
      update metrics like loss and save checkpoint.
    train_metric_fn: metric_fn for evaluation in train_step.
    eval_metric_fn: metric_fn for evaluation in test_step.
    summary_writer_fn: function to create summary writer.
    init_checkpoint: function to load checkpoint.
    custom_callbacks: A list of Keras Callbacks objects to run during
      training. More specifically, `on_batch_begin()`, `on_batch_end()`,
      methods are invoked during training.
    save_config: bool. Whether to save params to model_dir.

  Returns:
    A tuple `(train_loss, eval_metric_result)`: the loss of the last training
    loop (a dict, or None if no loop ran) and the result of the last
    evaluation (None if no evaluation ran).

  Raises:
    ValueError: If a metric function is given but not callable, if the model
      returned by `model_fn` has no `optimizer` attribute, or if the total
      loss becomes NaN.
  """
  assert train_input_fn is not None
  if train_metric_fn and not callable(train_metric_fn):
    raise ValueError('if `train_metric_fn` is specified, '
                     'train_metric_fn must be a callable.')
  if eval_metric_fn and not callable(eval_metric_fn):
    raise ValueError('if `eval_metric_fn` is specified, '
                     'eval_metric_fn must be a callable.')
  # Fall back to the module's `_no_metric` so the metric functions can be
  # called unconditionally below.
  train_metric_fn = train_metric_fn or _no_metric
  eval_metric_fn = eval_metric_fn or _no_metric

  # Callbacks are invoked once per loop (see below), not once per step, hence
  # the warning when a loop spans more than one step.
  if custom_callbacks and iterations_per_loop != 1:
    logging.error(
        'It is sematically wrong to run callbacks when '
        'iterations_per_loop is not one (%s)', iterations_per_loop)

  def _run_callbacks_on_batch_begin(batch):
    """Runs custom callbacks at the start of every step."""
    if not custom_callbacks:
      return
    for callback in custom_callbacks:
      if callback:
        callback.on_batch_begin(batch)

  def _run_callbacks_on_batch_end(batch):
    """Runs custom callbacks at the end of every step."""
    if not custom_callbacks:
      return
    for callback in custom_callbacks:
      if callback:
        callback.on_batch_end(batch)

  if save_config:
    self._save_config(model_dir)

  # A truthy --save_checkpoint_freq overrides the default of saving once per
  # loop.
  if FLAGS.save_checkpoint_freq:
    save_freq = FLAGS.save_checkpoint_freq
  else:
    save_freq = iterations_per_loop

  params = self._params
  strategy = self._strategy
  # To reduce unnecessary send/receive input pipeline operation, we place
  # input pipeline ops in worker task.
  train_iterator = self._get_input_iterator(train_input_fn, strategy)
  train_loss = None
  eval_metric_result = None
  with strategy.scope():
    # To correctly place the model weights on accelerators,
    # model and optimizer should be created in scope.
    model = self.model_fn(params.as_dict())
    if not hasattr(model, 'optimizer'):
      raise ValueError('User should set optimizer attribute to model '
                       'inside `model_fn`.')
    optimizer = model.optimizer

    # Training loop starts here.
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
    # An existing checkpoint in model_dir takes precedence over
    # `init_checkpoint`. `initial_step` is only used for the log message.
    initial_step = 0
    if latest_checkpoint_file:
      logging.info(
          'Checkpoint file %s found and restoring from '
          'checkpoint', latest_checkpoint_file)
      checkpoint.restore(latest_checkpoint_file)
      initial_step = optimizer.iterations.numpy()
      logging.info('Loading from checkpoint file completed. Init step %d',
                   initial_step)
    elif init_checkpoint:
      logging.info('Restoring from init checkpoint function')
      init_checkpoint(model)
      logging.info('Loading from init checkpoint file completed')

    # Training resumes from whatever step the optimizer is at.
    current_step = optimizer.iterations.numpy()
    checkpoint_name = self.checkpoint_name

    eval_metric = eval_metric_fn()
    train_metric = train_metric_fn()
    train_summary_writer = summary_writer_fn(model_dir, 'eval_train')
    test_summary_writer = summary_writer_fn(model_dir, 'eval_test')

  # Continue training loop.
  train_step = self._create_train_step(
      strategy=strategy,
      model=model,
      loss_fn=self.loss_fn(),
      optimizer=optimizer,
      metric=train_metric)
  # Evaluation needs both an eval input function and an eval metric.
  test_step = None
  if eval_input_fn and eval_metric:
    test_step = self._create_test_step(strategy, model, metric=eval_metric)

  logging.info('Training started')
  last_save_checkpoint_step = current_step
  while current_step < total_steps:

    num_steps = _steps_to_run(current_step, total_steps, iterations_per_loop)
    _run_callbacks_on_batch_begin(current_step)
    train_loss = train_step(train_iterator,
                            tf.convert_to_tensor(num_steps, dtype=tf.int32))
    _run_callbacks_on_batch_end(current_step)
    current_step += num_steps

    # Convert the loss tensor(s) to Python floats and normalize to a dict with
    # a 'total_loss' entry.
    train_loss = tf.nest.map_structure(lambda x: x.numpy().astype(float),
                                       train_loss)
    if not isinstance(train_loss, dict):
      train_loss = {'total_loss': train_loss}

    if np.isnan(train_loss['total_loss']):
      raise ValueError('total loss is NaN.')

    if train_metric:
      train_metric_result = train_metric.result()
      if isinstance(train_metric, tf.keras.metrics.Metric):
        train_metric_result = tf.nest.map_structure(
            lambda x: x.numpy().astype(float), train_metric_result)
      if not isinstance(train_metric_result, dict):
        train_metric_result = {'metric': train_metric_result}
      train_metric_result.update(train_loss)
    else:
      # Note: this aliases `train_loss`, so the learning-rate entry added below
      # also ends up in the returned `train_loss` dict.
      train_metric_result = train_loss
    # `optimizer.lr` may be a schedule (callable on the step) or a variable.
    if callable(optimizer.lr):
      train_metric_result.update(
          {'learning_rate': optimizer.lr(current_step).numpy()})
    else:
      train_metric_result.update({'learning_rate': optimizer.lr.numpy()})
    logging.info('Train Step: %d/%d  / loss = %s / training metric = %s',
                 current_step, total_steps, train_loss, train_metric_result)

    train_summary_writer(
        metrics=train_metric_result, step=optimizer.iterations)

    # Saves model checkpoints and run validation steps at every
    # iterations_per_loop steps.
    # To avoid repeated model saving, we do not save after the last
    # step of training.
    if save_freq > 0 and current_step < total_steps and (
        current_step - last_save_checkpoint_step) >= save_freq:
      _save_checkpoint(checkpoint, model_dir,
                       checkpoint_name.format(step=current_step))
      last_save_checkpoint_step = current_step

    if test_step:
      # A fresh eval iterator is created for every evaluation.
      eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
      eval_metric_result = self._run_evaluation(test_step, current_step,
                                                eval_metric, eval_iterator)
      logging.info('Step: %s evalation metric = %s.', current_step,
                   eval_metric_result)
      test_summary_writer(
          metrics=eval_metric_result, step=optimizer.iterations)

    # Re-initialize evaluation metric, except the last step.
    if eval_metric and current_step < total_steps:
      eval_metric.reset_states()
    if train_metric and current_step < total_steps:
      train_metric.reset_states()

  # Reaches the end of training and saves the last checkpoint.
  if last_save_checkpoint_step < total_steps:
    _save_checkpoint(checkpoint, model_dir,
                     checkpoint_name.format(step=current_step))

  if test_step:
    logging.info('Running final evaluation after training is complete.')
    eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
    eval_metric_result = self._run_evaluation(test_step, current_step,
                                              eval_metric, eval_iterator)
    logging.info('Final evaluation metric = %s.', eval_metric_result)
    test_summary_writer(
        metrics=eval_metric_result, step=optimizer.iterations)

  return train_loss, eval_metric_result
mask = mask_factory.create_mask(FLAGS.mask_type, base_model, mask_rng, FLAGS.mask_sparsity) if jax.host_id() == 0: mask_stats = symmetry.get_mask_stats(mask) logging.info('Mask stats: %s', str(mask_stats)) for label, value in mask_stats.items(): try: summary_writer.scalar(f'mask/{label}', value, 0) # This is needed because permutations (long int) can't be cast to float32. except (OverflowError, ValueError): summary_writer.text(f'mask/{label}', str(value), 0) logging.error('Could not write mask/%s to tensorflow summary as float32' ', writing as string instead.', label) if FLAGS.dump_json: mask_stats['permutations'] = str(mask_stats['permutations']) utils.dump_dict_json( mask_stats, path.join(experiment_dir, 'mask_stats.json')) mask = masked.propagate_masks(mask) if jax.host_id() == 0: mask_stats = symmetry.get_mask_stats(mask) logging.info('Propagated mask stats: %s', str(mask_stats)) for label, value in mask_stats.items(): try:
def _launch_aip_training(
    job_id: Text,
    project: Text,
    training_input: Dict[Text, Any],
    job_labels: Optional[Dict[Text, Text]] = None) -> None:
  """Submits an AI Platform custom training job and blocks until it ends.

  Args:
    job_id: the job ID of the AI Platform training job.
    project: the GCP project under which the training job will be executed.
    training_input: Training input argument for AI Platform training job. See
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput
        for the detailed schema.
    job_labels: the dict of labels that will be attached to this job.

  Raises:
    RuntimeError: if the Google Cloud AI Platform training job failed/cancelled.
    ConnectionError: if the status polling of the training job failed due to
      connection issue.
  """
  unsuccessful_states = ('FAILED', 'CANCELLED')
  final_states = ('SUCCEEDED',) + unsuccessful_states

  # Configure AI Platform training job
  api_client = discovery.build('ml', 'v1')
  project_id = 'projects/{}'.format(project)
  job_name = '{}/jobs/{}'.format(project_id, job_id)

  # Submit job to AIP Training
  logging.info('TrainingInput=%s', training_input)
  logging.info('Submitting job=\'%s\', project=\'%s\' to AI Platform.', job_id,
               project)
  api_client.projects().jobs().create(
      body={
          'jobId': job_id,
          'trainingInput': training_input,
          'labels': job_labels,
      },
      parent=project_id).execute()

  # Wait for AIP Training job to finish
  request = api_client.projects().jobs().get(name=job_name)
  response = request.execute()

  # Monitors the long-running operation by polling the job state periodically,
  # and retries the polling when a transient connectivity issue is encountered.
  #
  # Long-running operation monitoring:
  #   The possible states of "get job" response can be found at
  #   https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs#State
  #   where SUCCEEDED/FAILED/CANCELLED are considered to be final states.
  #   The following logic will keep polling the state of the job until the job
  #   enters a final state.
  #
  # During the polling, if a connection error was encountered, the GET request
  # will be retried by recreating the Python API client to refresh the lifecycle
  # of the connection being used. See
  # https://github.com/googleapis/google-api-python-client/issues/218
  # for a detailed description of the problem. If the error persists for
  # _CONNECTION_ERROR_RETRY_LIMIT consecutive attempts, the function will raise
  # ConnectionError.
  consecutive_failures = 0
  while response['state'] not in final_states:
    time.sleep(_POLLING_INTERVAL_IN_SECONDS)
    try:
      response = request.execute()
    except ConnectionError as err:
      # Handle transient connection error.
      if consecutive_failures >= _CONNECTION_ERROR_RETRY_LIMIT:
        logging.error('Request failed after %s retries.',
                      _CONNECTION_ERROR_RETRY_LIMIT)
        raise
      consecutive_failures += 1
      logging.warning(
          'ConnectionError (%s) encountered when polling job: %s. Trying to '
          'recreate the API client.', err, job_id)
      # Recreate the Python API client.
      api_client = discovery.build('ml', 'v1')
      request = api_client.projects().jobs().get(name=job_name)
    else:
      # A successful poll resets the count of consecutive failures.
      consecutive_failures = 0

  if response['state'] in unsuccessful_states:
    err_msg = 'Job \'{}\' did not succeed.  Detailed response {}.'.format(
        job_name, response)
    logging.error(err_msg)
    raise RuntimeError(err_msg)

  # AIP training complete
  logging.info('Job \'%s\' successful.', job_name)
def main(argv):
  """Generates a cleartext keyset, or encrypts/decrypts a file with one.

  The behaviour is selected by `FLAGS.mode`:
    * 'generate': creates a new AES128-GCM keyset and writes it, in cleartext
      JSON, to `FLAGS.keyset_path`.
    * 'encrypt' / 'decrypt': reads the keyset from `FLAGS.keyset_path`, applies
      the operation to the contents of `FLAGS.input_path` and writes the result
      to `FLAGS.output_path`.

  Args:
    argv: Unused positional command-line arguments.

  Returns:
    0 on success, 1 if a Tink operation failed or the mode is not supported.
  """
  del argv  # Unused.

  # Initialise Tink
  try:
    aead.register()
  except tink.TinkError as e:
    logging.error('Error initialising Tink: %s', e)
    return 1

  if FLAGS.mode == 'generate':
    # [START generate-a-new-keyset]
    # Generate a new keyset
    try:
      key_template = aead.aead_key_templates.AES128_GCM
      keyset_handle = tink.KeysetHandle.generate_new(key_template)
    except tink.TinkError as e:
      logging.exception('Error creating primitive: %s', e)
      return 1
    # [END generate-a-new-keyset]

    # [START store-a-cleartext-keyset]
    with open(FLAGS.keyset_path, 'wt') as keyset_file:
      try:
        cleartext_keyset_handle.write(
            tink.JsonKeysetWriter(keyset_file), keyset_handle)
      except tink.TinkError as e:
        logging.exception('Error writing key: %s', e)
        return 1
    return 0
    # [END store-a-cleartext-keyset]

  # Use the input keyset to encrypt/decrypt data

  # Read the keyset into a keyset_handle
  with open(FLAGS.keyset_path, 'rt') as keyset_file:
    try:
      text = keyset_file.read()
      keyset_handle = cleartext_keyset_handle.read(tink.JsonKeysetReader(text))
    except tink.TinkError as e:
      logging.exception('Error reading key: %s', e)
      return 1

  # Get the primitive
  try:
    cipher = keyset_handle.primitive(aead.Aead)
  except tink.TinkError as e:
    logging.error('Error creating primitive: %s', e)
    return 1

  with open(FLAGS.input_path, 'rb') as input_file:
    input_data = input_file.read()

  # Encryption/decryption can fail with TinkError as well (for example when
  # decrypting data that was not produced with this keyset); report it like
  # every other Tink failure instead of letting it escape as a traceback.
  try:
    if FLAGS.mode == 'decrypt':
      output_data = cipher.decrypt(input_data, b'envelope_example')
    elif FLAGS.mode == 'encrypt':
      output_data = cipher.encrypt(input_data, b'envelope_example')
    else:
      logging.error(
          'Error mode not supported. Please choose "encrypt" or "decrypt".')
      return 1
  except tink.TinkError as e:
    logging.exception('Error during %s: %s', FLAGS.mode, e)
    return 1

  with open(FLAGS.output_path, 'wb') as output_file:
    output_file.write(output_data)
  return 0
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

    Builds the detection network and, depending on `mode`, either returns a
    prediction spec or sets up the losses, the optimizer, the evaluation
    metrics, the checkpoint-restoring scaffold and the training hooks.

    Args:
      features: the input image tensor with shape [batch_size, height, width, 3].
        The height and width are fixed and equal.
      labels: the input labels in a dictionary. The labels include class targets
        and box targets which are dense label maps. The labels are generated from
        get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file. Note that this
        function writes `params['is_training_bn']` and, in EVAL mode,
        `params['nms_configs']['max_nms_inputs']`.
      model: the model outputs class logits and box regression outputs. Only
        used when `params['use_keras_model']` is false.
      variable_filter_fn: the filter function that takes trainable_variables and
        returns the variable list after applying the filter rule.

    Returns:
      tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
        A plain EstimatorSpec is returned in PREDICT mode and whenever
        `params['strategy']` is not 'tpu'.

    Raises:
      RuntimeError: if both ckpt and backbone_ckpt are set.
      ValueError: if `params['optimizer']` is neither 'adam' nor 'sgd'.
    """
    utils.image('input_image', features)
    training_hooks = []
    params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN)

    if params['use_keras_model']:

        def model_fn(inputs):
            # The Keras network returns per-level lists; convert them to dicts
            # keyed by feature level so both code paths look the same below.
            model = efficientdet_keras.EfficientDetNet(
                config=hparams_config.Config(params))
            cls_out_list, box_out_list = model(inputs, params['is_training_bn'])
            cls_outputs, box_outputs = {}, {}
            for i in range(params['min_level'], params['max_level'] + 1):
                cls_outputs[i] = cls_out_list[i - params['min_level']]
                box_outputs[i] = box_out_list[i - params['min_level']]
            return cls_outputs, box_outputs
    else:
        model_fn = functools.partial(model, config=hparams_config.Config(params))

    precision = utils.get_precision(params['strategy'], params['mixed_precision'])
    cls_outputs, box_outputs = utils.build_model_with_precision(
        precision, model_fn, features, params['is_training_bn'])

    # Everything downstream (losses, predictions) works on float32 outputs.
    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)
        if params['iou_loss_type']:
            utils.scalar('trainloss/box_iou_loss', box_iou_loss)
        train_epochs = tf.cast(global_step,
                               tf.float32) / params['steps_per_epoch']
        utils.scalar('train_epochs', train_epochs)

    # `ema` / `ema_vars` are only defined when a decay is configured; every later
    # use is guarded by the same `moving_average_decay` check.
    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        if params['gradient_checkpointing']:
            from third_party.grad_checkpoint import memory_saving_gradients  # pylint: disable=import-outside-toplevel
            from tensorflow.python.ops import gradients  # pylint: disable=import-outside-toplevel

            # monkey patch tf.gradients to point to our custom version,
            # with automatic checkpoint selection
            def gradients_(ys, xs, grad_ys=None, **kwargs):
                return memory_saving_gradients.gradients(
                    ys,
                    xs,
                    grad_ys,
                    checkpoints=params['gradient_checkpointing_list'],
                    **kwargs)

            gradients.__dict__["gradients"] = gradients_

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', None):
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(params['clip_gradients_norm'])
                clipped_grads = [
                    tf.clip_by_norm(g, clip_norm) if g is not None else None
                    for g in grads
                ]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                utils.scalar('gradient_norm',
                             tf.linalg.global_norm(clipped_grads))
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars, global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        # The moving averages are updated only after the optimizer step has run.
        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            if params['nms_configs'].get('pyfunc', True):
                # NumPy NMS is applied image by image through numpy_function.
                detections_bs = []
                for index in range(kwargs['boxes'].shape[0]):
                    nms_configs = params['nms_configs']
                    detections = tf.numpy_function(
                        functools.partial(nms_np.per_class_nms,
                                          nms_configs=nms_configs),
                        [
                            kwargs['boxes'][index],
                            kwargs['scores'][index],
                            kwargs['classes'][index],
                            tf.slice(kwargs['image_ids'], [index], [1]),
                            tf.slice(kwargs['image_scales'], [index], [1]),
                            params['num_classes'],
                            nms_configs['max_output_size'],
                        ], tf.float32)
                    detections_bs.append(detections)
                detections_bs = postprocess.transform_detections(
                    tf.stack(detections_bs))
            else:
                # These two branches should be equivalent, but currently they are not.
                # TODO(tanmingxing): enable the non_pyfun path after bug fix.
                nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
                    params, kwargs['boxes'], kwargs['scores'], kwargs['classes'],
                    kwargs['image_scales'])
                img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                  nms_scores.dtype)
                # Columns: image id, box column 1, box column 0, then the
                # differences (column 3 - column 1) and (column 2 - column 0),
                # followed by score and class.
                detections_bs = [
                    img_ids * tf.ones_like(nms_scores),
                    nms_boxes[:, :, 1],
                    nms_boxes[:, :, 0],
                    nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                    nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                    nms_scores,
                    nms_classes,
                ]
                detections_bs = tf.stack(detections_bs,
                                         axis=-1,
                                         name='detnections')

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                eval_metric = coco_metric.EvaluationMetric(
                    testdev_dir=params['testdev_dir'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, tf.zeros([1]))
            else:
                logging.info('Eval val with groudtruths %s.',
                             params['val_json_file'])
                eval_metric = coco_metric.EvaluationMetric(
                    filename=params['val_json_file'],
                    label_map=params['label_map'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, kwargs['groundtruth_data'])

            # Add metrics to output.
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        # The scalar losses are tiled to [batch_size, 1] before being handed to
        # metric_fn together with the other per-batch inputs.
        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])

        cls_outputs = postprocess.to_list(cls_outputs)
        box_outputs = postprocess.to_list(box_outputs)
        params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS
        boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                     box_outputs)
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'image_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
            'boxes': boxes,
            'scores': scores,
            'classes': classes,
        }
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(
                ckpt_path=checkpoint,
                ckpt_scope=ckpt_scope,
                var_scope=var_scope,
                skip_mismatch=params['skip_mismatch'])

            tf.train.init_from_checkpoint(checkpoint, var_map)
            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f', moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    if params['strategy'] != 'tpu':
        # Profile every 1K steps.
        if params.get('profile', False):
            profile_hook = tf.estimator.ProfilerHook(
                save_steps=1000,
                output_dir=params['model_dir'],
                show_memory=True)
            training_hooks.append(profile_hook)

            # Report memory allocation if OOM
            class OomReportingHook(tf.estimator.SessionRunHook):

                def before_run(self, run_context):
                    return tf.estimator.SessionRunArgs(
                        fetches=[],
                        options=tf.RunOptions(
                            report_tensor_allocations_upon_oom=True))

            training_hooks.append(OomReportingHook())

        logging_hook = tf.estimator.LoggingTensorHook(
            {
                'step': global_step,
                'det_loss': det_loss,
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            },
            every_n_iter=params.get('iterations_per_loop', 100),
        )
        training_hooks.append(logging_hook)

        if params["nvgpu_logging"]:
            try:
                from third_party.tools.nvgpu import gpu_memory_util_message  # pylint: disable=import-outside-toplevel

                mem_message = tf.py_func(gpu_memory_util_message, [],
                                         [tf.string])[0]
                logging_hook_nvgpu = tf.estimator.LoggingTensorHook(
                    tensors={
                        "mem_message": mem_message,
                    },
                    every_n_iter=params.get('iterations_per_loop', 100),
                    formatter=lambda x: x["mem_message"].decode("utf-8"),
                )
                training_hooks.append(logging_hook_nvgpu)
            except Exception:  # pylint: disable=broad-except
                # GPU memory logging is best-effort, so any failure while
                # setting it up is only logged. `Exception` (rather than a bare
                # `except`) keeps KeyboardInterrupt/SystemExit propagating.
                logging.error("nvgpu error: nvidia-smi format not recognized")

    if params['strategy'] == 'tpu':
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metrics,
            host_call=utils.get_tpu_host_call(global_step, params),
            scaffold_fn=scaffold_fn,
            training_hooks=training_hooks)
    else:
        # EstimatorSpec wants the evaluated metric ops rather than the
        # (fn, inputs) pair that TPUEstimatorSpec takes.
        eval_metric_ops = (eval_metrics[0](
            **eval_metrics[1]) if eval_metrics else None)
        # NOTE(review): the return value is discarded here; presumably the call
        # is kept for its side effects - confirm against utils.
        utils.get_tpu_host_call(global_step, params)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
            scaffold=scaffold_fn() if scaffold_fn else None,
            training_hooks=training_hooks)
def collect_trajectories(env,
                         policy_net_apply,
                         policy_net_params,
                         num_trajectories=1,
                         policy="greedy",
                         epsilon=0.1):
    """Roll out `num_trajectories` episodes in `env` using the policy net.

    Args:
        env: environment exposing `reset()` and `step(action)`; `step` returns
            an `(observation, reward, done, info)` tuple.
        policy_net_apply: callable `(params, observation_history)` returning
            predictions with a leading batch axis and a time axis.
        policy_net_params: parameters forwarded to `policy_net_apply`.
        num_trajectories: number of episodes to collect.
        policy: one of "greedy", "epsilon-greedy" or "categorical-sampling".
        epsilon: probability of a random action under "epsilon-greedy".

    Returns:
        A list with one `(observations, actions, rewards)` tuple per episode,
        where `observations` has the batch axis squeezed out and contains one
        more entry than `actions`/`rewards`.

    Raises:
        ValueError: if `policy` is not one of the supported names.
        TypeError: if the selected action cannot be converted to an int.
    """

    def pick_action(scores):
        # Map the last-step predictions to an action for the chosen behaviour.
        if policy == "greedy":
            return np.argmax(scores)
        if policy == "epsilon-greedy":
            # A schedule for epsilon is 1/k where k is the episode number
            # sampled.
            explore = onp.random.random() < epsilon
            if explore:
                # Choose an action at random.
                return onp.random.randint(0, high=len(scores))
            # Otherwise take the best action.
            return np.argmax(scores)
        if policy == "categorical-sampling":
            draw = onp.random.multinomial(1, scores)
            return onp.argwhere(draw == 1)
        raise ValueError("Unknown policy: %s" % policy)

    collected = []
    for _ in range(num_trajectories):
        episode_rewards, episode_actions = [], []
        first_observation = env.reset()
        # Starts out shaped (1, 1) + OBS; every step appends along the time
        # axis, so it ends up as (1, T+1) + OBS.
        observation_history = first_observation[np.newaxis, np.newaxis, :]

        done = False
        while not done:
            # The net output is (1, t, A) because the history is (1, t) + OBS;
            # drop the batch axis and keep only the most recent time-step.
            net_out = policy_net_apply(policy_net_params, observation_history)
            predictions = np.squeeze(net_out, axis=0)[-1]

            action = pick_action(predictions)

            # NOTE: Assumption, single batch.
            try:
                action = int(action)
            except TypeError as err:
                # Let's dump some information before we die off.
                logging.error("Cannot convert action into an integer: [%s]", err)
                logging.error("action.shape: [%s]", action.shape)
                logging.error("action: [%s]", action)
                logging.error("predictions.shape: [%s]", predictions.shape)
                logging.error("predictions: [%s]", predictions)
                logging.error("observation_history: [%s]", observation_history)
                logging.error("policy_net_params: [%s]", policy_net_params)
                log_params(policy_net_params, "policy_net_params")
                raise err

            next_observation, reward, done, _ = env.step(action)

            # The new observation is shaped OBS, so give it batch and time axes
            # before appending it on the time dimension.
            expanded = next_observation[np.newaxis, np.newaxis, :]
            observation_history = np.concatenate(
                [observation_history, expanded], axis=1)

            episode_rewards.append(reward)
            episode_actions.append(action)

        # This means we are done
        assert done
        # observation_history is (1, T+1) + OBS, lets squeeze out the batch dim.
        observation_history = np.squeeze(observation_history, axis=0)
        collected.append((observation_history, np.stack(episode_actions),
                          np.stack(episode_rewards)))

    return collected
def main(argv):
    """Run MAML meta-training configured entirely through command-line flags.

    Creates the checkpoint directory, records the flags next to the
    checkpoints, builds the data loaders, model, meta-learner, optimizer and
    loss, then hands everything to `meta_training`.

    Args:
        argv: Unused positional command-line arguments.

    Exits with status 1 when `--mode` is not one of the supported modes.
    """
    del argv  # Unused.

    logging.info("Starting MAML training with %s dataset.", FLAGS.source)

    # Fail fast: reject an unsupported mode before creating directories,
    # loading data or building the model.
    if FLAGS.mode not in ("binary_classification", "regression"):
        logging.error(
            "--mode %s is not supported. "
            "Choose from ['binary_classification', 'regression'].", FLAGS.mode)
        sys.exit(1)

    ckpt_save_path = os.path.join(FLAGS.save_path, "ckpts")
    os.makedirs(ckpt_save_path, exist_ok=True)

    logging.info("Setting seed...")
    torch_utils.set_seed(FLAGS.seed)

    metadata = [
        f.serialize() for f in FLAGS.get_key_flags_for_module(sys.argv[0])
    ]
    metadata = [m for m in metadata if m]  # remove empty flags
    metadata = "\n\t" + "\n\t".join(metadata)
    logging.info("Current parameters: %s", metadata)

    # Persist the flags alongside the checkpoints.
    flag_file = os.path.join(ckpt_save_path, "flagfile.txt")
    FLAGS.append_flags_into_file(flag_file)
    logging.info("Flags are stored to %s", flag_file)

    logging.info("Loading data...")
    loaders = dataloaders.get_loaders(
        source_path=FLAGS.source, inner_batch_size=FLAGS.inner_batch_size
    )

    logging.info("Instantiating model and optimizers...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GatedGraphNeuralNetwork(
        n_edge=1,
        in_dim=75,
        n_conv=FLAGS.n_conv,
        fc_dims=[FLAGS.fc_dims, 1],
        p_dropout=0.0,
    )
    if FLAGS.init_path is not None:
        logging.info("Loading initializations from %s", FLAGS.init_path)
        # NOTE: torch.load unpickles the file and replaces the freshly built
        # model; only point --init_path at trusted checkpoints.
        model = torch.load(FLAGS.init_path)
    model = model.to(device)
    meta_learner = MAML(
        model,
        lr=FLAGS.inner_lr,
        first_order=FLAGS.first_order,
        anil=FLAGS.anil,
    )
    optimizer = optim.Adam(meta_learner.parameters(), FLAGS.meta_lr)

    if FLAGS.mode == "binary_classification":
        # pos_weight is the mean, over the meta-train loaders, of each
        # dataset's sum(y) / len(y).
        pos_weight = torch.tensor(
            [
                l.dataset.y.sum() / len(l.dataset.y)
                for l in loaders["meta_train"]["train"]
            ]
        ).mean()
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    else:  # "regression"; the mode was validated above.
        criterion = nn.MSELoss()

    metrics = FLAGS.metrics.split(",") if FLAGS.metrics else []

    logging.info("Begin training!")
    meta_training(
        meta_learner=meta_learner,
        meta_steps=FLAGS.meta_steps,
        meta_batch_size=FLAGS.meta_batch_size,
        loaders=loaders,
        optimizer=optimizer,
        criterion=criterion,
        inner_steps=FLAGS.inner_steps,
        device=device,
        save_path=ckpt_save_path,
        ckpt_steps=FLAGS.ckpt_steps,
        metrics=metrics,
    )
def _wait_for_processes(self, wait_processes, kill_processes, timeout_secs):
    """Polls `wait_processes` until all have exited, then kills `kill_processes`.

    Every process in `wait_processes` must exit with return code 0, otherwise
    an assertion fails. Processes in `kill_processes` are expected to keep
    running until this method kills them; one that exits early also fails an
    assertion.

    Args:
      wait_processes: A list of _ProcessInfo tuples to wait on.
      kill_processes: A list of _ProcessInfo tuples that are killed once all
        of `wait_processes` have finished.
      timeout_secs: Seconds to wait before giving up.

    Returns:
      A list of strings holding the stderr of each wait process, in the same
      order as `wait_processes`.

    Raises:
      Exception: When waiting for tasks to finish times out.
    """

    def read_stderr(process):
        # Rewind first so the complete captured stderr is returned.
        process.stderr.seek(0)
        return process.stderr.read()

    countdown = _CountDownTimer(timeout_secs)
    total = len(wait_processes)
    stderr_by_index = [None] * total
    done_indices = set()
    # Seconds each process has been observed running, advanced by the polling
    # interval every time it is found still alive.
    elapsed_secs = {process: 0.0 for process in wait_processes}

    while len(done_indices) < total:
        if countdown.secs_remaining() == 0:
            logging.error(
                "Timed out! Outputting logs of unfinished processes:")
            for index, process in enumerate(wait_processes):
                if index not in done_indices:
                    stderr_by_index[index] = read_stderr(process)
                    logging.info(
                        "stderr for incomplete %s (last %d chars): %s\n",
                        process.name, MAX_OUTPUT_CHARS,
                        stderr_by_index[index][-MAX_OUTPUT_CHARS:])
            raise Exception("Timed out waiting for tasks to complete.")

        for index, process in enumerate(wait_processes):
            if index in done_indices:
                continue
            exit_code = process.popen.poll()
            if exit_code is None:
                # Still running: report progress each time the elapsed time
                # reaches a whole multiple of ten seconds.
                elapsed_secs[process] += 0.25
                tens = elapsed_secs[process] / 10.
                if (tens - int(tens)) == 0:
                    logging.info("%d secs has elapsed for %s",
                                 elapsed_secs[process], process.name)
                continue

            logging.info("%s finished", process.name)
            stderr_by_index[index] = read_stderr(process)
            logging.info("stderr for %s (last %d chars): %s\n", process.name,
                         MAX_OUTPUT_CHARS,
                         stderr_by_index[index][-MAX_OUTPUT_CHARS:])
            self.assertEqual(0, exit_code)
            done_indices.add(index)

        for process in kill_processes:
            exit_code = process.popen.poll()
            # Kill processes should not end until we kill them.
            # If one returned early, note the return code before failing.
            if exit_code is not None:
                logging.error("kill process %s ended with ret_code %d",
                              process.name, exit_code)
                early_stderr = read_stderr(process)
                logging.info("stderr for %s (last %d chars): %s\n",
                             process.name, MAX_OUTPUT_CHARS,
                             early_stderr[-MAX_OUTPUT_CHARS:])
            self.assertIsNone(exit_code)

        # Delay between polling loops.
        time.sleep(0.25)

    logging.info("All wait processes finished")
    for process in kill_processes:
        # Kill each kill process and wait for it to be reaped.
        process.popen.kill()
        process.popen.wait()
        final_stderr = read_stderr(process)
        logging.info("stderr for %s (last %d chars): %s\n", process.name,
                     MAX_OUTPUT_CHARS, final_stderr[-MAX_OUTPUT_CHARS:])

    return stderr_by_index
def run_ncf(_):
    """Run NCF training and eval with Keras.

    Configures the session, mixed precision and distribution strategy from
    FLAGS, builds the input pipeline and model, then trains and evaluates
    either with a custom training loop or with `Model.fit`/`Model.evaluate`.

    Returns:
      The result of `build_stats(train_loss, eval_results, time_callback)`, or
      None when the flag combination is rejected up front.
    """
    keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    model_helpers.apply_clean(FLAGS)

    if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16",
            loss_scale=flags_core.get_loss_scale(FLAGS,
                                                 default_for_fp16="dynamic"))
        tf.keras.mixed_precision.experimental.set_policy(policy)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        tpu_address=FLAGS.tpu)

    params = ncf_common.parse_flags(FLAGS)
    params["distribute_strategy"] = strategy

    # Reject unsupported flag combinations before any expensive setup.
    if not keras_utils.is_v2_0() and strategy is not None:
        logging.error("NCF Keras only works with distribution strategy in TF 2.0")
        return
    if (params["keras_use_ctl"] and (
            not keras_utils.is_v2_0() or strategy is None)):
        logging.error(
            "Custom training loop only works with tensorflow 2.0 and dist strat.")
        return
    if params["use_tpu"] and not params["keras_use_ctl"]:
        logging.error("Custom training loop must be used when using TPUStrategy.")
        return

    batch_size = params["batch_size"]
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        # Start data producing thread.
        num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)

    (train_input_dataset, eval_input_dataset,
     num_train_steps, num_eval_steps) = \
        (ncf_input_pipeline.create_ncf_input_data(
            params, producer, input_meta_data, strategy))
    steps_per_epoch = None if generate_input_online else num_train_steps

    # The Keras fit path only sets a training loss when a non-empty history is
    # returned; start from None so `build_stats` is never reached with an
    # unbound name.
    train_loss = None

    with distribution_utils.get_strategy_scope(strategy):
        keras_model = _get_keras_model(params)
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=params["learning_rate"],
            beta_1=params["beta1"],
            beta_2=params["beta2"],
            epsilon=params["epsilon"])
        if FLAGS.fp16_implementation == "graph_rewrite":
            optimizer = \
                tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    optimizer,
                    loss_scale=flags_core.get_loss_scale(
                        FLAGS, default_for_fp16="dynamic"))
        elif FLAGS.dtype == "fp16" and params["keras_use_ctl"]:
            # When keras_use_ctl is False, instead Model.fit() automatically applies
            # loss scaling so we don't need to create a LossScaleOptimizer.
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer,
                tf.keras.mixed_precision.experimental.global_policy().loss_scale)

        if params["keras_use_ctl"]:
            train_loss, eval_results = run_ncf_custom_training(
                params,
                strategy,
                keras_model,
                optimizer,
                callbacks,
                train_input_dataset,
                eval_input_dataset,
                num_train_steps,
                num_eval_steps,
                generate_input_online=generate_input_online)
        else:
            keras_model.compile(optimizer=optimizer,
                                run_eagerly=FLAGS.run_eagerly)

            if not FLAGS.ml_perf:
                # Create Tensorboard summary and checkpoint callbacks.
                summary_dir = os.path.join(FLAGS.model_dir, "summaries")
                summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
                checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint")
                checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                    checkpoint_path, save_weights_only=True)

                callbacks += [summary_callback, checkpoint_callback]

            history = keras_model.fit(
                train_input_dataset,
                epochs=FLAGS.train_epochs,
                steps_per_epoch=steps_per_epoch,
                callbacks=callbacks,
                validation_data=eval_input_dataset,
                validation_steps=num_eval_steps,
                verbose=2)

            logging.info("Training done. Start evaluating")

            eval_loss_and_metrics = keras_model.evaluate(
                eval_input_dataset, steps=num_eval_steps, verbose=2)

            logging.info("Keras evaluation is done.")

            # Keras evaluate() API returns scalar loss and metric values from
            # evaluation as a list. Here, the returned list would contain
            # [evaluation loss, hr sum, hr count].
            eval_hit_rate = eval_loss_and_metrics[1] / eval_loss_and_metrics[2]

            # Format evaluation result into [eval loss, eval hit accuracy].
            eval_results = [eval_loss_and_metrics[0], eval_hit_rate]

            if history and history.history:
                train_history = history.history
                train_loss = train_history["loss"][-1]

    stats = build_stats(train_loss, eval_results, time_callback)
    return stats
def generate(self) -> List[task_lib.Task]:
    """Generates tasks for executing the next executable nodes in the pipeline.

    The pipeline nodes are walked layer by layer in topological order. The
    returned tasks must have `exec_task` populated. List may be empty if no
    nodes are ready for execution. A single finalize task is returned instead
    when a service node fails, when task generation yields a finalize task, or
    when every node of the last layer has completed.

    Returns:
      A `list` of tasks to execute.
    """

    def parents_of(node):
        return [self._node_map[name] for name in node.upstream_nodes]

    def children_of(node):
        return [self._node_map[name] for name in node.downstream_nodes]

    pipeline_nodes = [entry.pipeline_node for entry in self._pipeline.nodes]
    layers = topsort.topsorted_layers(
        pipeline_nodes,
        get_node_id_fn=lambda node: node.node_info.id,
        get_parent_nodes=parents_of,
        get_child_nodes=children_of)

    tasks = []
    for layer_index, layer_nodes in enumerate(layers):
        # Ids of the nodes in this layer that are known to have completed
        # successfully.
        done_ids = set()
        for node in layer_nodes:
            node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
            node_id = node.node_info.id

            is_service_node = self._service_job_manager.is_pure_service_node(
                self._pipeline_state, node.node_info.id)
            if is_service_node:
                # Pure service nodes never get an exec task; they are only
                # started/checked once their upstream nodes have executed.
                if self._upstream_nodes_executed(node):
                    status = self._service_job_manager.ensure_node_services(
                        self._pipeline_state, node_id)
                    if status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node completed successfully: %s',
                                     node_uid)
                        done_ids.add(node_id)
                    elif status == service_jobs.ServiceStatus.FAILED:
                        logging.error('Failed service node: %s', node_uid)
                        abort_status = status_lib.Status(
                            code=status_lib.Code.ABORTED,
                            message=
                            (f'Aborting pipeline execution due to service '
                             f'node failure; failed node uid: {node_uid}'))
                        return [
                            task_lib.FinalizePipelineTask(
                                pipeline_uid=self._pipeline_state.pipeline_uid,
                                status=abort_status)
                        ]
                    else:
                        logging.info('Pure service node in progress: %s',
                                     node_uid)
                continue

            # If a task for the node is already tracked by the task queue, it
            # need not be considered for generation again.
            exec_task_id = task_lib.exec_node_task_id_from_pipeline_node(
                self._pipeline, node)
            if self._is_task_id_tracked_fn(exec_task_id):
                continue

            node_executions = task_gen_utils.get_executions(
                self._mlmd_handle, node)
            if task_gen_utils.is_latest_execution_successful(node_executions):
                done_ids.add(node_id)
                continue

            # If all upstream nodes are executed but current node is not
            # executed, the node is deemed ready for execution.
            if not self._upstream_nodes_executed(node):
                continue
            task = self._generate_task(node, node_executions)
            if task_lib.is_finalize_pipeline_task(task):
                return [task]
            tasks.append(task)

        # If there are no completed nodes in the current layer, downstream
        # nodes need not be checked.
        if not done_ids:
            break

        # If all nodes in the final layer are completed successfully, the
        # pipeline can be finalized.
        # TODO(goutham): If there are conditional eval nodes, not all nodes may
        # be executed in the final layer. Handle this case when conditionals
        # are supported.
        is_final_layer = layer_index == len(layers) - 1
        if is_final_layer and done_ids == {
                member.node_info.id for member in layer_nodes
        }:
            return [
                task_lib.FinalizePipelineTask(
                    pipeline_uid=self._pipeline_state.pipeline_uid,
                    status=status_lib.Status(code=status_lib.Code.OK))
            ]

    return tasks
def _check_same(ref: Any, tar: Any, rtol: float, atol: float) -> bool: """Checks that ref and tar have identical datastructures and values.""" # Check for matching types. if not isinstance(tar, type(ref)): logging.error( "Expected ref and tar to have the same type but got '%s' and '%s'", type(ref), type(tar)) return False if ref is None: # Nothing to compare (e.g. the called method had no outputs). return True # Recursive check for dicts. if isinstance(ref, dict): if ref.keys() != tar.keys(): logging.error( "Expected ref and tar to have the same keys, but got '%s' and '%s'", ref.keys(), tar.keys()) return False # Check that all of the dictionaries' values are the same. for key in ref: if not Trace._check_same(ref[key], tar[key], rtol, atol): return False # Recursive check for iterables. elif isinstance(ref, list) or isinstance(ref, tuple): if len(ref) != len(tar): logging.error( "Expected ref and tar to have the same length, but got %s and %s", len(ref), len(tar)) return False # Check that all of the iterables' values are the same. for i in range(len(ref)): if not Trace._check_same(ref[i], tar[i], rtol, atol): return False # Base check for numpy arrays. elif isinstance(ref, np.ndarray): if ref.dtype != tar.dtype: logging.error( "Expected ref and tar to have the same dtype, but got %s and %s", ref.dtype, tar.dtype) return False if np.issubdtype(ref.dtype, np.floating): same = np.allclose(ref, tar, rtol=rtol, atol=atol) if not same: abs_diff = np.max(np.abs(ref - tar)) rel_diff = np.max(np.abs(ref - tar) / np.max(tar)) logging.error( "Floating point difference between ref and tar was too large. " "Max abs diff: %s, atol: %s, max relative diff: %s, rtol: %s", abs_diff, atol, rel_diff, rtol) return same else: return np.array_equal(ref, tar) # Base check for native number types. elif isinstance(ref, (int, float)): return ref == tar # If outputs end up here then an extra branch for that type should be added. 
else: raise TypeError( f"Encountered results with unexpected type {type(ref)}") return True
def setup_project(config, project_yaml, output_yaml_path):
    """Runs every setup step for one project, resuming after earlier failures.

    Steps start at the `failed_step` recorded in the project's generated
    fields (or at step 1 when none is recorded). For a project that is already
    deployed, only the steps marked as updatable are run.

    Args:
      config (ProjectConfig): The config of a single project to setup.
      project_yaml (str): Path of the project config YAML.
      output_yaml_path (str): Path to output resulting root config in JSON.

    Returns:
      A boolean, true if the project was deployed successfully, false otherwise.
    """
    project_id = config.project['project_id']
    all_steps = _SETUP_STEPS + config.extra_steps

    generated_fields = field_generation.get_generated_fields_copy(
        project_id, config.root)
    first_step = generated_fields.get('failed_step', 1)
    already_deployed = field_generation.is_deployed(project_id, config.root)

    def write_generated_fields():
        # Writes the generated fields held in config.root back out.
        field_generation.rewrite_generated_fields_back(project_yaml,
                                                       output_yaml_path,
                                                       config.root)

    step_count = len(all_steps)
    for step_num in range(first_step, step_count + 1):
        current = all_steps[step_num - 1]
        project_id = config.project['project_id']
        logging.info('%s: step %d/%d (%s)', project_id, step_num, step_count,
                     current.description)

        if already_deployed and not current.updatable:
            logging.info('Step %d is not updatable, skipping', step_num)
            continue

        try:
            current.func(config)
        except Exception as e:  # pylint: disable=broad-except
            traceback.print_exc()
            logging.error('%s: setup failed on step %s: %s', project_id,
                          step_num, e)
            logging.error(
                'Failure information has been written to --output_yaml_path. '
                'Please ensure the config at --project_yaml is updated with any '
                'changes from the config at --output_yaml_path and re-run the script'
                '(Note: only applicable if --output_yaml_path != --project_yaml)')

            # only record failed step if project was undeployed, an update can
            # always start from the beginning
            if not already_deployed:
                fields_ref = field_generation.get_generated_fields_ref(
                    project_id, config.root)
                fields_ref['failed_step'] = step_num
            write_generated_fields()
            return False

        # Persist whatever the step generated before moving on.
        write_generated_fields()

    # if this deployment was resuming from a previous failure, remove the
    # failed step as it is done
    if field_generation.is_generated_fields_exist(project_id, config.root):
        fields_ref = field_generation.get_generated_fields_ref(
            project_id, config.root, False)
        fields_ref.pop('failed_step', None)
    write_generated_fields()

    logging.info('Setup completed successfully.')
    return True
def answer(self, msg, error=False) -> None:
    """Log `msg` and write it to the response as a JSON object.

    The payload written through `self.write` has the form
    `{"msg": msg, "error": error}`.

    Args:
        msg: The message to report; it must be JSON-serialisable.
        error: Whether the message describes a failure. Failures are logged at
            error level; all other messages are logged at info level so that
            successful responses do not pollute the error log.
    """
    if error:
        logging.error(msg)
    else:
        logging.info(msg)
    self.write(json.dumps({'msg': msg, 'error': error}))
def main(argv):
  """Deploys every requested project described by --project_yaml.

  Loads and validates the root config, collects the projects selected by
  --projects (audit logs project first, then the Forseti project, then the
  regular projects), sets them up in that order and, when a Forseti config is
  present, runs the rule generator. Returns early if the config cannot be
  loaded or validated, or as soon as one project fails to set up.

  Args:
    argv: Unused command-line arguments.
  """
  del argv  # Unused.

  new_style = FLAGS.enable_new_style_resources
  if new_style:
    logging.info('--enable_new_style_resources is true.')

  # Normalize all user-supplied paths up front.
  FLAGS.output_yaml_path = utils.normalize_path(FLAGS.output_yaml_path)
  if FLAGS.output_rules_path:
    FLAGS.output_rules_path = utils.normalize_path(FLAGS.output_rules_path)
  FLAGS.project_yaml = utils.normalize_path(FLAGS.project_yaml)

  if new_style:
    load_config_cmd = [
        FLAGS.load_config_binary,
        '--config_path',
        FLAGS.project_yaml,
    ]
    config_string = runner.run_command(load_config_cmd, get_output=True)
    root_config = ruamel.yaml.YAML().load(config_string)
  else:
    root_config = utils.load_config(FLAGS.project_yaml)

  if not root_config:
    logging.error('Error loading project YAML.')
    return

  logging.info('Validating project YAML against schema.')
  try:
    utils.validate_config_yaml(root_config)
  except jsonschema.exceptions.ValidationError as e:
    logging.error('Error in YAML config: %s', e)
    return

  requested_ids = set(FLAGS.projects)
  deploy_all = requested_ids == {'*'}

  def _is_requested(project_dict):
    """Returns whether the given project dict was selected for deployment."""
    return bool(project_dict) and (
        deploy_all or project_dict['project_id'] in requested_ids)

  selected = []

  # Always deploy the remote audit logs project first (if present).
  audit_logs_project = root_config.get('audit_logs_project')
  if _is_requested(audit_logs_project):
    selected.append(
        ProjectConfig(
            root=root_config,
            project=audit_logs_project,
            audit_logs_project=None,
            extra_steps=[]))

  forseti_config = root_config.get('forseti')
  if forseti_config and _is_requested(forseti_config['project']):
    forseti_project = forseti_config['project']
    forseti_steps = [
        Step(
            func=install_forseti,
            description='Install Forseti',
            updatable=False,
        ),
        get_forseti_access_granter_step(forseti_project['project_id']),
    ]
    if audit_logs_project:
      forseti_steps.append(
          get_forseti_access_granter_step(audit_logs_project['project_id']))
    selected.append(
        ProjectConfig(
            root=root_config,
            project=forseti_project,
            audit_logs_project=audit_logs_project,
            extra_steps=forseti_steps))

  for project_dict in root_config.get('projects', []):
    if not _is_requested(project_dict):
      continue
    granter_steps = []
    if forseti_config:
      granter_steps.append(
          get_forseti_access_granter_step(project_dict['project_id']))
    selected.append(
        ProjectConfig(
            root=root_config,
            project=project_dict,
            audit_logs_project=audit_logs_project,
            extra_steps=granter_steps))

  validate_project_configs(root_config['overall'], selected)

  logging.info('Found %d projects to deploy', len(selected))

  for project in selected:
    logging.info('Setting up project %s', project.project['project_id'])
    succeeded = setup_project(project, FLAGS.project_yaml,
                              FLAGS.output_yaml_path)
    if not succeeded:
      # Don't attempt to deploy additional projects if one project failed.
      return

  if forseti_config:
    if new_style:
      rule_generator_cmd = [
          FLAGS.rule_generator_binary,
          '--project_yaml_path',
          FLAGS.project_yaml,
          '--output_path',
          FLAGS.output_rules_path or '',
      ]
      logging.info('Running rule generator: %s', rule_generator_cmd)
      utils.call_go_binary(rule_generator_cmd)
    else:
      rule_generator.run(root_config, output_path=FLAGS.output_rules_path)

  logging.info(
      'All projects successfully deployed. Please remember to sync '
      'any changes written to the config at --output_yaml_path with '
      '--project_yaml before running the script again (Note: only applicable '
      'if --output_yaml_path != --project_yaml)')
HHblits default: False. alt: Show up to this many alternative alignments. p: Minimum Prob for a hit to be included in the output hhr file. HHblits default: 20. z: Hard cap on number of hits reported in the hhr file. HHblits default: 500. NB: The relevant HHblits flag is -Z not -z. Raises: RuntimeError: If HHblits binary not found within the path. """ self.binary_path = binary_path self.databases = databases for database_path in self.databases: if not glob.glob(database_path + '_*'): logging.error('Could not find HHBlits database %s', database_path) raise ValueError( f'Could not find HHBlits database {database_path}') self.n_cpu = n_cpu self.n_iter = n_iter self.e_value = e_value self.maxseq = maxseq self.realign_max = realign_max self.maxfilt = maxfilt self.min_prefilter_hits = min_prefilter_hits self.all_seqs = all_seqs self.alt = alt self.p = p self.z = z
# ==============================================================================
import csv
import sys
from absl import logging


def to_standard_format(input_file, output_file):
    """Convert a CSV file into tab-separated `label<TAB>text` lines.

    The first column of each row is used as the label; all remaining columns
    are joined with single spaces to form the text. Rows with fewer than four
    columns are skipped.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the UTF-8 text file to write (overwritten).
    """
    logging.info("Save file to %s", output_file)
    # newline="" lets the csv module do its own newline handling, as the csv
    # documentation requires for files passed to csv.reader.
    with open(input_file, encoding="utf-8", newline="") as csv_file, \
            open(output_file, "w", encoding="utf-8") as out_file:
        for row in csv.reader(csv_file):
            if len(row) < 4:
                continue
            label = row[0]
            text = " ".join(row[1:])
            out_file.write(label + "\t" + text + "\n")


if __name__ == '__main__':
    logging.set_verbosity(logging.INFO)
    if len(sys.argv) != 3:
        logging.error("Usage %s input_file output_file", sys.argv[0])
        sys.exit(-1)
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    to_standard_format(input_file, output_file)
def main(_):
  """Runs the FLAGS-driven image retraining flow.

  Validates --image_dir and the image classes found under it, builds the module
  graph plus the ops returned by `add_final_retrain_ops`, runs
  `FLAGS.how_many_training_steps` training steps with periodic evaluation and
  optional intermediate exports, then writes the final checkpoint, graph,
  labels file and (if --saved_model_dir is set) an exported model.

  Args:
    _: Unused positional argument.

  Returns:
    -1 if --image_dir is missing or fewer than two image classes were found;
    otherwise None.
  """
  logging_verbosity = logging_level_verbosity(FLAGS.logging_verbosity)
  logging.set_verbosity(logging_verbosity)

  # Deprecation notice; emitted at error level.
  logging.error('WARNING: This tool is deprecated in favor of '
                'https://github.com/tensorflow/hub/tree/master/'
                'tensorflow_hub/tools/make_image_classifier')

  if not FLAGS.image_dir:
    logging.error('Must set flag --image_dir.')
    return -1

  prepare_file_system()

  image_lists = create_image_lists(FLAGS.image_dir, FLAGS.testing_percentage,
                                   FLAGS.validation_percentage)
  # One key per class; at least two classes are required below.
  class_count = len(image_lists.keys())
  if class_count == 0:
    logging.error('No valid folders of images found at %s', FLAGS.image_dir)
    return -1
  if class_count == 1:
    logging.error('Only one valid folder of images found at %s '
                  ' - multiple classes are needed for classification.',
                  FLAGS.image_dir)
    return -1

  do_distort_images = should_distort_images(
      FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
      FLAGS.random_brightness)

  module_spec = hub.load_module_spec(FLAGS.tfhub_module)
  graph, bottleneck_tensor, resized_image_tensor, wants_quantization = (
      create_module_graph(module_spec))

  # The retraining ops are added to the module graph created above.
  with graph.as_default():
    (train_step, cross_entropy, bottleneck_input,
     ground_truth_input, final_tensor) = add_final_retrain_ops(
         class_count, FLAGS.final_tensor_name, bottleneck_tensor,
         wants_quantization, is_training=True)

  with tf.Session(graph=graph) as sess:
    init = tf.global_variables_initializer()
    sess.run(init)

    jpeg_data_tensor, decoded_image_tensor = add_jpeg_decoding(module_spec)

    if do_distort_images:
      (distorted_jpeg_data_tensor,
       distorted_image_tensor) = add_input_distortions(
           FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
           FLAGS.random_brightness, module_spec)
    else:
      # Without distortions, cache_bottlenecks is called once up front; the
      # training loop below then reads via get_random_cached_bottlenecks.
      cache_bottlenecks(sess, image_lists, FLAGS.image_dir,
                        FLAGS.bottleneck_dir, jpeg_data_tensor,
                        decoded_image_tensor, resized_image_tensor,
                        bottleneck_tensor, FLAGS.tfhub_module)

    evaluation_step, _ = add_evaluation_step(final_tensor, ground_truth_input)

    # Merge all summaries and write them under FLAGS.summaries_dir.
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)

    validation_writer = tf.summary.FileWriter(
        FLAGS.summaries_dir + '/validation')

    train_saver = tf.train.Saver()

    for i in range(FLAGS.how_many_training_steps):
      # Fetch one training batch, either freshly distorted or from the cache.
      if do_distort_images:
        (train_bottlenecks,
         train_ground_truth) = get_random_distorted_bottlenecks(
             sess, image_lists, FLAGS.train_batch_size, 'training',
             FLAGS.image_dir, distorted_jpeg_data_tensor,
             distorted_image_tensor, resized_image_tensor, bottleneck_tensor)
      else:
        (train_bottlenecks,
         train_ground_truth, _) = get_random_cached_bottlenecks(
             sess, image_lists, FLAGS.train_batch_size, 'training',
             FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
             decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
             FLAGS.tfhub_module)
      # Run one training step and record its summaries.
      train_summary, _ = sess.run(
          [merged, train_step],
          feed_dict={bottleneck_input: train_bottlenecks,
                     ground_truth_input: train_ground_truth})
      train_writer.add_summary(train_summary, i)

      # Evaluate every eval_step_interval steps and always on the last step.
      is_last_step = (i + 1 == FLAGS.how_many_training_steps)
      if (i % FLAGS.eval_step_interval) == 0 or is_last_step:
        train_accuracy, cross_entropy_value = sess.run(
            [evaluation_step, cross_entropy],
            feed_dict={bottleneck_input: train_bottlenecks,
                       ground_truth_input: train_ground_truth})
        logging.info('%s: Step %d: Train accuracy = %.1f%%',
                     datetime.now(), i, train_accuracy * 100)
        logging.info('%s: Step %d: Cross entropy = %f',
                     datetime.now(), i, cross_entropy_value)
        # Validation always uses the cached bottlenecks, even when training
        # uses distorted images.
        validation_bottlenecks, validation_ground_truth, _ = (
            get_random_cached_bottlenecks(
                sess, image_lists, FLAGS.validation_batch_size, 'validation',
                FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
                decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
                FLAGS.tfhub_module))
        validation_summary, validation_accuracy = sess.run(
            [merged, evaluation_step],
            feed_dict={bottleneck_input: validation_bottlenecks,
                       ground_truth_input: validation_ground_truth})
        validation_writer.add_summary(validation_summary, i)
        logging.info('%s: Step %d: Validation accuracy = %.1f%% (N=%d)',
                     datetime.now(), i, validation_accuracy * 100,
                     len(validation_bottlenecks))

      # Optionally store intermediate results every intermediate_frequency
      # steps (never at step 0).
      intermediate_frequency = FLAGS.intermediate_store_frequency

      if (intermediate_frequency > 0 and (i % intermediate_frequency == 0)
          and i > 0):
        train_saver.save(sess, FLAGS.checkpoint_path)
        # NOTE(review): plain string concatenation; this appears to rely on
        # --intermediate_output_graphs_dir ending with a path separator.
        intermediate_file_name = (FLAGS.intermediate_output_graphs_dir +
                                  'intermediate_' + str(i) + '.pb')
        logging.info('Save intermediate result to : %s',
                     intermediate_file_name)
        save_graph_to_file(intermediate_file_name, module_spec,
                           class_count)

    # Training is complete: force one last save of the train checkpoint.
    train_saver.save(sess, FLAGS.checkpoint_path)

    run_final_eval(sess, module_spec, class_count, image_lists,
                   jpeg_data_tensor, decoded_image_tensor, resized_image_tensor,
                   bottleneck_tensor)

    # Write out the final graph and the class labels (one per line).
    logging.info('Save final result to : %s', FLAGS.output_graph)
    if wants_quantization:
      logging.info('The model is instrumented for quantization with TF-Lite')

    save_graph_to_file(FLAGS.output_graph, module_spec, class_count)
    with tf.gfile.GFile(FLAGS.output_labels, 'w') as f:
      f.write('\n'.join(image_lists.keys()) + '\n')

    if FLAGS.saved_model_dir:
      export_model(module_spec, class_count, FLAGS.saved_model_dir)
def run_ncf(_):
  """Run NCF training and eval with Keras.

  Configuration is read from the absl `FLAGS`. Depending on
  `params["keras_use_ctl"]`, training runs either through a custom training
  loop driven by the distribution strategy or through Keras
  `compile`/`fit`/`evaluate`.

  Args:
    _: Unused positional argument.

  Returns:
    The result of `build_stats(train_loss, eval_results, time_callback)`, or
    None when the TF version / distribution strategy combination is rejected.
  """
  keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

  if FLAGS.seed is not None:
    print("Setting tf seed")
    tf.random.set_seed(FLAGS.seed)

  # TODO(seemuch): Support different train and eval batch sizes
  if FLAGS.eval_batch_size != FLAGS.batch_size:
    logging.warning(
        "The Keras implementation of NCF currently does not support batch_size "
        "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
        "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
    FLAGS.eval_batch_size = FLAGS.batch_size

  params = ncf_common.parse_flags(FLAGS)
  model_helpers.apply_clean(flags.FLAGS)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus)
  params["distribute_strategy"] = strategy

  # Unsupported combinations are logged and abort the run (returning None).
  if not keras_utils.is_v2_0() and strategy is not None:
    logging.error(
        "NCF Keras only works with distribution strategy in TF 2.0")
    return

  if (params["keras_use_ctl"] and
      (not keras_utils.is_v2_0() or strategy is None)):
    logging.error(
        "Custom training loop only works with tensorflow 2.0 and dist strat.")
    return

  # ncf_common rounds eval_batch_size (this is needed due to a reshape during
  # eval). This carries over that rounding to batch_size as well. This is the
  # per device batch size
  params["batch_size"] = params["eval_batch_size"]
  batch_size = params["batch_size"]

  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  callbacks = [time_callback]

  producer, input_meta_data = None, None
  # Input is generated online when no pre-built training dataset path is given.
  generate_input_online = params["train_dataset_path"] is None

  if generate_input_online:
    # Start data producing thread.
    num_users, num_items, num_train_steps, num_eval_steps, producer = (
        ncf_common.get_inputs(params))
    producer.start()
    per_epoch_callback = IncrementEpochCallback(producer)
    callbacks.append(per_epoch_callback)
  else:
    # Offline input: user/item counts come from the JSON meta data file.
    assert params["eval_dataset_path"] and params["input_meta_data_path"]
    with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
      input_meta_data = json.loads(reader.read().decode("utf-8"))
      num_users = input_meta_data["num_users"]
      num_items = input_meta_data["num_items"]

  params["num_users"], params["num_items"] = num_users, num_items
  (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = \
    (ncf_input_pipeline.create_ncf_input_data(
        params, producer, input_meta_data))
  # steps_per_epoch is only passed to Keras fit() for offline input.
  steps_per_epoch = None if generate_input_online else num_train_steps

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])

  if params["keras_use_ctl"]:
    # Custom training loop path.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        reduction="sum",
        from_logits=True)
    train_input_iterator = strategy.make_dataset_iterator(train_input_dataset)
    eval_input_iterator = strategy.make_dataset_iterator(eval_input_dataset)

    @tf.function
    def train_step():
      """Called once per step to train the model."""
      def step_fn(features):
        """Computes loss and applied gradient per replica."""
        with tf.GradientTape() as tape:
          softmax_logits = keras_model(features)
          labels = features[rconst.TRAIN_LABEL_KEY]
          loss = loss_object(labels, softmax_logits,
                             sample_weight=features[rconst.VALID_POINT_MASK])
          # The summed loss is scaled by 1 / (batch_size * number of replicas).
          loss *= (1.0 / (batch_size*strategy.num_replicas_in_sync))

        grads = tape.gradient(loss, keras_model.trainable_variables)
        # Converting gradients to dense form helps in perf on GPU for NCF
        grads = neumf_model.sparse_to_dense_grads(
            list(zip(grads, keras_model.trainable_variables)))
        optimizer.apply_gradients(grads)
        return loss

      per_replica_losses = strategy.experimental_run(step_fn,
                                                     train_input_iterator)
      # Per-replica losses are combined with a SUM reduction.
      mean_loss = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
      return mean_loss

    @tf.function
    def eval_step():
      """Called once per eval step to compute eval metrics."""
      def step_fn(features):
        """Computes eval metrics per replica."""
        softmax_logits = keras_model(features)
        in_top_k, metric_weights = metric_fn(
            softmax_logits, features[rconst.DUPLICATE_MASK], params)
        hr_sum = tf.reduce_sum(in_top_k*metric_weights)
        hr_count = tf.reduce_sum(metric_weights)
        return hr_sum, hr_count

      per_replica_hr_sum, per_replica_hr_count = (
          strategy.experimental_run(step_fn, eval_input_iterator))
      hr_sum = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_hr_sum, axis=None)
      hr_count = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
      return hr_sum, hr_count

    time_callback.on_train_begin()
    for epoch in range(FLAGS.train_epochs):
      for cb in callbacks:
        cb.on_epoch_begin(epoch)

      # As NCF dataset is sampled with randomness, not repeating
      # data elements in each epoch has significant impact on
      # convergence. As so, offline-generated TF record files
      # contains all epoch worth of data. Thus we do not need
      # to initialize dataset when reading from tf record files.
      if generate_input_online:
        train_input_iterator.initialize()

      train_loss = 0
      for step in range(num_train_steps):
        # The batch index passed to the time callback is global across epochs.
        time_callback.on_batch_begin(step+epoch*num_train_steps)
        train_loss += train_step()
        time_callback.on_batch_end(step+epoch*num_train_steps)
      train_loss /= num_train_steps
      logging.info("Done training epoch %s, epoch loss=%s.",
                   epoch+1, train_loss)
      eval_input_iterator.initialize()
      hr_sum = 0
      hr_count = 0
      for _ in range(num_eval_steps):
        step_hr_sum, step_hr_count = eval_step()
        hr_sum += step_hr_sum
        hr_count += step_hr_count
      logging.info("Done eval epoch %s, hr=%s.", epoch+1, hr_sum/hr_count)

      # Stop as soon as the hit rate exceeds the configured threshold.
      if (FLAGS.early_stopping and
          float(hr_sum/hr_count) > params["hr_threshold"]):
        break

    time_callback.on_train_end()
    # NOTE(review): hr_sum, hr_count and train_loss are only bound inside the
    # epoch loop, so they are undefined here if FLAGS.train_epochs is 0.
    eval_results = [None, hr_sum/hr_count]

  else:
    # Keras compile/fit/evaluate path.
    with distribution_utils.get_strategy_scope(strategy):
      # TODO(b/138957587): Remove when force_v2_in_keras_compile is on longer
      # a valid arg for this model. Also remove as a valid flag.
      if FLAGS.force_v2_in_keras_compile is not None:
        keras_model.compile(
            optimizer=optimizer,
            run_eagerly=FLAGS.run_eagerly,
            experimental_run_tf_function=FLAGS.force_v2_in_keras_compile)
      else:
        keras_model.compile(
            optimizer=optimizer,
            run_eagerly=FLAGS.run_eagerly)

      history = keras_model.fit(
          train_input_dataset,
          epochs=FLAGS.train_epochs,
          steps_per_epoch=steps_per_epoch,
          callbacks=callbacks,
          validation_data=eval_input_dataset,
          validation_steps=num_eval_steps,
          verbose=2)

      logging.info("Training done. Start evaluating")

      eval_results = keras_model.evaluate(
          eval_input_dataset,
          steps=num_eval_steps,
          verbose=2)

      logging.info("Keras evaluation is done.")

    # NOTE(review): train_loss stays unbound on this path when the history is
    # empty, which would make the build_stats call below raise NameError.
    if history and history.history:
      train_history = history.history
      train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats
def main(argv):
  """Generates a cleartext keyset, or encrypts/decrypts a file with one.

  Args:
    argv: Command line, either `[prog, 'generate', key_file]` or
      `[prog, 'encrypt'|'decrypt', key_file, input_file, output_file]`.

  Returns:
    0 after a keyset was generated and written, 1 if Tink initialisation,
    keyset generation, keyset I/O or primitive creation fails, and None after
    a successful encrypt/decrypt.

  Raises:
    app.UsageError: If the argument count or the mode is invalid, or if
      encrypt/decrypt is requested without input and output files.
  """
  if len(argv) != 3 and len(argv) != 5:
    raise app.UsageError(
        'Invalid arguments.\n'
        'Usage: %s generate key-file.\n'
        'Usage: %s encrypt/decrypt key-file '
        'input-file output-file.' % (argv[0], argv[0])
    )

  mode = argv[1]
  if mode not in ('encrypt', 'decrypt', 'generate'):
    raise app.UsageError(
        'The first argument should be either encrypt, decrypt or generate')

  # encrypt/decrypt need the input and output paths; without this check the
  # three-argument form would fail later while trying to open `None`.
  if mode != 'generate' and len(argv) != 5:
    raise app.UsageError(
        'Invalid arguments.\n'
        'Usage: %s %s key-file input-file output-file.' % (argv[0], mode))

  key_file_path = argv[2]
  input_file_path = argv[3] if len(argv) == 5 else None
  output_file_path = argv[4] if len(argv) == 5 else None

  # Initialise Tink
  try:
    aead.register()
  except tink.TinkError as e:
    logging.error('Error initialising Tink: %s', e)
    return 1

  if mode == 'generate':
    # [START generate-a-new-keyset]
    # Generate a new keyset
    try:
      key_template = aead.aead_key_templates.AES128_GCM
      keyset_handle = tink.KeysetHandle.generate_new(key_template)
    except tink.TinkError as e:
      logging.exception('Error creating primitive: %s', e)
      return 1
    # [END generate-a-new-keyset]

    # [START store-a-cleartext-keyset]
    with open(key_file_path, 'wt') as keyset_file:
      try:
        cleartext_keyset_handle.write(
            tink.JsonKeysetWriter(keyset_file), keyset_handle)
      except tink.TinkError as e:
        logging.exception('Error writing key: %s', e)
        return 1
    return 0
    # [END store-a-cleartext-keyset]

  # Use the input keyset to encrypt/decrypt data

  # Read the keyset into a keyset_handle
  with open(key_file_path, 'rt') as keyset_file:
    try:
      text = keyset_file.read()
      keyset_handle = cleartext_keyset_handle.read(tink.JsonKeysetReader(text))
    except tink.TinkError as e:
      logging.exception('Error reading key: %s', e)
      return 1

  # Get the primitive
  try:
    cipher = keyset_handle.primitive(aead.Aead)
  except tink.TinkError as e:
    logging.error('Error creating primitive: %s', e)
    return 1

  with open(input_file_path, 'rb') as input_file:
    input_data = input_file.read()
    # `mode` was validated above, so only 'decrypt' and 'encrypt' reach here.
    if mode == 'decrypt':
      output_data = cipher.decrypt(input_data, b'envelope_example')
    else:
      output_data = cipher.encrypt(input_data, b'envelope_example')

    with open(output_file_path, 'wb') as output_file:
      output_file.write(output_data)
def launch(self) -> Optional[data_types.ExecutionInfo]:
  """Runs the node end to end: driver, executor and publisher.

  System nodes are handed to the system node handler and its result is
  returned directly. For other nodes the execution is prepared first; if the
  preparation result says no execution is needed, nothing is run.

  Returns:
    The metadata of this execution that is registered in MLMD. It can be None
    if the driver decides not to run the execution.

  Raises:
    Exception: If the executor fails. The failure is published before the
      original exception is re-raised.
  """
  logging.info('Running launcher for %s', self._pipeline_node)

  if self._system_node_handler:
    # System nodes bypass the driver/executor/publisher flow entirely.
    return self._system_node_handler.run(self._mlmd_connection,
                                         self._pipeline_node,
                                         self._pipeline_info,
                                         self._pipeline_runtime_spec)

  # Regular node: prepare the execution, then decide whether to run it.
  prepared = self._prepare_execution()
  execution_info = prepared.execution_info
  contexts = prepared.contexts
  is_execution_needed = prepared.is_execution_needed

  if not is_execution_needed:
    return execution_info

  watcher = None
  try:
    if self._executor_operator:
      # Create an execution watcher and hand it an in-memory copy of the
      # Execution object. The executor operator is called in process, so the
      # watcher and the launcher do not race on writes to MLMD.
      watcher = execution_watcher.ExecutionWatcher(
          port=portpicker.pick_unused_port(),
          mlmd_connection=self._mlmd_connection,
          execution=prepared.execution_metadata,
          creds=grpc.local_server_credentials())
      self._executor_operator.with_execution_watcher(watcher.address)
      watcher.start()
    executor_output = self._run_executor(execution_info)
  except Exception as e:  # pylint: disable=broad-except
    if isinstance(e, _ExecutionFailedError):
      failed_output = e.executor_output
    else:
      failed_output = None
    self._publish_failed_execution(execution_info.execution_id, contexts,
                                   failed_output)
    logging.error('Execution %d failed.', execution_info.execution_id)
    raise
  finally:
    # Runs on both the success and the failure path.
    self._clean_up_stateless_execution_info(execution_info)
    if watcher:
      watcher.stop()

  logging.info('Execution %d succeeded.', execution_info.execution_id)
  self._clean_up_stateful_execution_info(execution_info)

  # TODO(b/182316162): Unify publisher handing so that post-execution
  # artifact logic is more cleanly handled.
  # Note that currently both the ExecutionInfo and ExecutorOutput are
  # consulted in `execution_publish_utils.publish_succeeded_execution()`.
  outputs_utils.tag_executor_output_with_version(executor_output)
  outputs_utils.tag_output_artifacts_with_version(execution_info.output_dict)
  logging.info('Publishing output artifacts %s for execution %s',
               execution_info.output_dict, execution_info.execution_id)
  self._publish_successful_execution(execution_info.execution_id, contexts,
                                     execution_info.output_dict,
                                     executor_output)
  return execution_info
# you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Loads icp op.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from absl import logging import tensorflow as tf try: icp_op_module = tf.load_op_library('./ops/icp_op.so') icp = icp_op_module.icp except Exception: # pylint: disable=broad-except try: icp_op_module = tf.load_op_library('./icp_op.so') icp = icp_op_module.icp except Exception: logging.error('Could not load object file for ICP op.') icp = None
def main(argv):
  """Checks that the records in --dataset_records_path match the dataset spec.

  Four checks are performed, and every failure is logged at error level before
  a single exception is raised at the end:
    1. The dataset name matches the name of the containing directory.
    2. The set of tfrecords files matches the files expected from the spec.
    3. Each file contains the expected number of examples.
    4. Each file only contains the label of its own class.

  Args:
    argv: Command-line arguments; only the program name is accepted.

  Raises:
    app.UsageError: If extra command-line arguments are given.
    ValueError: If at least one check failed.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  # Report failed checks as they occur and maintain a counter, instead of
  # raising exceptions right away, so all issues can be reported at once.
  num_failed_checks = 0

  # Load dataset_spec, this should fail if it is absent or incorrect.
  if FLAGS.dataset_spec_file is None:
    dataset_spec = dataset_spec_lib.load_dataset_spec(
        FLAGS.dataset_records_path)
  else:
    with tf.io.gfile.GFile(FLAGS.dataset_spec_file, 'r') as f:
      dataset_spec = json.load(
          f, object_hook=dataset_spec_lib.as_dataset_spec)

  dataset_spec.initialize()

  # 1. Check dataset name
  dir_name = os.path.basename(os.path.abspath(FLAGS.dataset_records_path))
  if dataset_spec.name != dir_name:
    num_failed_checks += 1
    logging.error(
        'The dataset name in "dataset_spec.json" (%s) does not match '
        'the name of the directory containing it (%s)', dataset_spec.name,
        dir_name)

  # 2. Check name and number of .tfrecords files
  num_classes = len(dataset_spec.class_names)
  try:
    expected_filenames = [
        dataset_spec.file_pattern.format(class_id)
        for class_id in range(num_classes)
    ]
  except IndexError:
    # str.format raises IndexError when the pattern has more positional
    # placeholders than the single class id supplied.
    num_failed_checks += 1
    err_msg = (
        'The `file_pattern` (%s) did not accept the class number as its only '
        'formatting argument. Using the default (%s).')
    default_pattern = '{}.tfrecords'
    logging.error(err_msg, dataset_spec.file_pattern, default_pattern)

    expected_filenames = [
        default_pattern.format(class_id) for class_id in range(num_classes)
    ]

  all_filenames = tf.io.gfile.listdir(FLAGS.dataset_records_path)
  # Heuristic to exclude obviously-not-tfrecords files.
  tfrecords_filenames = [
      f for f in all_filenames if 'tfrecords' in f.lower()
  ]

  expected_set = set(expected_filenames)
  present_set = set(tfrecords_filenames)
  if expected_set != present_set:
    num_failed_checks += 1
    logging.error(
        'The tfrecords files in %s do not match the dataset_spec.\n'
        'Unexpected files present:\n'
        '%s\n'
        'Expected files not present:\n'
        '%s', FLAGS.dataset_records_path,
        sorted(present_set - expected_set),
        sorted(expected_set - present_set))

  # Iterate through each dataset, count examples and check set of targets.
  # List of (class_id, expected_count, actual_count) triples.
  bad_counts = []
  # List of (filename, class_id, labels).
  bad_labels = []

  for class_id, filename in enumerate(expected_filenames):
    expected_count = dataset_spec.get_total_images_per_class(class_id)
    # Membership is tested against the set rather than the list.
    if filename not in present_set:
      # The tfrecords does not exist, we use a negative count to denote it.
      bad_counts.append((class_id, expected_count, -1))
      bad_labels.append((filename, class_id, set()))
      continue
    full_filepath = os.path.join(FLAGS.dataset_records_path, filename)

    try:
      count, labels = get_count_and_labels(full_filepath,
                                           FLAGS.label_field_name)
    except tf.errors.InvalidArgumentError:
      logging.exception(
          'Unable to find label (%s) in the tf.Examples of file %s. '
          'Maybe try a different --label_field_name.', FLAGS.label_field_name,
          filename)
      # Fall back to counting examples only.
      count = count_records(full_filepath)
      labels = set()
    if count != expected_count:
      bad_counts.append((class_id, expected_count, count))
    if labels != {class_id}:
      # labels could include class_id among other, incorrect labels.
      bad_labels.append((filename, class_id, labels))

  # 3. Check number of examples
  if bad_counts:
    num_failed_checks += 1
    logging.error(
        'The number of tfrecords in the following files do not match '
        'the expected number of examples in that class.\n'
        '(class_id, expected, actual)  # -1 denotes a missing file.\n'
        '%s', bad_counts)

  # 4. Check the targets stored in the tfrecords files.
  if bad_labels:
    num_failed_checks += 1
    logging.error(
        'The labels stored inside the tfrecords (in field %s) do not '
        'all match the expected value (class_id).\n'
        '(filename, class_id, values)\n'
        '%s', FLAGS.label_field_name, bad_labels)

  # Report results
  if num_failed_checks:
    raise ValueError('%d checks failed. See the error-level logs.' %
                     num_failed_checks)
def _ProcessHost(self, d):
  """Escrows the recovery data of one LDAP result to CauliflowerVest.

  The hostname and GUIDs are extracted from the result, the GUID of the parent
  computer object is looked up in LDAP, and the recovery password is uploaded
  together with that metadata.

  Args:
    d: a single ldap.conn.result3() result dictionary.
  Raises:
    InvalidDistinguishedName: the given host had an invalid DN.
    InvalidGuid: the given host had an invalid GUID.
  """
  dn = d['distinguishedName'][0]

  # The distinguishedName has this format:
  #   CN=<timestamp>{<recovery_guid>},CN=<hostname>,OU=Workstations,...
  # so the hostname is the second component without its 'CN=' prefix.
  dn_components = dn.split(',')
  hostname = dn_components[1][len('CN='):]

  # Records with legacy DNs carry invalid RecoveryGUIDs and all have separate
  # valid records, so they are rejected.
  if INVALID_DN_REGEX.search(dn):
    raise InvalidDistinguishedName(dn)

  # Some msFVE-RecoveryGuid values may be invalid; build the GUIDs carefully
  # and reject objects which cannot be parsed.
  try:
    recovery_uuid = uuid.UUID(bytes_le=d['msFVE-RecoveryGuid'][0])
    recovery_guid = str(recovery_uuid).upper()
    volume_uuid = uuid.UUID(bytes_le=d['msFVE-VolumeGuid'][0])
    volume_guid = str(volume_uuid).upper()
  except ValueError:
    raise InvalidGuid(
        '%s: %s' % (hostname, d['msFVE-RecoveryGuid']))

  recovery_password = (
      '******' if FLAGS.redact_recovery_passwords
      else d['msFVE-RecoveryPassword'][0])

  # An unparseable creation timestamp is logged and replaced by ''.
  when_created = d['whenCreated'][0]
  try:
    datetime.datetime.strptime(when_created, '%Y%m%d%H%M%S.0Z')
  except ValueError:
    logging.error('Unknown whenCreated format: %r', when_created)
    when_created = ''

  # msFVE-RecoveryObject distinguishedName is in the form of:
  #   CN=<TIMESTAMP>{<UUID>},CN=<HOSTNAME>,DC=example,DC=com
  # where CN=<HOSTNAME>,.* is the parent's distinguishedName. Dropping the
  # first component therefore yields the parent's DN.
  # Alternatively: parent_dn = dn.replace('CN=%s,' % d['name'][0], '')
  parent_dn = dn.split(',', 1)[1]
  computer_filter = '(&(objectCategory=computer))'
  parent_guid = None
  for parent in self._QueryLdap(
      parent_dn, computer_filter, scope=ldap.SCOPE_BASE):
    parent_guid = str(uuid.UUID(bytes_le=parent['objectGUID'][0])).upper()

  metadata = {
      'hostname': hostname,
      'dn': dn,
      'when_created': when_created,
      'parent_guid': parent_guid,
      'recovery_guid': recovery_guid,
  }
  self.client.UploadPassphrase(volume_guid, recovery_password, metadata)
  logging.info('Escrowed recovery password: %r', volume_guid)
def run_customized_training_loop(
        # pylint: disable=invalid-name
        _sentinel=None,
        # pylint: enable=invalid-name
        strategy=None,
        model_fn=None,
        loss_fn=None,
        model_dir=None,
        train_input_fn=None,
        steps_per_epoch=None,
        steps_per_loop=1,
        epochs=1,
        eval_input_fn=None,
        eval_steps=None,
        metric_fn=None,
        init_checkpoint=None,
        custom_callbacks=None,
        run_eagerly=False,
        sub_model_export_name=None):
    """Run BERT pretrain model training using low-level API.

    Args:
      _sentinel: Used to prevent positional parameters. Internal, do not use.
      strategy: Distribution strategy on which to run low level training
        loop.
      model_fn: Function that returns a tuple (model, sub_model). Caller of
        this function should add optimizer to the `model` via calling
        `model.compile()` API or manually setting `model.optimizer`
        attribute. Second element of the returned tuple (sub_model) is an
        optional sub model to be used for initial checkpoint -- if provided.
      loss_fn: Function with signature func(labels, logits) and returns a
        loss tensor.
      model_dir: Model directory used during training for restoring/saving
        model weights.
      train_input_fn: Function that returns a tf.data.Dataset used for
        training.
      steps_per_epoch: Number of steps to run per epoch. At the end of each
        epoch, model checkpoint will be saved and evaluation will be
        conducted if evaluation dataset is provided.
      steps_per_loop: Number of steps per graph-mode loop. In order to reduce
        communication in eager context, training logs are printed every
        steps_per_loop.
      epochs: Number of epochs to train.
      eval_input_fn: Function that returns evaluation dataset. If none,
        evaluation is skipped.
      eval_steps: Number of steps to run evaluation. Required if
        `eval_input_fn` is not none.
      metric_fn: A metrics function that returns a Keras Metric object to
        record evaluation result using evaluation dataset or with training
        dataset after every epoch.
      init_checkpoint: Optional checkpoint to load to `sub_model` returned by
        `model_fn`.
      custom_callbacks: A list of Keras Callbacks objects to run during
        training. More specifically, `on_batch_begin()`, `on_batch_end()`
        methods are invoked during training.
      run_eagerly: Whether to run model training in pure eager execution.
        This should be disabled for TPUStrategy.
      sub_model_export_name: If not None, will export `sub_model` returned by
        `model_fn` into checkpoint files. The name of intermediate checkpoint
        file is {sub_model_export_name}_step_{step}.ckpt and the last
        checkpoint's name is {sub_model_export_name}.ckpt; if None,
        `sub_model` will not be exported as checkpoint.

    Returns:
      Trained model.

    Raises:
      ValueError: (1) When model returned by `model_fn` does not have
        optimizer attribute or when required parameters are set to none.
        (2) eval args are not specified correctly. (3) metric_fn must be a
        callable if specified. (4) sub_model_export_name is specified, but
        `sub_model` returned by `model_fn` is None.
    """
    if _sentinel is not None:
        raise ValueError('only call `run_customized_training_loop()` '
                         'with named arguments.')

    required_arguments = [
        strategy, model_fn, loss_fn, model_dir, steps_per_epoch,
        train_input_fn
    ]
    if any(arg is None for arg in required_arguments):
        # The message names exactly the arguments validated above.
        raise ValueError('`strategy`, `model_fn`, `loss_fn`, `model_dir`, '
                         '`steps_per_epoch` and `train_input_fn` are '
                         'required parameters.')
    if steps_per_loop > steps_per_epoch:
        logging.error(
            'steps_per_loop: %d is specified to be greater than '
            ' steps_per_epoch: %d, we will use steps_per_epoch as'
            ' steps_per_loop.', steps_per_loop, steps_per_epoch)
        steps_per_loop = steps_per_epoch
    assert tf.executing_eagerly()

    if run_eagerly:
        if steps_per_loop > 1:
            raise ValueError(
                'steps_per_loop is used for performance optimization. '
                'When you want '
                'to run eagerly, you cannot leverage graph mode loop.')
        if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
            raise ValueError(
                'TPUStrategy should not run eagerly as it heavily replies on graph'
                ' optimization for the distributed system.')

    if eval_input_fn and (eval_steps is None or metric_fn is None):
        raise ValueError(
            '`eval_step` and `metric_fn` are required when `eval_input_fn ` '
            'is not none.')
    if metric_fn and not callable(metric_fn):
        raise ValueError(
            'if `metric_fn` is specified, metric_fn must be a callable.')

    total_training_steps = steps_per_epoch * epochs

    # To reduce unnecessary send/receive input pipeline operation, we place
    # input pipeline ops in worker task.
    train_iterator = _get_input_iterator(train_input_fn, strategy)

    with distribution_utils.get_strategy_scope(strategy):
        # To correctly place the model weights on accelerators,
        # model and optimizer should be created in scope.
        model, sub_model = model_fn()
        if not hasattr(model, 'optimizer'):
            raise ValueError('User should set optimizer attribute to model '
                             'inside `model_fn`.')
        if sub_model_export_name and sub_model is None:
            raise ValueError('sub_model_export_name is specified as %s, but '
                             'sub_model is None.' % sub_model_export_name)

        optimizer = model.optimizer
        use_float16 = isinstance(
            optimizer,
            tf.keras.mixed_precision.experimental.LossScaleOptimizer)

        if init_checkpoint:
            logging.info(
                'Checkpoint file %s found and restoring from '
                'initial checkpoint for core model.', init_checkpoint)
            checkpoint = tf.train.Checkpoint(model=sub_model)
            checkpoint.restore(
                init_checkpoint).assert_existing_objects_matched()
            logging.info('Loading from checkpoint file completed')

        train_loss_metric = tf.keras.metrics.Mean(
            'training_loss', dtype=tf.float32)
        eval_metrics = [metric_fn()] if metric_fn else []
        # If evaluation is required, make a copy of metric as it will be used
        # by both train and evaluation.
        train_metrics = [
            metric.__class__.from_config(metric.get_config())
            for metric in eval_metrics
        ]

        # Create summary writers
        summary_dir = os.path.join(model_dir, 'summaries')
        eval_summary_writer = tf.summary.create_file_writer(
            os.path.join(summary_dir, 'eval'))
        if steps_per_loop >= _MIN_SUMMARY_STEPS:
            # Only writes summary when the stats are collected sufficiently
            # over enough steps.
            train_summary_writer = tf.summary.create_file_writer(
                os.path.join(summary_dir, 'train'))
        else:
            train_summary_writer = None

        # Collects training variables.
        training_vars = model.trainable_variables

        def _replicated_step(inputs):
            """Replicated training step."""
            inputs, labels = inputs
            with tf.GradientTape() as tape:
                model_outputs = model(inputs, training=True)
                loss = loss_fn(labels, model_outputs)
                if use_float16:
                    # The scaled loss must be computed under the tape so
                    # that gradients can be taken with respect to it.
                    scaled_loss = optimizer.get_scaled_loss(loss)

            if use_float16:
                scaled_grads = tape.gradient(scaled_loss, training_vars)
                grads = optimizer.get_unscaled_gradients(scaled_grads)
            else:
                grads = tape.gradient(loss, training_vars)
            optimizer.apply_gradients(zip(grads, training_vars))
            # For reporting, the metric takes the mean of losses.
            train_loss_metric.update_state(loss)
            for metric in train_metrics:
                metric.update_state(labels, model_outputs)

        @tf.function
        def train_steps(iterator, steps):
            """Performs distributed training steps in a loop.

            Args:
              iterator: the distributed iterator of training datasets.
              steps: an tf.int32 integer tensor to specify number of steps to
                run inside host training loop.

            Raises:
              ValueError: Any of the arguments or tensor shapes are invalid.
            """
            if not isinstance(steps, tf.Tensor):
                raise ValueError(
                    'steps should be an Tensor. Python object may cause '
                    'retracing.')

            for _ in tf.range(steps):
                strategy.experimental_run_v2(
                    _replicated_step, args=(next(iterator),))

        def train_single_step(iterator):
            """Performs a distributed training step.

            Args:
              iterator: the distributed iterator of training datasets.

            Raises:
              ValueError: Any of the arguments or tensor shapes are invalid.
            """
            strategy.experimental_run_v2(
                _replicated_step, args=(next(iterator),))

        def test_step(iterator):
            """Calculates evaluation metrics on distributed devices."""

            def _test_step_fn(inputs):
                """Replicated accuracy calculation."""
                inputs, labels = inputs
                model_outputs = model(inputs, training=False)
                for metric in eval_metrics:
                    metric.update_state(labels, model_outputs)

            strategy.experimental_run_v2(
                _test_step_fn, args=(next(iterator),))

        if not run_eagerly:
            train_single_step = tf.function(train_single_step)
            test_step = tf.function(test_step)

        def _run_evaluation(current_training_step, test_iterator):
            """Runs validation steps and aggregate metrics."""
            for _ in range(eval_steps):
                test_step(test_iterator)

            with eval_summary_writer.as_default():
                for metric in eval_metrics + model.metrics:
                    metric_value = _float_metric_value(metric)
                    logging.info('Step: [%d] Validation %s = %f',
                                 current_training_step, metric.name,
                                 metric_value)
                    tf.summary.scalar(
                        metric.name, metric_value,
                        step=current_training_step)
                eval_summary_writer.flush()

        def _run_callbacks_on_batch_begin(batch):
            """Runs custom callbacks at the start of every step."""
            if not custom_callbacks:
                return
            for callback in custom_callbacks:
                callback.on_batch_begin(batch)

        def _run_callbacks_on_batch_end(batch, logs):
            """Runs custom callbacks at the end of every step."""
            if not custom_callbacks:
                return
            for callback in custom_callbacks:
                callback.on_batch_end(batch, logs)

        # Training loop starts here.
        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        sub_model_checkpoint = tf.train.Checkpoint(
            model=sub_model) if sub_model_export_name else None

        latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
        if latest_checkpoint_file:
            logging.info(
                'Checkpoint file %s found and restoring from '
                'checkpoint', latest_checkpoint_file)
            checkpoint.restore(latest_checkpoint_file)
            logging.info('Loading from checkpoint file completed')

        # Resume from wherever the (possibly restored) optimizer left off.
        current_step = optimizer.iterations.numpy()
        checkpoint_name = 'ctl_step_{step}.ckpt'

        while current_step < total_training_steps:
            # Training loss/metric are taking average over steps inside micro
            # training loop. We reset the their values before each round.
            train_loss_metric.reset_states()
            for metric in train_metrics + model.metrics:
                metric.reset_states()

            _run_callbacks_on_batch_begin(current_step)
            # Runs several steps in the host while loop.
            steps = steps_to_run(
                current_step, steps_per_epoch, steps_per_loop)

            if steps == 1:
                # TODO(zongweiz): merge with train_steps once tf.while_loop
                # GPU performance bugs are fixed.
                train_single_step(train_iterator)
            else:
                # Converts steps to a Tensor to avoid tf.function retracing.
                train_steps(train_iterator,
                            tf.convert_to_tensor(steps, dtype=tf.int32))
            train_loss = _float_metric_value(train_loss_metric)
            _run_callbacks_on_batch_end(current_step, {'loss': train_loss})
            current_step += steps

            # Updates training logging.
            training_status = 'Train Step: %d/%d / loss = %s' % (
                current_step, total_training_steps, train_loss)

            if train_summary_writer:
                with train_summary_writer.as_default():
                    tf.summary.scalar(
                        train_loss_metric.name, train_loss,
                        step=current_step)
                    for metric in train_metrics + model.metrics:
                        metric_value = _float_metric_value(metric)
                        training_status += ' %s = %f' % (
                            metric.name, metric_value)
                        tf.summary.scalar(
                            metric.name, metric_value, step=current_step)
                    train_summary_writer.flush()
            logging.info(training_status)

            # Saves model checkpoints and run validation steps at every
            # epoch end.
            if current_step % steps_per_epoch == 0:
                # To avoid repeated model saving, we do not save after the
                # last step of training.
                if current_step < total_training_steps:
                    _save_checkpoint(
                        checkpoint, model_dir,
                        checkpoint_name.format(step=current_step))
                    if sub_model_export_name:
                        _save_checkpoint(
                            sub_model_checkpoint, model_dir,
                            '%s_step_%d.ckpt' % (
                                sub_model_export_name, current_step))
                if eval_input_fn:
                    logging.info('Running evaluation after step: %s.',
                                 current_step)
                    _run_evaluation(
                        current_step,
                        _get_input_iterator(eval_input_fn, strategy))
                    # Re-initialize evaluation metric.
                    for metric in eval_metrics + model.metrics:
                        metric.reset_states()

        _save_checkpoint(checkpoint, model_dir,
                         checkpoint_name.format(step=current_step))
        if sub_model_export_name:
            _save_checkpoint(sub_model_checkpoint, model_dir,
                             '%s.ckpt' % sub_model_export_name)

        if eval_input_fn:
            logging.info(
                'Running final evaluation after training is complete.')
            _run_evaluation(current_step,
                            _get_input_iterator(eval_input_fn, strategy))

        training_summary = {
            'total_training_steps': total_training_steps,
            'train_loss': _float_metric_value(train_loss_metric),
        }
        if eval_metrics:
            # TODO(hongkuny): Cleans up summary reporting in text.
            training_summary['last_train_metrics'] = _float_metric_value(
                train_metrics[0])
            training_summary['eval_metrics'] = _float_metric_value(
                eval_metrics[0])

        write_txt_summary(training_summary, summary_dir)

        return model
def run(target, is_chief, device_fn):
    """Builds the training graph and runs slim training on it.

    Logs an error and returns without training when
    --dataset_config_pbtxt is not set.

    Args:
      target: The target of the TensorFlow standard server to use. Can be the
        empty string to run locally using an inprocess server.
      is_chief: Boolean indicating whether this process is the chief.
      device_fn: Device function used to assign ops to devices.
    """
    dataset_config = FLAGS.dataset_config_pbtxt
    if not dataset_config:
        logging.error('Need to specify --dataset_config_pbtxt')
        return

    graph = tf.Graph()
    with graph.as_default():
        model = modeling.get_model(FLAGS.model_name)
        dataset = data_providers.get_dataset(dataset_config)
        print('Running training on {} with model {}\n'.format(dataset, model))

        with tf.device(device_fn):
            # If ps_tasks is zero, the local device is used. When using
            # multiple (non-local) replicas, the ReplicaDeviceSetter
            # distributes the variables across the different devices.
            num_classes = dataset.num_classes
            images, labels, _ = data_providers.make_batches(
                dataset.get_slim_dataset(),
                model,
                FLAGS.batch_size,
                mode='TRAIN')
            endpoints = model.create(images, num_classes, is_training=True)
            one_hot_labels = slim.one_hot_encoding(labels, num_classes)
            total_loss = loss(
                endpoints['Logits'],
                one_hot_labels,
                label_smoothing=FLAGS.label_smoothing)

            # Moving averages track the model variables, every individual
            # loss, and the total loss.
            averaged = slim.get_model_variables()
            averaged.extend(slim.losses.get_losses())
            averaged.append(total_loss)

            global_step = slim.get_or_create_global_step()
            ema = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
            tf.add_to_collection(
                tf.GraphKeys.UPDATE_OPS, ema.apply(averaged))

            # Learning rate follows a staircase exponential decay.
            steps_per_epoch = (1.0 * dataset.num_examples) / FLAGS.batch_size
            decay_steps = int(steps_per_epoch * FLAGS.num_epochs_per_decay)
            learning_rate = tf.train.exponential_decay(
                FLAGS.learning_rate,
                slim.get_or_create_global_step(),
                decay_steps,
                FLAGS.learning_rate_decay_factor,
                staircase=True)

            optimizer = tf.train.RMSPropOptimizer(
                learning_rate,
                FLAGS.rmsprop_decay,
                FLAGS.rmsprop_momentum,
                FLAGS.rmsprop_epsilon)

            # Training op, which also runs the collected update ops.
            train_tensor = slim.learning.create_train_op(
                total_loss,
                optimizer=optimizer,
                update_ops=tf.get_collection(tf.GraphKeys.UPDATE_OPS))

            # Summaries.
            slim.summaries.add_histogram_summaries(
                slim.get_model_variables())
            slim.summaries.add_scalar_summaries(
                slim.losses.get_losses(), 'losses')
            slim.summaries.add_scalar_summary(
                total_loss, 'Total_Loss', 'losses')
            slim.summaries.add_scalar_summary(
                learning_rate, 'Learning_Rate', 'training')
            slim.summaries.add_histogram_summaries(endpoints.values())
            slim.summaries.add_zero_fraction_summaries(endpoints.values())
            # redacted

            # Start-up delay scales with the task index.
            startup_delay_steps = FLAGS.task * FLAGS.startup_delay_steps

            init_fn = model_init_function(
                model, num_classes, FLAGS.start_from_checkpoint)

            saver = tf.train.Saver(
                max_to_keep=FLAGS.max_checkpoints_to_keep,
                keep_checkpoint_every_n_hours=(
                    FLAGS.keep_checkpoint_every_n_hours))

            # Train the model.
            slim.learning.train(
                train_tensor,
                number_of_steps=FLAGS.number_of_steps,
                logdir=FLAGS.train_dir,
                master=target,
                init_fn=init_fn,
                is_chief=is_chief,
                saver=saver,
                startup_delay_steps=startup_delay_steps,
                save_summaries_secs=FLAGS.save_summaries_secs,
                save_interval_secs=FLAGS.save_interval_secs)
def ddpg_graph(a_func,
               q_func,
               transition,
               target_network_type=DQNTarget.normal,
               gamma=1.0,
               dqda_clipping=0.0,
               loss_fn=tf.losses.huber_loss,
               extra_callback=None):
    """DDPG. https://arxiv.org/abs/1509.02971.

    Args:
      a_func: Python function that takes in state, scope as input
        and returns action and intermediate endpoints dictionary.
      q_func: Python function that takes in state, action, scope as input
        and returns Q(state, action) and intermediate endpoints dictionary.
      transition: SARSTransition namedtuple.
      target_network_type: Either `DQNTarget.notarget` (evaluate target
        values with the current network) or `DQNTarget.normal` (use a
        separate target network, the default). Only consulted when
        `gamma != 0`; any other value is rejected.
      gamma: Discount factor. When 0, the target is the reward alone and no
        next-state Q value is computed.
      dqda_clipping: (float) clips the gradient dqda element-wise between
        [-dqda_clipping, dqda_clipping]. Does not perform clipping if
        dqda_clipping == 0.
      loss_fn: Function that computes the td_loss tensor. Takes as arguments
        (target value tensor, predicted value tensor).
      extra_callback: Optional function that takes in (transition,
        end_points_t, end_points_tp1) and adds additional TF graph elements.
        `end_points_tp1` is None when `gamma == 0`, because no next-state
        Q network is built in that case.

    Returns:
      A tuple (actor_loss, critic_loss, all_summaries) where the losses are
      loss tensors to minimize and all_summaries is the merged TensorFlow
      summary op.

    Raises:
      ValueError: If `gamma != 0` and `target_network_type` is not one of the
        supported values.
    """
    state = transition.state
    action = transition.action
    state_p1 = transition.state_p1
    reward = transition.reward
    done = transition.done

    q_t_selected, end_points_t = q_func(state, action, scope='q_func')

    # Stays None on the supervised (gamma == 0) path so that extra_callback
    # can still be invoked without hitting an unbound local.
    end_points_tp1 = None

    if gamma != 0:
        action_p1, _ = a_func(state_p1, scope='a_func')
        if target_network_type == DQNTarget.notarget:
            # Evaluate target values using the current net only.
            q_tp1_best, end_points_tp1 = q_func(
                state_p1, action_p1, scope='q_func', reuse=True)
        elif target_network_type == DQNTarget.normal:
            # Target network Q values at t+1.
            q_tp1_best, end_points_tp1 = q_func(
                state_p1, action_p1, scope='target_q_func')
        else:
            # Fail explicitly instead of falling through to a NameError on
            # the undefined q_tp1_best below.
            logging.error('Invalid target_network_mode %s',
                          target_network_type)
            raise ValueError(
                'Invalid target_network_mode %s' % target_network_type)
        q_tp1_best_masked = (1.0 - done) * q_tp1_best
        q_t_selected_target = tf.stop_gradient(
            reward + gamma * q_tp1_best_masked)
    else:
        # Supervised Target.
        q_t_selected_target = tf.stop_gradient(reward)

    # Critic Loss
    td_error = q_t_selected - q_t_selected_target
    critic_loss = loss_fn(q_t_selected_target, q_t_selected)

    # Actor Loss (maximize E[Q(a_t|s_t)] via policy gradient)
    policy_action, _ = a_func(state, scope='a_func', reuse=True)
    q_t, _ = q_func(state, policy_action, scope='q_func', reuse=True)
    dqda = tf.gradients(q_t, policy_action)[0]
    if dqda_clipping > 0:
        dqda = tf.clip_by_value(dqda, -dqda_clipping, dqda_clipping)
    actor_loss = tf.losses.mean_squared_error(
        tf.stop_gradient(dqda + policy_action), policy_action)
    loss = tf.losses.get_total_loss()

    if extra_callback is not None:
        extra_callback(transition, end_points_t, end_points_tp1)

    tf.summary.histogram('td_error', td_error)
    tf.summary.histogram('q_t_selected', q_t_selected)
    tf.summary.histogram('q_t_selected_target', q_t_selected_target)
    tf.summary.scalar('mean_q_t_selected', tf.reduce_mean(q_t_selected))
    tf.summary.scalar('critic_loss', critic_loss)
    tf.summary.scalar('actor_loss', actor_loss)
    tf.summary.scalar('actor_mean_q', tf.reduce_mean(q_t, 0))
    tf.summary.scalar('total_loss', loss)

    all_summaries = tf.summary.merge_all()

    # TODO: Make this a named tuple.
    return actor_loss, critic_loss, all_summaries
def join(self, timeout=_DEFAULT_TIMEOUT_SEC):
    """Waits for all subprocesses to finish, up to `timeout` seconds.

    If any of the subprocesses does not exit approximately after `timeout`
    seconds has passed after `join` call, this raises a
    `SubprocessTimeoutError`.

    Note: At timeout, it uses SIGTERM to terminate the subprocesses, in order
    to log the stack traces of the subprocesses when they exit. However, this
    results in timeout when the test runs with tsan (thread sanitizer); if
    tsan is being run on the test targets that rely on timeout to assert
    information, `MultiProcessRunner.terminate_all()` must be called after
    `join()`, before the test exits, so the subprocesses are terminated with
    SIGKILL, and data race is removed.

    Args:
      timeout: optional integer or `None`. If provided as an integer, and not
        all processes report status within roughly `timeout` seconds, a
        `SubprocessTimeoutError` exception will be raised. If `None`, `join`
        never times out.

    Returns:
      A MultiProcessRunnerResult object, which has two attributes,
      `return_value` and `stdout`. `return_value` always contains the return
      values from the subprocesses. If `return_output` argument is True at
      `__init__`, `stdout` is available that contains a list of all messages
      from subprocesses' stdout and stderr.

    Raises:
      ValueError: if `timeout` is truthy but not an integer, or if the runner
        has already been joined.
      SubprocessTimeoutError: if not all processes report status
        approximately within `timeout` seconds. When this is raised, a
        `MultiProcessRunnerResult` object can be retrieved by
        `SubprocessTimeoutError`'s mpr_result attribute, which has the same
        structure as above 'Returns' section describes.
      UnexpectedSubprocessExitError: If any of the subprocesses did not exit
        properly (for example, they exit on SIGTERM or SIGKILL signal). When
        this is raised, a `MultiProcessRunnerResult` object can be retrieved
        by `UnexpectedSubprocessExitError`'s mpr_result attribute, which has
        the same structure as above 'Returns' section describes. If
        `max_run_time` is not `None`, it is expected that some subprocesses
        may be force-killed when `max_run_time` is up, and this is raised in
        those cases.
      Exception: if there is an Exception propagated from any subprocess.
    """
    if timeout and not isinstance(timeout, int):
        raise ValueError('`timeout` must be an integer or `None`.')

    # A runner may only be joined once; claim the join under the lock.
    with self._process_lock:
        if self._joined:
            raise ValueError("MultiProcessRunner can't be joined twice.")
        self._joined = True

    self._watchdog_thread.join(timeout)

    if self._watchdog_thread.is_alive():
        # Timed out: stop restarting workers and terminate them so that they
        # dump their stack traces.
        with self._process_lock:
            self._auto_restart = False
        logging.error(
            'Timeout when joining for child processes. Terminating...')
        self.terminate_all(sig=signal.SIGTERM)

        # Give the processes _FORCE_KILL_WAIT_SEC to exit on their own (and
        # print stack traces) before falling back to SIGKILL.
        self._watchdog_thread.join(_FORCE_KILL_WAIT_SEC)
        if self._watchdog_thread.is_alive():
            logging.error('Timeout when waiting for child processes to '
                          'print stacktrace. Sending SIGKILL...')
            self.terminate_all()
            self._watchdog_thread.join()

        timed_out_statuses = self._get_process_statuses()
        self._reraise_if_subprocess_error(timed_out_statuses)
        timeout_message = (
            'One or more subprocesses timed out, where timeout was set to {}s. '
            'Please change the `timeout` argument for '
            '`MultiProcessRunner.join()` or `multi_process_runner.run()` '
            'if it should be adjusted.'.format(timeout))
        raise SubprocessTimeoutError(
            timeout_message, self._get_mpr_result(timed_out_statuses))

    for (task_type, task_id), process in self._processes.items():
        logging.info('%s-%d exit code: %s', task_type, task_id,
                     process.exitcode)

    process_statuses = self._get_process_statuses()
    self._reraise_if_subprocess_error(process_statuses)

    # Every process that was not deliberately terminated must have exited
    # cleanly; a successful exit has exit code 0.
    for (task_type, task_id), process in self._processes.items():
        assert process.exitcode is not None
        if process.exitcode <= 0:
            continue
        if (task_type, task_id) in self._terminated:
            continue
        exit_message = (
            'Subprocess %s-%d exited with exit code %s. See logs for details.'
            % (task_type, task_id, process.exitcode))
        raise UnexpectedSubprocessExitError(
            exit_message, self._get_mpr_result(process_statuses))

    logging.info('Joining log reading threads.')
    for reader in self._reading_threads:
        reader.join()
    logging.info('Joined log reading threads.')

    # Clear the alarm.
    signal.alarm(0)

    return self._get_mpr_result(process_statuses)
def get_gin_bindings(exp, agent_name, initial_seed, value, test):
    """Builds the list of gin binding strings for one experiment setting.

    The list starts with the agent's seed binding and gets one additional
    binding chosen by `exp`. For experiments whose name contains "init" the
    whole list is instead replaced by the result of `get_init_bidings`.

    Args:
      exp: Name of the experiment kind, e.g. "epsilon" or "learning_rate".
      agent_name: Name of the agent used as the gin scope of agent bindings.
      initial_seed: Seed bound to `<agent_name>.seed`.
      value: Value substituted into the experiment's binding.
      test: When truthy, appends bindings that shorten the run.

    Returns:
      A list of gin binding strings.

    Raises:
      ValueError: If `exp` is not a recognized experiment kind.
    """
    # Binding template for every experiment kind that maps to exactly one
    # extra binding. None of these names contains "init", so looking them up
    # before the "init" substring check does not change which branch wins.
    templates = {
        "epsilon": "create_opt.eps = {value}",
        "learning_rate": "create_opt.learning_rate = {value}",
        "weight_decay": "create_opt.weight_decay = {value}",
        "width": "{agent}.neurons = {value}",
        "depth": "{agent}.hidden_layer = {value}",
        "conv": "{agent}.hidden_conv = {value}",
        "normalization": "{agent}.normalization = '{value}'",
        "activation": "{agent}.layer_funct = '{value}'",
        "update_period": "{agent}.update_period = {value}",
        "target_update_period": "{agent}.target_update_period = {value}",
        "gamma": "{agent}.gamma = {value}",
        "min_replay_history": "{agent}.min_replay_history = {value}",
        "num_atoms": "{agent}.num_atoms = {value}",
        "update_horizon": "{agent}.update_horizon = {value}",
        "clip_rewards": "Runner.clip_rewards = {value}",
        "batch_size": "OutOfGraphPrioritizedReplayBuffer.batch_size = {value}",
        "noisy_net": "{agent}.noisy = {value}",
    }

    bindings = [f"{agent_name}.seed={initial_seed}"]
    template = templates.get(exp)
    if template is not None:
        bindings.append(template.format(agent=agent_name, value=value))
    elif "init" in exp:
        bindings = get_init_bidings(agent_name, value, initial_seed)
    else:
        logging.error("Error! Check the kind of experiment")
        raise ValueError("Experiment not recognized")

    if test:
        bindings.extend(
            ["Runner.num_iterations=4", "Runner.training_steps=200"])
    return bindings