def training_data_fn():

    if self.run_hparams.data_dir is not None:
        return data_utils.get_tfrecords_input_fn(
            data_dir=self.run_hparams.data_dir,
            num_epochs=num_iter,
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            training=True,
            # distort_color=self.run_hparams.distort_colors,
            # num_threads=self.run_hparams.num_preprocessing_threads,
            datasets_num_private_threads=None
            # deterministic=False if self.run_hparams.seed is None else True
        )

    else:
        if hvd.rank() == 0:
            LOGGER.log("Using Synthetic Data ...")
        return data_utils.get_synth_input_fn(
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            num_channels=self.run_hparams.n_channels,
            data_format=self.run_hparams.input_format,
            num_classes=self.run_hparams.n_classes,
            dtype=self.run_hparams.dtype,
        )
def _get_session_config(mode, use_xla):

    if mode not in ["train", "validation", "benchmark"]:
        raise ValueError(
            "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark')" % mode)

    config = tf.ConfigProto()

    config.allow_soft_placement = True
    config.log_device_placement = False
    config.gpu_options.allow_growth = True

    if hvd_utils.is_using_hvd():
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    if use_xla:
        LOGGER.log("XLA is activated - Experimental Feature")
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config.gpu_options.force_gpu_compatible = True  # Force pinned memory

    if mode == 'train':
        config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads

        if hvd_utils.is_using_hvd():
            config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // hvd.size()) - 2)
        else:
            config.inter_op_parallelism_threads = 4

    return config
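# NOTE: a hypothetical usage sketch for _get_session_config() above, wiring the
# session config into a TF 1.x Estimator RunConfig. build_run_config() and its
# arguments are illustrative assumptions, not part of this codebase.
def build_run_config(model_dir, use_xla=False, seed=None):
    session_config = _get_session_config(mode='train', use_xla=use_xla)
    return tf.estimator.RunConfig(
        model_dir=model_dir,
        tf_random_seed=seed,
        session_config=session_config,
        keep_checkpoint_max=1,
    )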
def validate(model, criterion, val_dataset, iteration, collate_fn, distributed_run, args):
    """Handles all the validation scoring and printing"""
    with evaluating(model), torch.no_grad():
        val_loader = DataLoader(val_dataset, num_workers=1, shuffle=False,
                                batch_size=args.batch_size // len(args.validation_anchor_dirs),
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y, num_frames = batch_to_gpu(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, args.world_size).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)

    # Log the averaged validation loss, not just the last batch's value
    LOGGER.log(key="val_iter_loss", value=val_loss)
def predict(self):
    """Perform prediction with the runner's classifier"""
    if hvd.rank() == 0:
        LOGGER.log("Begin predict...")

        begin = time.time()
        pred = self._classifier.predict(input_fn=self._dataset.test_fn)
        predictions = [p['logits'] for p in pred]
        print('Inference took: {} sec'.format(time.time() - begin))

        binary_masks = [np.argmax(p, axis=-1).astype(np.uint8) * 255 for p in predictions]
        multipage_tif = [Image.fromarray(mask).resize(size=(512, 512), resample=Image.BILINEAR)
                         for mask in binary_masks]

        output_dir = os.path.join(self._model_dir, 'pred')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        multipage_tif[0].save(os.path.join(output_dir, 'test-masks.tif'),
                              compression="tiff_deflate",
                              save_all=True,
                              append_images=multipage_tif[1:])

        pickle.dump(predictions, open(os.path.join(output_dir, 'predictions.pkl'), 'wb'))

        LOGGER.log("Predict finished")
def evaluation_data_fn():

    if self.run_hparams.data_dir is not None:
        return data_utils.get_tfrecords_input_fn(
            filenames=filenames,
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            training=False,
            distort_color=self.run_hparams.distort_colors,
            num_threads=self.run_hparams.num_preprocessing_threads,
            deterministic=False if self.run_hparams.seed is None else True)

    else:
        LOGGER.log("Using Synthetic Data ...\n")
        return data_utils.get_synth_input_fn(
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            num_channels=self.run_hparams.n_channels,
            data_format=self.run_hparams.input_format,
            num_classes=self.run_hparams.n_classes,
            dtype=self.run_hparams.dtype,
        )
def evaluation_data_fn():

    if not is_benchmark or self.run_hparams.data_dir is not None:
        return self.dataset.dataset_fn(
            batch_size=batch_size,
            training=False,
            input_shape=list(self.run_hparams.input_shape) + [self.run_hparams.n_channels],
            mask_shape=list(self.run_hparams.mask_shape) + [self.run_hparams.n_channels],
            num_threads=64,
            use_gpu_prefetch=True,
            normalize_data_method="zero_centered",
            only_defective_images=False,
            augment_data=False,
            seed=self.run_hparams.seed)

    else:
        LOGGER.log("Using Synthetic Data ...")
        return self.dataset.synth_dataset_fn(
            batch_size=batch_size,
            training=False,
            input_shape=list(self.run_hparams.input_shape) + [self.run_hparams.n_channels],
            mask_shape=list(self.run_hparams.mask_shape) + [self.run_hparams.n_channels],
            num_threads=64,
            use_gpu_prefetch=True,
            normalize_data_method="zero_centered",
            only_defective_images=False,
            augment_data=False,
            seed=self.run_hparams.seed)
def end(self, session):
    try:
        avg_processing_speed = float(ProfilerHook.moving_average(self._processing_speed_arr, n=100)[-1])
    except Exception:
        # Fall back to a plain mean when fewer than `n` samples were recorded
        avg_processing_speed = float(np.mean(self._processing_speed_arr))

    total_processing_time = time.time() - self._start_training_time
    total_processing_hours, rem = divmod(total_processing_time, 3600)
    total_processing_minutes, total_processing_seconds = divmod(rem, 60)

    LOGGER.log(
        "Final Summary:\n"
        "\t[*] Average Imgs/sec: %d\n"
        "\t[*] Total Processing Time: %dh %02dm %02ds\n" %
        (avg_processing_speed, total_processing_hours, total_processing_minutes, total_processing_seconds)
    )

    perf_dict = {
        'throughput': str(avg_processing_speed),
        'processing_time': str(total_processing_time)
    }

    perf_filename = "performances_%s.json" % ("train" if self._is_training else "eval")

    with open(os.path.join(self._sample_dir, "..", perf_filename), 'w') as f:
        json.dump(perf_dict, f)
def validate(model, criterion, valset, iteration, batch_size, world_size, collate_fn,
             distributed_run, rank, batch_to_gpu, fp16_run):
    """Handles all the validation scoring and printing"""
    with evaluating(model), torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, num_workers=1, shuffle=False,
                                sampler=val_sampler, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y, len_x = batch_to_gpu(batch)
            if fp16_run:
                y_pred = model(fp32_to_fp16(x))
                loss = criterion(fp16_to_fp32(y_pred), y)
            else:
                y_pred = model(x)
                loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, world_size).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)

    # Log the averaged validation loss rather than the last batch's value
    LOGGER.log(key="val_iter_loss", value=val_loss)
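# NOTE: reduce_tensor() is called by both validate() variants above but is not
# defined in this section. A minimal sketch of the usual implementation,
# assuming torch.distributed has been initialized; an assumption, not this
# repo's confirmed code:
def reduce_tensor(tensor, world_size):
    # Average a tensor across all workers: sum via all_reduce, then divide,
    # so every rank observes the same mean loss value.
    rt = tensor.clone()
    torch.distributed.all_reduce(rt, op=torch.distributed.ReduceOp.SUM)
    return rt / world_size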
def after_create_session(self, session, coord):
    params_count = tf.get_default_graph().get_tensor_by_name("trainable_parameters_count_ref:0")
    _params_count = session.run(params_count)

    LOGGER.log("# Total Trainable Parameters:", int(_params_count))

    self._start_training_time = time.time()
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    if args.include_warmup:
        sequences = torch.randint(low=0, high=148, size=(1, 50), dtype=torch.long).cuda()
        # Fixed: length must come from `sequences` (the variable defined above)
        text_lengths = torch.IntTensor([sequences.size(1)]).cuda().long()
        for i in range(3):
            with torch.no_grad():
                _, mels, _, _, mel_lengths = model.infer(sequences, text_lengths)

    os.makedirs(args.output, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    anchor_dirs = [os.path.join(args.dataset_path, anchor) for anchor in args.anchor_dirs]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        for speaker_id in range(len(anchor_dirs)):
            metadata = metadatas[speaker_id]
            for mel_path, text in tqdm(metadata):
                seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
                seq_lens = torch.IntTensor([len(text)])
                melspec = torch.from_numpy(np.load(mel_path))
                target = melspec[:, ::args.reduction_factor]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])
                inputs = (to_gpu(seqs).long(), to_gpu(seq_lens).int(),
                          to_gpu(targets).float(), to_gpu(target_lengths).int())
                _, mel_outs, _, _ = model(inputs)
                fname = os.path.basename(mel_path)
                np.save(os.path.join(args.output, fname), mel_outs[0, :, :melspec.shape[1]], allow_pickle=False)

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=measurements['tacotron2_time'])
    LOGGER.iteration_stop()
    LOGGER.finish()
def after_run(self, run_context, run_values):
    batch_time = time.time() - self.t0
    ips = self.global_batch_size / batch_time

    if self.current_step >= self.warmup_steps:
        LOGGER.log("iteration", int(self.current_step))
        LOGGER.log("total_ips", float(ips))
        LOGGER.iteration_stop()

    self.current_step += 1
def _log_hparams(classname, layername, **kwargs):
    log_msg = "%s: `%s`" % (classname, layername)

    for arg, val in sorted(kwargs.items()):
        log_msg += "\n\t[*] {}: {}".format(arg, val)

    log_msg += "\n"

    if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
        LOGGER.log(log_msg)
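# NOTE: a hypothetical call site for _log_hparams(); the layer name and keyword
# arguments are made up for illustration.
_log_hparams(
    classname='Conv2D',
    layername='conv2d_1',
    kernel_size=(3, 3),
    strides=(1, 1),
    padding='SAME',
)
# Expected output (emitted on local rank 0 only when running under Horovod),
# with kwargs listed alphabetically:
# Conv2D: `conv2d_1`
#     [*] kernel_size: (3, 3)
#     [*] padding: SAME
#     [*] strides: (1, 1)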
def train(self):
    """Perform training with the runner's classifier"""
    LOGGER.log("Begin training...")

    try:
        self._classifier.train(input_fn=self._dataset.train_fn,
                               steps=self._max_steps,
                               hooks=self._training_hooks)
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    LOGGER.log("Training finished")
def _get_session_config(mode, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):

    if mode not in ["train", "validation", "benchmark", "inference"]:
        raise ValueError(
            "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode)

    # Limit available GPU memory (tune the size)
    if use_dali:
        LOGGER.log("DALI is activated, GPU memory fraction used for training is limited to",
                   gpu_memory_fraction)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_memory_fraction)
        config = tf.ConfigProto(gpu_options=gpu_options)
        config.gpu_options.allow_growth = False
    else:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

    config.allow_soft_placement = True
    config.log_device_placement = False
    config.gpu_options.visible_device_list = str(gpu_id)

    if hvd_utils.is_using_hvd():
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    if use_xla:
        LOGGER.log("XLA is activated - Experimental Feature")
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config.gpu_options.force_gpu_compatible = True  # Force pinned memory

    if mode == 'train':
        config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads

        if hvd_utils.is_using_hvd():
            config.inter_op_parallelism_threads = max(2, (multiprocessing.cpu_count() // hvd.size()) - 2)
        else:
            config.inter_op_parallelism_threads = 4

    return config
def main(_):
    """
    Starting point of the application
    """
    flags = PARSER.parse_args()
    params = _cmd_params(flags)

    tf.logging.set_verbosity(tf.logging.ERROR)

    # Optimization flags
    os.environ['CUDA_CACHE_DISABLE'] = '0'
    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'

    if params['use_amp']:
        assert params['dtype'] == tf.float32, "TF-AMP requires FP32 precision"
        LOGGER.log("TF AMP is activated - Experimental Feature")
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

    runner = Runner(params)

    if 'train' in params['exec_mode'] or 'train_and_predict' in params['exec_mode']:
        runner.train()
    if 'train_and_predict' in params['exec_mode'] or 'predict' in params['exec_mode']:
        runner.predict()
    if 'benchmark' in params['exec_mode']:
        runner.benchmark()
def end(self, session):
    try:
        avg_processing_speed = float(ProfilerHook.moving_average(self._processing_speed_arr, n=100)[-1])
    except Exception:
        # Fall back to a plain mean when fewer than `n` samples were recorded
        avg_processing_speed = float(np.mean(self._processing_speed_arr))

    total_processing_time = time.time() - self._start_training_time
    total_processing_hours, rem = divmod(total_processing_time, 3600)
    total_processing_minutes, total_processing_seconds = divmod(rem, 60)

    LOGGER.log(
        "Final Summary:\n"
        "\t[*] Average Imgs/sec: %d\n"
        "\t[*] Total Processing Time: %dh %02dm %02ds\n" %
        (avg_processing_speed, total_processing_hours, total_processing_minutes, total_processing_seconds)
    )
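# NOTE: both end() hooks above rely on ProfilerHook.moving_average(), which is
# not shown in this section. A minimal sketch consistent with their fallback
# behavior (np.mean when too few samples exist); an assumption, not the repo's
# exact code. The class name is hypothetical.
class ProfilerHookSketch:
    @staticmethod
    def moving_average(a, n=100):
        # Trailing moving average via cumulative sums. The last element of the
        # result averages the final n samples; when len(a) < n the result is
        # empty, so indexing [-1] raises and the callers fall back to np.mean().
        ret = np.cumsum(a, dtype=float)
        ret[n:] = ret[n:] - ret[:-n]
        return ret[n - 1:] / n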
def _build_hparams(*args):
    hparams = tf.contrib.training.HParams()

    for _hparams in args:
        if not isinstance(_hparams, tf.contrib.training.HParams):
            raise ValueError("Invalid HParams argument object detected:", _hparams)

        for key, val in _hparams.values().items():
            try:
                hparams.add_hparam(name=key, value=val)
            except ValueError:
                LOGGER.log(
                    "the parameter `{}` already exists - existing value: {} and duplicated value: {}"
                    .format(key, hparams.get(key), val))

    return hparams
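# NOTE: a short usage sketch for _build_hparams(), merging two HParams groups
# into one namespace (TF 1.x contrib API). The hyperparameter names and values
# are illustrative only.
base_hparams = tf.contrib.training.HParams(batch_size=256, lr_init=0.1)
run_hparams = tf.contrib.training.HParams(use_xla=False, seed=None)

merged = _build_hparams(base_hparams, run_hparams)
print(merged.batch_size, merged.use_xla)  # 256 False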
def training_data_fn():

    if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
        if hvd.rank() == 0:
            LOGGER.log("Using DALI input... ")

        return data_utils.get_dali_input_fn(
            filenames=filenames,
            idx_filenames=idx_filenames,
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            training=True,
            distort_color=self.run_hparams.distort_colors,
            num_threads=self.run_hparams.num_preprocessing_threads,
            deterministic=False if self.run_hparams.seed is None else True
        )

    elif self.run_hparams.data_dir is not None:
        return data_utils.get_tfrecords_input_fn(
            filenames=filenames,
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            training=True,
            distort_color=self.run_hparams.distort_colors,
            num_threads=self.run_hparams.num_preprocessing_threads,
            deterministic=False if self.run_hparams.seed is None else True
        )

    else:
        if hvd.rank() == 0:
            LOGGER.log("Using Synthetic Data ...")
        return data_utils.get_synth_input_fn(
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            num_channels=self.run_hparams.n_channels,
            data_format=self.run_hparams.input_format,
            num_classes=self.run_hparams.n_classes,
            dtype=self.run_hparams.dtype,
        )
def __init__(self, params):
    hvd.init()

    LOGGER.log(str(params))

    data_dir = params['data_dir']
    batch_size = params['batch_size']
    augment = params['augment']
    benchmark = params['benchmark']
    seed = params['seed']

    self._model_dir = params['model_dir']
    self._max_steps = params['max_steps']

    self._classifier = tf.estimator.Estimator(
        model_fn=_model_fn,
        model_dir=self._model_dir,
        params=params,
        config=tf.estimator.RunConfig(
            tf_random_seed=None,
            session_config=self._get_session_config(),
            save_checkpoints_steps=self._max_steps if hvd.rank() == 0 else None,
            keep_checkpoint_max=1))

    self._dataset = Dataset(data_dir=data_dir,
                            batch_size=batch_size,
                            augment=augment,
                            gpu_id=hvd.rank(),
                            num_gpus=hvd.size(),
                            seed=seed)

    self._training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    if benchmark and hvd.rank() == 0:
        self._training_hooks.append(ProfilerHook(self._model_dir, batch_size,
                                                 log_every=params['log_every'],
                                                 warmup_steps=params['warmup_steps']))
def learning_rate_scheduler(lr_init, lr_warmup_epochs, global_step, batch_size,
                            num_batches_per_epoch, num_decay_steps, num_gpus, use_cosine_lr):

    def get_scaled_base_learning_rate():
        """Calculates the base learning rate used to build the lr schedule.

        In replicated mode, gradients are summed rather than averaged, which,
        with the SGD and momentum optimizers, increases the effective learning
        rate by lr * num_gpus.

        Returns:
            Base learning rate to use to create the lr schedule.
        """
        base_lr = lr_init * num_gpus
        # Starting LR = 0.1 at BS = 256; otherwise scale linearly with batch size
        return base_lr * (batch_size / 256.0)

    rescaled_lr = get_scaled_base_learning_rate()

    if use_cosine_lr:
        LOGGER.log("Using cosine learning rate schedule")
        lr = tf.train.cosine_decay(rescaled_lr, global_step, num_decay_steps)
    else:
        LOGGER.log("Using step learning rate schedule")
        boundaries = [int(num_batches_per_epoch * x) for x in [30, 60, 80, 90]]
        values = [1e0, 1e-1, 1e-2, 1e-3, 1e-4]
        values = [rescaled_lr * v for v in values]
        lr = tf.train.piecewise_constant(global_step, boundaries, values)

    warmup_steps = int(num_batches_per_epoch * lr_warmup_epochs)
    warmup_lr = (rescaled_lr * tf.cast(global_step, tf.float32) /
                 tf.cast(warmup_steps, tf.float32))

    return tf.cond(global_step < warmup_steps, lambda: warmup_lr, lambda: lr)
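# NOTE: a hedged sketch of how learning_rate_scheduler() would typically be
# wired into a TF 1.x model_fn. The numeric values and the dummy loss are
# illustrative assumptions, not taken from this codebase.
global_step = tf.train.get_or_create_global_step()

lr = learning_rate_scheduler(
    lr_init=0.1,
    lr_warmup_epochs=5,
    global_step=global_step,
    batch_size=256,
    num_batches_per_epoch=5004,   # e.g. ImageNet at batch size 256
    num_decay_steps=90 * 5004,    # cosine decay horizon of 90 epochs
    num_gpus=1,
    use_cosine_lr=True,
)

loss = tf.reduce_mean(tf.square(tf.get_variable("w", shape=[10])))  # dummy loss
optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9, use_nesterov=True)
train_op = optimizer.minimize(loss, global_step=global_step)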
def predict(self, to_predict):

    estimator_params = {}

    if to_predict is not None:
        filenames = runner_utils.parse_inference_input(to_predict)

        image_classifier = self._get_estimator(
            mode='inference',
            run_params=estimator_params,
            use_xla=self.run_hparams.use_xla,
            use_dali=self.run_hparams.use_dali,
            gpu_memory_fraction=self.run_hparams.gpu_memory_fraction)

        inference_hooks = []

        def inference_data_fn():
            return data_utils.get_inference_input_fn(
                filenames=filenames,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_threads=self.run_hparams.num_preprocessing_threads)

        try:
            inference_results = image_classifier.predict(
                input_fn=inference_data_fn,
                predict_keys=None,
                hooks=inference_hooks,
                yield_single_examples=True)

            for result in inference_results:
                LOGGER.log(result['classes'], str(result['probabilities'][result['classes']]))

        except KeyboardInterrupt:
            print("Keyboard interrupt")

        LOGGER.log('Ending Inference ...')
def evaluation_data_fn():

    if self.run_hparams.data_dir is not None:
        return data_utils.get_tfrecords_input_fn(
            data_dir=self.run_hparams.data_dir,
            num_epochs=num_iter,
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            training=False,  # evaluation pipeline: no training-time augmentation
            datasets_num_private_threads=None)

    else:
        LOGGER.log("Using Synthetic Data ...\n")
        return data_utils.get_synth_input_fn(
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            num_channels=self.run_hparams.n_channels,
            data_format=self.run_hparams.input_format,
            num_classes=self.run_hparams.n_classes,
            dtype=self.run_hparams.dtype,
        )
def after_run(self, run_context, run_values):
    cross_loss, dice_loss, total_loss = run_values.results

    batch_time = time.time() - self._t0
    ips = self._global_batch_size / batch_time
    ips *= hvd.size()

    if self._current_step >= self._warmup_steps:
        LOGGER.log("iteration", int(self._current_step))
        LOGGER.log("loss", float(cross_loss))
        LOGGER.log("dice_loss", float(dice_loss))
        LOGGER.log("total_loss", float(total_loss))
        self._perf.record(ips)
        LOGGER.iteration_stop()

    self._current_step += 1
def after_run(self, run_context, run_values):
    # global_step, cross_entropy, logits_flat, labels_flat, rgb_inputs, depth_inputs, labels, logits = run_values.results
    global_step, cross_entropy = run_values.results

    batch_time = time.time() - self.t0
    ips = self.global_batch_size / batch_time

    LOGGER.log("iteration", int(self.current_step))
    LOGGER.log("imgs_per_sec", float(ips))
    LOGGER.log("cross_entropy", float(cross_entropy))
    LOGGER.iteration_stop()

    self.current_step += 1
def log_hardware():
    # TODO: asserts - what if you cannot launch those commands?

    # Number of CPU threads
    cpu_info_command = 'cat /proc/cpuinfo'
    cpu_info = subprocess.run(cpu_info_command.split(), stdout=subprocess.PIPE).stdout.split()
    cpu_num_index = len(cpu_info) - cpu_info[::-1].index(b'processor') + 1
    cpu_num = int(cpu_info[cpu_num_index]) + 1

    # CPU name
    cpu_name_begin_index = cpu_info.index(b'name')
    cpu_name_end_index = cpu_info.index(b'stepping')
    cpu_name = b' '.join(cpu_info[cpu_name_begin_index + 2:cpu_name_end_index]).decode('utf-8')

    LOGGER.log(key='cpu_info', value={"num": cpu_num, "name": cpu_name})

    # RAM memory
    ram_info_command = 'free -m -h'
    ram_info = subprocess.run(ram_info_command.split(), stdout=subprocess.PIPE).stdout.split()
    ram_index = ram_info.index(b'Mem:') + 1
    ram = ram_info[ram_index].decode('utf-8')

    LOGGER.log(key='mem_info', value={"ram": ram})

    # GPU
    nvidia_smi_command = 'nvidia-smi -q -x'
    nvidia_smi_output = subprocess.run(nvidia_smi_command.split(), stdout=subprocess.PIPE).stdout
    nvidia_smi = ET.fromstring(nvidia_smi_output)
    gpus = nvidia_smi.findall('gpu')
    ver = nvidia_smi.findall('driver_version')

    LOGGER.log(
        key="gpu_info",
        value={
            "driver_version": ver[0].text,
            "num": len(gpus),
            "name": [g.find('product_name').text for g in gpus],
            "mem": [g.find('fb_memory_usage').find('total').text for g in gpus]
        })
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference')
    parser = parse_training_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model, args = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    os.makedirs(args.output_dir, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    anchor_dirs = [os.path.join(args.dataset_path, anchor) for anchor in args.training_anchor_dirs]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length,
                        args.n_mel_channels, args.sampling_rate, args.mel_fmin, args.mel_fmax)
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        for speaker_id in range(len(anchor_dirs)):
            metadata = metadatas[speaker_id]
            for npy_path, text in tqdm(metadata):
                seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
                seq_lens = torch.IntTensor([len(text)])
                wav = load_wav_to_torch(npy_path)
                mel = stft.mel_spectrogram(wav.unsqueeze(0))
                mel = mel.squeeze()
                # Pad the target length up to a multiple of n_frames_per_step
                max_target_len = mel.size(1) - 1
                max_target_len += args.n_frames_per_step - max_target_len % args.n_frames_per_step
                padded_mel = np.pad(mel, [(0, 0), (0, max_target_len - mel.size(1))],
                                    mode='constant', constant_values=args.mel_pad_val)
                target = padded_mel[:, ::args.n_frames_per_step]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])
                outputs = model.infer(to_gpu(seqs).long(), to_gpu(seq_lens).int(),
                                      to_gpu(targets).half(), to_gpu(target_lengths).int())
                _, mel_out, _, _ = [output.cpu() for output in outputs if output is not None]
                mel_out = mel_out.squeeze()[:, :mel.size(-1) - 1]
                assert mel_out.shape[-1] == wav.shape[-1] // args.hop_length
                fname = os.path.basename(npy_path)
                np.save(os.path.join(args.output_dir, fname), mel_out, allow_pickle=False)
                # GTA synthesis
                # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze())
                # wav = griffin_lim(magnitudes, stft.stft_fn, 60)
                # save_wav(wav, os.path.join(args.output_dir, 'eval.wav'))

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=measurements['tacotron2_time'])
    LOGGER.iteration_stop()
    LOGGER.finish()
def main(_):
    """
    Starting point of the application
    """
    flags = PARSER.parse_args()
    params = _cmd_params(flags)

    tf.logging.set_verbosity(tf.logging.ERROR)

    # Optimization flags (boolean env vars; '1' enables each optimization)
    os.environ['CUDA_CACHE_DISABLE'] = '0'
    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'

    if params['use_amp']:
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

    hvd.init()

    # Build run config
    gpu_options = tf.GPUOptions()
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True
    config.intra_op_parallelism_threads = 1
    config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)

    run_config = tf.estimator.RunConfig(
        save_summary_steps=1,
        tf_random_seed=None,
        session_config=config,
        save_checkpoints_steps=params['max_steps'],
        keep_checkpoint_max=1)

    # Build the estimator model
    estimator = tf.estimator.Estimator(
        model_fn=unet_fn,
        model_dir=params['model_dir'],
        config=run_config,
        params=params)

    dataset = Dataset(data_dir=params['data_dir'],
                      batch_size=params['batch_size'],
                      augment=params['augment'],
                      gpu_id=hvd.rank(),
                      num_gpus=hvd.size(),
                      seed=params['seed'])

    if 'train' in params['exec_mode']:
        hooks = [hvd.BroadcastGlobalVariablesHook(0),
                 TrainingHook(params['log_every'])]

        if params['benchmark']:
            hooks.append(ProfilingHook(params['batch_size'],
                                       params['log_every'],
                                       params['warmup_steps']))

        LOGGER.log('Begin Training...')
        LOGGER.log(tags.RUN_START)
        estimator.train(input_fn=dataset.train_fn,
                        steps=params['max_steps'],
                        hooks=hooks)
        LOGGER.log(tags.RUN_STOP)

    if 'predict' in params['exec_mode']:
        if hvd.rank() == 0:
            predict_steps = dataset.test_size
            hooks = None
            if params['benchmark']:
                hooks = [ProfilingHook(params['batch_size'],
                                       params['log_every'],
                                       params['warmup_steps'])]
                predict_steps = params['warmup_steps'] * 2 * params['batch_size']

            LOGGER.log('Begin Predict...')
            LOGGER.log(tags.RUN_START)

            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=math.ceil(predict_steps / dataset.test_size)),
                hooks=hooks)

            binary_masks = [np.argmax(p['logits'], axis=-1).astype(np.uint8) * 255
                            for p in predictions]
            LOGGER.log(tags.RUN_STOP)

            multipage_tif = [Image.fromarray(mask).resize(size=(512, 512), resample=Image.BILINEAR)
                             for mask in binary_masks]

            output_dir = os.path.join(params['model_dir'], 'pred')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            multipage_tif[0].save(os.path.join(output_dir, 'test-masks.tif'),
                                  compression="tiff_deflate",
                                  save_all=True,
                                  append_images=multipage_tif[1:])

            LOGGER.log("Predict finished")
            LOGGER.log("Results available in: {}".format(output_dir))
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.amp_run)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.amp_run)
    denoiser = Denoiser(waveglow).cuda()

    # Route forward() to infer() so the scripted module runs inference
    tacotron2.forward = tacotron2.infer
    type(tacotron2).forward = type(tacotron2).infer
    jitted_tacotron2 = torch.jit.script(tacotron2)

    texts = []
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except IOError:
        print("Could not read file")
        sys.exit(1)

    if args.include_warmup:
        sequence = torch.randint(low=0, high=148, size=(1, 50), dtype=torch.long).cuda()
        input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for i in range(3):
            with torch.no_grad():
                mel, mel_lengths = jitted_tacotron2(sequence, input_lengths)
                _ = waveglow.infer(mel)

    LOGGER.iteration_start()

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
        audios = waveglow.infer(mel, sigma=args.sigma_infer)
        audios = audios.float()
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    tacotron2_infer_perf = mel.size(0) * mel.size(2) / measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0) * audios.size(1) / measurements['waveglow_time']

    LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf)
    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="waveglow_items_per_sec", value=waveglow_infer_perf)
    LOGGER.log(key="waveglow_latency", value=measurements['waveglow_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time'] + measurements['waveglow_time']))

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = args.output + "audio_" + str(i) + ".wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    LOGGER.iteration_stop()
    LOGGER.finish()
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    # A Tacotron2 model filepath was specified
    if args.tacotron2:
        # Setup Tacotron2
        tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.fp16_run)
    # A file with a mel spectrogram was specified
    elif args.mel_file:
        mel = torch.load(args.mel_file)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)

    # Setup WaveGlow
    if args.old_waveglow:
        waveglow = torch.load(args.waveglow)['model']
        waveglow = waveglow.remove_weightnorm(waveglow)
        waveglow = waveglow.cuda()
        waveglow.eval()
    else:
        waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.fp16_run)

    texts = []
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except IOError:
        print("Could not read file. Using default text.")
        texts = ["The forms of printed letters should be beautiful, and "
                 "that their arrangement on the page should be reasonable and "
                 "a help to the shapeliness of the letters themselves."]

    for i, text in enumerate(texts):
        LOGGER.iteration_start()

        sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()

        if args.tacotron2:
            tacotron2_t0 = time.time()
            with torch.no_grad():
                _, mel, _, _ = tacotron2.inference(sequence)
            tacotron2_t1 = time.time()
            tacotron2_infer_perf = sequence.size(1) / (tacotron2_t1 - tacotron2_t0)
            LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf)

        waveglow_t0 = time.time()
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=args.sigma_infer)
            audio = audio.float()
        waveglow_t1 = time.time()
        waveglow_infer_perf = audio[0].size(0) / (waveglow_t1 - waveglow_t0)

        audio_path = args.output + "audio_" + str(i) + ".wav"
        write(audio_path, args.sampling_rate, audio[0].data.cpu().numpy())

        LOGGER.log(key="waveglow_items_per_sec", value=waveglow_infer_perf)
        LOGGER.iteration_stop()

    LOGGER.finish()
def after_run(self, run_context, run_values):
    batch_time = time.time() - self._t0
    imgs_per_sec = int(self._global_batch_size / batch_time)

    is_log_step = self._current_step % self._log_every == 0

    if is_log_step:
        if self._current_step > self._warmup_steps:
            imgs_per_sec = float(ProfilerHook.moving_average(self._processing_speed_arr, n=30)[-1])

        LOGGER.log("iteration", int(self._current_step))
        LOGGER.log("total_ips", float(imgs_per_sec))

        if self._is_training:
            LOGGER.log("weight_decay", float(run_values.results["weight_decay"]))
            LOGGER.log("reconstruction_loss", float(run_values.results["reconstruction_loss"]))
            LOGGER.log("total_loss", float(run_values.results["total_loss"]))
            LOGGER.log("learning_rate", float(run_values.results["learning_rate"]))

        for key, val in sorted(run_values.results["iou_scores"].items(), key=operator.itemgetter(0)):
            LOGGER.log("iou_score - THS %s" % key, float(val))

        LOGGER.log("True Positives:", run_values.results["confusion_matrix"]["tp"])
        LOGGER.log("True Negatives:", run_values.results["confusion_matrix"]["tn"])
        LOGGER.log("False Positives:", run_values.results["confusion_matrix"]["fp"])
        LOGGER.log("False Negatives:", run_values.results["confusion_matrix"]["fn"])

        if self._sample_dir is not None and self._is_training:
            for key in sorted(run_values.results["samples"].keys(), key=operator.itemgetter(0)):
                with open(os.path.join(self._sample_dir,
                                       "sample_step_%04d_ths_%s.jpeg" % (self._current_step, key)),
                          'wb') as fd:
                    fd.write(run_values.results["samples"][key])

            with open(os.path.join(self._sample_dir,
                                   "sample_step_%04d_mask.jpeg" % self._current_step),
                      'wb') as fd:
                fd.write(run_values.results["samples"]["mask"])

        print("######### STOP: %d ##############" % self._current_step)

    elif self._current_step > self._warmup_steps:
        # Do not store speed for log steps due to the additional fetches they run
        self._processing_speed_arr.append(imgs_per_sec)