Example #1
        def training_data_fn():

            if self.run_hparams.data_dir is not None:

                return data_utils.get_tfrecords_input_fn(
                    data_dir=self.run_hparams.data_dir,
                    num_epochs=num_iter,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    # distort_color=self.run_hparams.distort_colors,
                    # num_threads=self.run_hparams.num_preprocessing_threads,
                    datasets_num_private_threads=None
                    # deterministic=False if self.run_hparams.seed is None else True
                )

            else:
                if hvd.rank() == 0:
                    LOGGER.log("Using Synthetic Data ...")
                return data_utils.get_synth_input_fn(
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    num_classes=self.run_hparams.n_classes,
                    dtype=self.run_hparams.dtype,
                )
Example #2
    def _get_session_config(mode, use_xla):

        if mode not in ["train", 'validation', 'benchmark']:
            raise ValueError(
                "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark')"
                % mode)

        config = tf.ConfigProto()

        config.allow_soft_placement = True
        config.log_device_placement = False

        config.gpu_options.allow_growth = True

        if hvd_utils.is_using_hvd():
            config.gpu_options.visible_device_list = str(hvd.local_rank())

        if use_xla:
            LOGGER.log("XLA is activated - Experimental Feature")
            config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

        config.gpu_options.force_gpu_compatible = True  # Force pinned memory

        if mode == 'train':
            config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads

            if hvd_utils.is_using_hvd():
                config.inter_op_parallelism_threads = max(
                    2, (multiprocessing.cpu_count() // hvd.size()) - 2)
            else:
                config.inter_op_parallelism_threads = 4

        return config
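
A minimal usage sketch of how the ConfigProto built above is typically consumed (TF1 Estimator API; the model_fn and model_dir below are placeholders, and _get_session_config is assumed to be reachable, e.g. as a staticmethod on the runner class):

import tensorflow as tf  # TF1.x

# Sketch only: placeholder model_fn / model_dir, not part of the example above.
session_config = _get_session_config(mode='train', use_xla=False)
run_config = tf.estimator.RunConfig(tf_random_seed=None,
                                    session_config=session_config,
                                    keep_checkpoint_max=1)
estimator = tf.estimator.Estimator(model_fn=my_model_fn,  # placeholder
                                   model_dir='/tmp/model_dir',
                                   config=run_config)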
Example #3
File: train.py Project: herenje/tacotron2
def validate(model, criterion, valate_dataset, iteration, collate_fn,
             distributed_run, args):
    """Handles all the validation scoring and printing"""
    with evaluating(model), torch.no_grad():
        val_loader = DataLoader(valate_dataset,
                                num_workers=1,
                                shuffle=False,
                                batch_size=args.batch_size //
                                len(args.validation_anchor_dirs),
                                pin_memory=False,
                                collate_fn=collate_fn)

        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y, num_frames = batch_to_gpu(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data,
                                                 args.world_size).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)

    LOGGER.log(key="val_iter_loss", value=reduced_val_loss)
Example #4
    def predict(self):
        """Perform prediction with the runner's classifier """

        if hvd.rank() == 0:
            LOGGER.log("Begin predict...")

            begin = time.time()

            pred = self._classifier.predict(input_fn=self._dataset.test_fn)

            predictions = [p['logits'] for p in pred]

            print('Inference took: {} sec'.format(time.time() - begin))

            binary_masks = [np.argmax(p, axis=-1).astype(np.uint8) * 255 for p in predictions]
            multipage_tif = [Image.fromarray(mask).resize(size=(512, 512), resample=Image.BILINEAR)
                             for mask in binary_masks]

            output_dir = os.path.join(self._model_dir, 'pred')

            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            multipage_tif[0].save(os.path.join(output_dir, 'test-masks.tif'),
                                  compression="tiff_deflate",
                                  save_all=True,
                                  append_images=multipage_tif[1:])

            pickle.dump(predictions, open(os.path.join(output_dir, 'predictions.pkl'), 'wb'))

            LOGGER.log("Predict finished")
Example #5
        def evaluation_data_fn():

            if self.run_hparams.data_dir is not None:
                return data_utils.get_tfrecords_input_fn(
                    filenames=filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=False,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False
                    if self.run_hparams.seed is None else True)

            else:
                LOGGER.log("Using Synthetic Data ...\n")
                return data_utils.get_synth_input_fn(
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    num_classes=self.run_hparams.n_classes,
                    dtype=self.run_hparams.dtype,
                )
Example #6
        def evaluation_data_fn():

            if not is_benchmark or self.run_hparams.data_dir is not None:

                return self.dataset.dataset_fn(
                    batch_size=batch_size,
                    training=False,
                    input_shape=list(self.run_hparams.input_shape) +
                    [self.run_hparams.n_channels],
                    mask_shape=list(self.run_hparams.mask_shape) +
                    [self.run_hparams.n_channels],
                    num_threads=64,
                    use_gpu_prefetch=True,
                    normalize_data_method="zero_centered",
                    only_defective_images=False,
                    augment_data=False,
                    seed=self.run_hparams.seed)

            else:
                LOGGER.log("Using Synthetic Data ...")

                return self.dataset.synth_dataset_fn(
                    batch_size=batch_size,
                    training=False,
                    input_shape=list(self.run_hparams.input_shape) +
                    [self.run_hparams.n_channels],
                    mask_shape=list(self.run_hparams.mask_shape) +
                    [self.run_hparams.n_channels],
                    num_threads=64,
                    use_gpu_prefetch=True,
                    normalize_data_method="zero_centered",
                    only_defective_images=False,
                    augment_data=False,
                    seed=self.run_hparams.seed)
Example #7
    def end(self, session):

        try:
            avg_processing_speed = float(
                ProfilerHook.moving_average(self._processing_speed_arr,
                                            n=100)[-1])
        except:
            avg_processing_speed = float(np.mean(self._processing_speed_arr))

        total_processing_time = time.time() - self._start_training_time

        total_processing_hours, rem = divmod(total_processing_time, 3600)
        total_processing_minutes, total_processing_seconds = divmod(rem, 60)

        LOGGER.log("Final Summary:\n"
                   "\t[*] Average Imgs/sec: %d\n"
                   "\t[*] Total Processing Time: %dh %02dm %02ds\n" %
                   (avg_processing_speed, total_processing_hours,
                    total_processing_minutes, total_processing_seconds))

        perf_dict = {
            'throughput': str(avg_processing_speed),
            'processing_time': str(total_processing_time)
        }

        perf_filename = "performances_%s.json" % ("train" if self._is_training
                                                  else "eval")

        with open(os.path.join(self._sample_dir, "..", perf_filename),
                  'w') as f:
            json.dump(perf_dict, f)
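
ProfilerHook.moving_average is referenced above but not shown; a plausible minimal sketch (an assumption about its behavior, written as a plain function here):

import numpy as np


def moving_average(arr, n=100):
    """Trailing moving average over a 1-D sequence. With fewer than n samples the
    result is empty, so indexing [-1] raises, which is why the caller above falls
    back to np.mean inside an except."""
    ret = np.cumsum(arr, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n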
Example #8
def validate(model, criterion, valset, iteration, batch_size, world_size,
             collate_fn, distributed_run, rank, batch_to_gpu, fp16_run):
    """Handles all the validation scoring and printing"""
    with evaluating(model), torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset,
                                num_workers=1,
                                shuffle=False,
                                sampler=val_sampler,
                                batch_size=batch_size,
                                pin_memory=False,
                                collate_fn=collate_fn)

        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y, len_x = batch_to_gpu(batch)
            if fp16_run:
                y_pred = model(fp32_to_fp16(x))
                loss = criterion(fp16_to_fp32(y_pred), y)
            else:
                y_pred = model(x)
                loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, world_size).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)

    LOGGER.log(key="val_iter_loss", value=reduced_val_loss)
Example #9
    def after_create_session(self, session, coord):

        params_count = tf.get_default_graph().get_tensor_by_name("trainable_parameters_count_ref:0")
        _params_count = session.run(params_count)

        LOGGER.log("# Total Trainable Parameters:", int(_params_count))

        self._start_training_time = time.time()
Example #10
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    if args.include_warmup:
        sequences = torch.randint(low=0, high=148, size=(1,50), dtype=torch.long).cuda()
        text_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for i in range(3):
            with torch.no_grad():
                _, mels, _, _, mel_lengths = model.infer(sequences, text_lengths)

    os.makedirs(args.output, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    anchor_dirs = [os.path.join(args.dataset_path, anchor) for anchor in args.anchor_dirs]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        for speaker_id in range(len(anchor_dirs)):
            metadata = metadatas[speaker_id]
            for mel_path, text in tqdm(metadata):
                seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
                seq_lens = torch.IntTensor([len(text)])
                melspec = torch.from_numpy(np.load(mel_path))
                target = melspec[:, ::args.reduction_factor]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])
                inputs = (to_gpu(seqs).long(), to_gpu(seq_lens).int(), to_gpu(targets).float(), to_gpu(target_lengths).int())
                _, mel_outs, _, _ = model(inputs)
                fname = os.path.basename(mel_path)
                np.save(os.path.join(args.output, fname), mel_outs[0, :, :melspec.shape[1]], allow_pickle=False)

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time']))
    LOGGER.iteration_stop()
    LOGGER.finish()
Example #11
    def after_run(self, run_context, run_values):
        batch_time = time.time() - self.t0
        ips = self.global_batch_size / batch_time
        if self.current_step >= self.warmup_steps:
            LOGGER.log("iteration", int(self.current_step))
            LOGGER.log("total_ips", float(ips))
            LOGGER.iteration_stop()

        self.current_step += 1
Example #12
def _log_hparams(classname, layername, **kwargs):

    log_msg = "%s: `%s`" % (classname, layername)

    for arg, val in sorted(kwargs.items()):
        log_msg += "\n\t[*] {}: {}".format(arg, val)

    log_msg += "\n"

    if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
        LOGGER.log(log_msg)
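
A hypothetical call site for the helper above (the class, layer name, and keyword arguments are illustrative only):

# Illustrative only: emits one formatted block per layer, restricted to local rank 0 under Horovod.
_log_hparams(classname="Conv2D",
             layername="conv1",
             kernel_size=(3, 3),
             filters=64,
             use_bias=True)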
Example #13
    def train(self):
        """Perform training with the runner's classifier"""
        LOGGER.log("Begin training...")

        try:
            self._classifier.train(input_fn=self._dataset.train_fn,
                                   steps=self._max_steps,
                                   hooks=self._training_hooks)
        except KeyboardInterrupt:
            print("Keyboard interrupt")

        LOGGER.log("Training finished")
Example #14
    def _get_session_config(mode,
                            use_xla,
                            use_dali,
                            gpu_memory_fraction,
                            gpu_id=0):

        if mode not in ["train", 'validation', 'benchmark', 'inference']:
            raise ValueError(
                "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')"
                % mode)

        # Limit available GPU memory (tune the size)
        if use_dali:
            LOGGER.log(
                "DALI is activated, GPU memory fraction used for training is limited to",
                gpu_memory_fraction)
            gpu_options = tf.GPUOptions(
                per_process_gpu_memory_fraction=gpu_memory_fraction)
            config = tf.ConfigProto(gpu_options=gpu_options)
            config.gpu_options.allow_growth = False

        else:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True

        config.allow_soft_placement = True
        config.log_device_placement = False

        config.gpu_options.visible_device_list = str(gpu_id)

        if hvd_utils.is_using_hvd():
            config.gpu_options.visible_device_list = str(hvd.local_rank())

        if use_xla:
            LOGGER.log("XLA is activated - Experimental Feature")
            config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

        config.gpu_options.force_gpu_compatible = True  # Force pinned memory

        if mode == 'train':
            config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads

            if hvd_utils.is_using_hvd():
                config.inter_op_parallelism_threads = max(
                    2, (multiprocessing.cpu_count() // hvd.size()) - 2)
            else:
                config.inter_op_parallelism_threads = 4

        return config
Example #15
File: main.py Project: Tejalsjsu/bert
def main(_):
    """
    Starting point of the application
    """

    flags = PARSER.parse_args()

    params = _cmd_params(flags)

    tf.logging.set_verbosity(tf.logging.ERROR)

    # Optimization flags
    os.environ['CUDA_CACHE_DISABLE'] = '0'

    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'

    if params['use_amp']:
        assert params['dtype'] == tf.float32, "TF-AMP requires FP32 precision"

        LOGGER.log("TF AMP is activated - Experimental Feature")
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

    runner = Runner(params)

    if 'train' in params['exec_mode'] \
            or 'train_and_predict' in params['exec_mode']:
        runner.train()
    if 'train_and_predict' in params['exec_mode'] \
            or 'predict' in params['exec_mode']:
        runner.predict()
    if 'benchmark' in params['exec_mode']:
        runner.benchmark()
Example #16
    def end(self, session):

        try:
            avg_processing_speed = float(ProfilerHook.moving_average(self._processing_speed_arr, n=100)[-1])
        except:
            avg_processing_speed = float(np.mean(self._processing_speed_arr))

        total_processing_time = time.time() - self._start_training_time

        total_processing_hours, rem = divmod(total_processing_time, 3600)
        total_processing_minutes, total_processing_seconds = divmod(rem, 60)

        LOGGER.log(
            "Final Summary:\n"
            "\t[*] Average Imgs/sec: %d\n"
            "\t[*] Total Processing Time: %dh %02dm %02ds\n" %
            (avg_processing_speed, total_processing_hours, total_processing_minutes, total_processing_seconds)
        )
Example #17
    def _build_hparams(*args):

        hparams = tf.contrib.training.HParams()

        for _hparams in args:
            if not isinstance(_hparams, tf.contrib.training.HParams):
                raise ValueError("Non valid HParams argument object detected:",
                                 _hparams)

            for key, val in _hparams.values().items():
                try:
                    hparams.add_hparam(name=key, value=val)

                except ValueError:
                    LOGGER.log(
                        "the parameter `{}` already exists - existing value: {} and duplicated value: {}"
                        .format(key, hparams.get(key), val))

        return hparams
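
A small usage sketch (TF1 with tf.contrib available; the parameter names are made up, and _build_hparams is assumed to be reachable as a staticmethod on the runner class). Duplicate keys keep the first value seen and are reported through LOGGER.log instead of raising:

import tensorflow as tf  # TF1.x with tf.contrib

# Sketch only: illustrative hyper-parameters.
base = tf.contrib.training.HParams(learning_rate=0.1, batch_size=64)
overrides = tf.contrib.training.HParams(batch_size=128, weight_decay=1e-4)

merged = _build_hparams(base, overrides)
# merged.batch_size is still 64; the duplicated key is logged rather than overwritten.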
Example #18
        def training_data_fn():
            
            if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
                if hvd.rank() == 0:
                    LOGGER.log("Using DALI input... ")
                    
                return data_utils.get_dali_input_fn(
                    filenames=filenames,
                    idx_filenames=idx_filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False if self.run_hparams.seed is None else True
                )
            
            elif self.run_hparams.data_dir is not None:

                return data_utils.get_tfrecords_input_fn(
                    filenames=filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False if self.run_hparams.seed is None else True
                )

            else:
                if hvd.rank() == 0:
                    LOGGER.log("Using Synthetic Data ...")
                return data_utils.get_synth_input_fn(
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    num_classes=self.run_hparams.n_classes,
                    dtype=self.run_hparams.dtype,
                )
Example #19
    def __init__(self, params):
        hvd.init()

        LOGGER.log(str(params))

        data_dir = params['data_dir']
        batch_size = params['batch_size']
        augment = params['augment']
        benchmark = params['benchmark']
        seed = params['seed']

        self._model_dir = params['model_dir']
        self._max_steps = params['max_steps']

        self._classifier = tf.estimator.Estimator(
            model_fn=_model_fn,
            model_dir=self._model_dir,
            params=params,
            config=tf.estimator.RunConfig(
                tf_random_seed=None,
                session_config=self._get_session_config(),
                save_checkpoints_steps=self._max_steps
                if hvd.rank() == 0 else None,
                keep_checkpoint_max=1))

        self._dataset = Dataset(data_dir=data_dir,
                                batch_size=batch_size,
                                augment=augment,
                                gpu_id=hvd.rank(),
                                num_gpus=hvd.size(),
                                seed=seed)

        self._training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        if benchmark and hvd.rank() == 0:
            self._training_hooks.append(
                ProfilerHook(self._model_dir,
                             batch_size,
                             log_every=params['log_every'],
                             warmup_steps=params['warmup_steps']))
Example #20
def learning_rate_scheduler(lr_init, lr_warmup_epochs, global_step, batch_size,
                            num_batches_per_epoch, num_decay_steps, num_gpus,
                            use_cosine_lr):
    def get_scaled_base_learning_rate():
        """Calculates base learning rate for creating lr schedule.
        In replicated mode, gradients are summed rather than averaged which, with
        the sgd and momentum optimizers, increases the effective learning rate by
        lr * num_gpus. Dividing the base lr by num_gpus negates the increase.
        Args:
          batch_size: Total batch-size.
        Returns:
          Base learning rate to use to create lr schedule.
        """

        base_lr = lr_init * num_gpus

        # Starting LR = 0.1 with BS = 256, else linearly scale
        return base_lr * (batch_size / 256.0)

    rescaled_lr = get_scaled_base_learning_rate()

    if use_cosine_lr:
        LOGGER.log("Using cosine learning rate schedule")
        lr = tf.train.cosine_decay(rescaled_lr, global_step, num_decay_steps)

    else:
        LOGGER.log("Using step learning rate schedule")
        boundaries = [int(num_batches_per_epoch * x) for x in [30, 60, 80, 90]]

        values = [1e0, 1e-1, 1e-2, 1e-3, 1e-4]
        values = [rescaled_lr * v for v in values]

        lr = tf.train.piecewise_constant(global_step, boundaries, values)

    warmup_steps = int(num_batches_per_epoch * lr_warmup_epochs)
    warmup_lr = (rescaled_lr * tf.cast(global_step, tf.float32) /
                 tf.cast(warmup_steps, tf.float32))

    return tf.cond(global_step < warmup_steps, lambda: warmup_lr, lambda: lr)
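
A sketch of how such a schedule is typically wired into a TF1 training graph (the hyper-parameter values below are placeholders, not taken from any example here):

import tensorflow as tf  # TF1.x

# Sketch only: illustrative values for an ImageNet-style setup.
global_step = tf.train.get_or_create_global_step()
lr = learning_rate_scheduler(lr_init=0.1,
                             lr_warmup_epochs=5,
                             global_step=global_step,
                             batch_size=256,
                             num_batches_per_epoch=5005,
                             num_decay_steps=90 * 5005,
                             num_gpus=1,
                             use_cosine_lr=False)
optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9, use_nesterov=True)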
Example #21
    def predict(self, to_predict):

        estimator_params = {}

        if to_predict is not None:
            filenames = runner_utils.parse_inference_input(to_predict)

        image_classifier = self._get_estimator(
            mode='inference',
            run_params=estimator_params,
            use_xla=self.run_hparams.use_xla,
            use_dali=self.run_hparams.use_dali,
            gpu_memory_fraction=self.run_hparams.gpu_memory_fraction)

        inference_hooks = []

        def inference_data_fn():
            return data_utils.get_inference_input_fn(
                filenames=filenames,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_threads=self.run_hparams.num_preprocessing_threads)

        try:
            inference_results = image_classifier.predict(
                input_fn=inference_data_fn,
                predict_keys=None,
                hooks=inference_hooks,
                yield_single_examples=True)

            for result in inference_results:
                LOGGER.log(result['classes'],
                           str(result['probabilities'][result['classes']]))

        except KeyboardInterrupt:
            print("Keyboard interrupt")

        LOGGER.log('Ending Inference ...')
Example #22
        def evaluation_data_fn():

            if self.run_hparams.data_dir is not None:
                return data_utils.get_tfrecords_input_fn(
                    data_dir=self.run_hparams.data_dir,
                    num_epochs=num_iter,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    datasets_num_private_threads=None)

            else:
                LOGGER.log("Using Synthetic Data ...\n")
                return data_utils.get_synth_input_fn(
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    num_classes=self.run_hparams.n_classes,
                    dtype=self.run_hparams.dtype,
                )
Example #23
    def after_run(self,
                  run_context,
                  run_values):
        cross_loss, dice_loss, total_loss = run_values.results

        batch_time = time.time() - self._t0
        ips = self._global_batch_size / batch_time
        ips *= hvd.size()

        if self._current_step >= self._warmup_steps:
            LOGGER.log("iteration", int(self._current_step))
            LOGGER.log("loss", float(cross_loss))
            LOGGER.log("dice_loss", float(dice_loss))
            LOGGER.log("total_loss", float(total_loss))
            self._perf.record(ips)
            LOGGER.iteration_stop()

        self._current_step += 1
Example #24
    def after_run(self, run_context, run_values):

        #global_step, cross_entropy, logits_flat, labels_flat, rgb_inputs, depth_inputs, labels, logits = run_values.results
        global_step, cross_entropy = run_values.results
        batch_time = time.time() - self.t0
        ips = self.global_batch_size / batch_time

        LOGGER.log("iteration", int(self.current_step))
        LOGGER.log("imgs_per_sec", float(ips))
        LOGGER.log("cross_entropy", float(cross_entropy))
        LOGGER.iteration_stop()
        self.current_step += 1
Example #25
def log_hardware():
    # TODO: asserts - what if you cannot launch those commands?
    # number of CPU threads
    cpu_info_command = 'cat /proc/cpuinfo'
    cpu_info = subprocess.run(cpu_info_command.split(),
                              stdout=subprocess.PIPE).stdout.split()
    cpu_num_index = len(cpu_info) - cpu_info[::-1].index(b'processor') + 1
    cpu_num = int(cpu_info[cpu_num_index]) + 1

    # CPU name
    cpu_name_begin_index = cpu_info.index(b'name')
    cpu_name_end_index = cpu_info.index(b'stepping')
    cpu_name = b' '.join(cpu_info[cpu_name_begin_index +
                                  2:cpu_name_end_index]).decode('utf-8')

    LOGGER.log(key='cpu_info', value={"num": cpu_num, "name": cpu_name})

    # RAM memory
    ram_info_command = 'free -m -h'
    ram_info = subprocess.run(ram_info_command.split(),
                              stdout=subprocess.PIPE).stdout.split()
    ram_index = ram_info.index(b'Mem:') + 1
    ram = ram_info[ram_index].decode('utf-8')

    LOGGER.log(key='mem_info', value={"ram": ram})

    # GPU
    nvidia_smi_command = 'nvidia-smi -q -x'
    nvidia_smi_output = subprocess.run(nvidia_smi_command.split(),
                                       stdout=subprocess.PIPE).stdout
    nvidia_smi = ET.fromstring(nvidia_smi_output)
    gpus = nvidia_smi.findall('gpu')
    ver = nvidia_smi.findall('driver_version')

    LOGGER.log(
        key="gpu_info",
        value={
            "driver_version": ver[0].text,
            "num": len(gpus),
            "name": [g.find('product_name').text for g in gpus],
            "mem":
            [g.find('fb_memory_usage').find('total').text for g in gpus]
        })
Example #26
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_training_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model, args = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    os.makedirs(args.output_dir, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    anchor_dirs = [
        os.path.join(args.dataset_path, anchor)
        for anchor in args.training_anchor_dirs
    ]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length,
                        args.n_mel_channels, args.sampling_rate, args.mel_fmin,
                        args.mel_fmax)
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        for speaker_id in range(len(anchor_dirs)):
            metadata = metadatas[speaker_id]
            for npy_path, text in tqdm(metadata):
                seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
                seq_lens = torch.IntTensor([len(text)])
                wav = load_wav_to_torch(npy_path)
                mel = stft.mel_spectrogram(wav.unsqueeze(0))
                mel = mel.squeeze()
                max_target_len = mel.size(1) - 1
                max_target_len += args.n_frames_per_step - max_target_len % args.n_frames_per_step
                padded_mel = np.pad(mel, [(0, 0),
                                          (0, max_target_len - mel.size(1))],
                                    mode='constant',
                                    constant_values=args.mel_pad_val)
                target = padded_mel[:, ::args.n_frames_per_step]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])
                outputs = model.infer(
                    to_gpu(seqs).long(),
                    to_gpu(seq_lens).int(),
                    to_gpu(targets).half(),
                    to_gpu(target_lengths).int())
                _, mel_out, _, _ = [
                    output.cpu() for output in outputs if output is not None
                ]
                mel_out = mel_out.squeeze()[:, :mel.size(-1) - 1]
                assert (mel_out.shape[-1] == wav.shape[-1] // args.hop_length)
                fname = os.path.basename(npy_path)
                np.save(os.path.join(args.output_dir, fname),
                        mel_out,
                        allow_pickle=False)
                # GTA synthesis
                # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze())
                # wav = griffin_lim(magnitudes, stft.stft_fn, 60)
                # save_wav(wav, os.path.join(args.output_dir, 'eval.wav'))

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time']))
    LOGGER.iteration_stop()
    LOGGER.finish()
Example #27
def main(_):
    """
    Starting point of the application
    """

    flags = PARSER.parse_args()

    params = _cmd_params(flags)

    tf.logging.set_verbosity(tf.logging.ERROR)

    # Optimization flags
    os.environ['CUDA_CACHE_DISABLE'] = '0'

    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'

    if params['use_amp']:
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION']='1'

    hvd.init()

    # Build run config
    gpu_options = tf.GPUOptions()
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True
    config.intra_op_parallelism_threads = 1
    config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)

    run_config = tf.estimator.RunConfig(
        save_summary_steps=1,
        tf_random_seed=None,
        session_config=config,
        save_checkpoints_steps=params['max_steps'],
        keep_checkpoint_max=1)

    # Build the estimator model
    estimator = tf.estimator.Estimator(
        model_fn=unet_fn,
        model_dir=params['model_dir'],
        config=run_config,
        params=params)

    dataset = Dataset(data_dir=params['data_dir'],
                      batch_size=params['batch_size'],
                      augment=params['augment'],
                      gpu_id=hvd.rank(),
                      num_gpus=hvd.size(),
                      seed=params['seed'])

    if 'train' in params['exec_mode']:
        hooks = [hvd.BroadcastGlobalVariablesHook(0),
                 TrainingHook(params['log_every'])]

        if params['benchmark']:
            hooks.append(ProfilingHook(params['batch_size'],
                                       params['log_every'],
                                       params['warmup_steps']))

        LOGGER.log('Begin Training...')

        LOGGER.log(tags.RUN_START)
        estimator.train(
            input_fn=dataset.train_fn,
            steps=params['max_steps'],
            hooks=hooks)
        LOGGER.log(tags.RUN_STOP)

    if 'predict' in params['exec_mode']:
        if hvd.rank() == 0:
            predict_steps = dataset.test_size
            hooks = None
            if params['benchmark']:
                hooks = [ProfilingHook(params['batch_size'],
                                       params['log_every'],
                                       params['warmup_steps'])]
                predict_steps = params['warmup_steps'] * 2 * params['batch_size']

            LOGGER.log('Begin Predict...')
            LOGGER.log(tags.RUN_START)

            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=math.ceil(predict_steps/dataset.test_size)),
                hooks=hooks)

            binary_masks = [np.argmax(p['logits'], axis=-1).astype(np.uint8) * 255 for p in predictions]
            LOGGER.log(tags.RUN_STOP)

            multipage_tif = [Image.fromarray(mask).resize(size=(512, 512), resample=Image.BILINEAR)
                             for mask in binary_masks]

            output_dir = os.path.join(params['model_dir'], 'pred')

            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            multipage_tif[0].save(os.path.join(output_dir, 'test-masks.tif'),
                                  compression="tiff_deflate",
                                  save_all=True,
                                  append_images=multipage_tif[1:])

            LOGGER.log("Predict finished")
            LOGGER.log("Results available in: {}".format(output_dir))
Example #28
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_items_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_items_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run)
    denoiser = Denoiser(waveglow).cuda()

    tacotron2.forward = tacotron2.infer
    type(tacotron2).forward = type(tacotron2).infer
    jitted_tacotron2 = torch.jit.script(tacotron2)

    texts = []
    try:
        f = open(args.input, 'r')
        texts = f.readlines()
    except:
        print("Could not read file")
        sys.exit(1)

    if args.include_warmup:
        sequence = torch.randint(low=0,
                                 high=148,
                                 size=(1, 50),
                                 dtype=torch.long).cuda()
        input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for i in range(3):
            with torch.no_grad():
                mel, mel_lengths = jitted_tacotron2(sequence, input_lengths)
                _ = waveglow.infer(mel)

    LOGGER.iteration_start()

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
        audios = waveglow.infer(mel, sigma=args.sigma_infer)
        audios = audios.float()
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    tacotron2_infer_perf = mel.size(0) * mel.size(
        2) / measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0) * audios.size(
        1) / measurements['waveglow_time']

    LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf)
    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="waveglow_items_per_sec", value=waveglow_infer_perf)
    LOGGER.log(key="waveglow_latency", value=measurements['waveglow_time'])
    LOGGER.log(key="latency",
               value=(measurements['tacotron2_time'] +
                      measurements['waveglow_time']))

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = args.output + "audio_" + str(i) + ".wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    LOGGER.iteration_stop()
    LOGGER.finish()
Example #29
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    # tacotron2 model filepath was specified
    if args.tacotron2:
        # Setup Tacotron2
        tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.fp16_run)
    # file with mel spectrogram was specified
    elif args.mel_file:
        mel = torch.load(args.mel_file)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)

    # Setup WaveGlow
    if args.old_waveglow:
        waveglow = torch.load(args.waveglow)['model']
        waveglow = waveglow.remove_weightnorm(waveglow)
        waveglow = waveglow.cuda()
        waveglow.eval()
    else:
        waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.fp16_run)

    texts = []
    try:
        f = open(args.input, 'r')
        texts = f.readlines()
    except:
        print("Could not read file. Using default text.")
        texts = ["The forms of printed letters should be beautiful, and\
        that their arrangement on the page should be reasonable and\
        a help to the shapeliness of the letters themselves."]

    for i, text in enumerate(texts):

        LOGGER.iteration_start()

        sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
        sequence = torch.autograd.Variable(
            torch.from_numpy(sequence)).cuda().long()

        if args.tacotron2:
            tacotron2_t0 = time.time()
            with torch.no_grad():
                _, mel, _, _ = tacotron2.inference(sequence)
            tacotron2_t1 = time.time()
            tacotron2_infer_perf = sequence.size(1)/(tacotron2_t1-tacotron2_t0)
            LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf)

        waveglow_t0 = time.time()
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=args.sigma_infer)
            audio = audio.float()
        waveglow_t1 = time.time()
        waveglow_infer_perf = audio[0].size(0)/(waveglow_t1-waveglow_t0)

        audio_path = args.output + "audio_"+str(i)+".wav"
        write(audio_path, args.sampling_rate, audio[0].data.cpu().numpy())

        LOGGER.log(key="waveglow_items_per_sec", value=waveglow_infer_perf)
        LOGGER.iteration_stop()

    LOGGER.finish()
Example #30
    def after_run(self, run_context, run_values):

        batch_time = time.time() - self._t0
        imgs_per_sec = int(self._global_batch_size / batch_time)

        is_log_step = self._current_step % self._log_every == 0

        if is_log_step:

            if self._current_step > self._warmup_steps:
                imgs_per_sec = float(
                    ProfilerHook.moving_average(self._processing_speed_arr,
                                                n=30)[-1])

            LOGGER.log("iteration", int(self._current_step))
            LOGGER.log("total_ips", float(imgs_per_sec))

            if self._is_training:
                LOGGER.log("weight_decay",
                           float(run_values.results["weight_decay"]))
                LOGGER.log("reconstruction_loss",
                           float(run_values.results["reconstruction_loss"]))
                LOGGER.log("total_loss",
                           float(run_values.results["total_loss"]))
                LOGGER.log("learning_rate",
                           float(run_values.results["learning_rate"]))

            for key, val in sorted(run_values.results["iou_scores"].items(),
                                   key=operator.itemgetter(0)):
                LOGGER.log("iou_score - THS %s" % key, float(val))

            LOGGER.log("True Positives:",
                       run_values.results["confusion_matrix"]["tp"])
            LOGGER.log("True Negatives:",
                       run_values.results["confusion_matrix"]["tn"])
            LOGGER.log("False Positives:",
                       run_values.results["confusion_matrix"]["fp"])
            LOGGER.log("False Negatives:",
                       run_values.results["confusion_matrix"]["fn"])

            if self._sample_dir is not None and self._is_training:

                for key in sorted(run_values.results["samples"].keys(),
                                  key=operator.itemgetter(0)):

                    with open(
                            os.path.join(
                                self._sample_dir,
                                "sample_step_%04d_ths_%s.jpeg" %
                                (self._current_step, key)), 'wb') as fd:
                        fd.write(run_values.results["samples"][key])

                    with open(
                            os.path.join(
                                self._sample_dir,
                                "sample_step_%04d_mask.jpeg" %
                                self._current_step), 'wb') as fd:
                        fd.write(run_values.results["samples"]["mask"])

            print("######### STOP: %d ##############" % self._current_step)

        elif self._current_step > self._warmup_steps:
            # Do not store speed for log step due to additional fetches
            self._processing_speed_arr.append(imgs_per_sec)
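
The after_run methods above all rely on a step timestamp and counter set up elsewhere in the hook; a minimal sketch of that timing half (an assumption, not the original class, presumably paired with the LOGGER.iteration_start()/iteration_stop() calls seen earlier):

import time

import tensorflow as tf  # TF1.x


class ThroughputHookSketch(tf.train.SessionRunHook):
    """Sketch only: records the per-step start time that after_run turns into imgs/sec."""

    def __init__(self, global_batch_size, warmup_steps=20):
        self.global_batch_size = global_batch_size
        self.warmup_steps = warmup_steps
        self.current_step = 0
        self.t0 = None

    def before_run(self, run_context):
        # after_run divides global_batch_size by (time.time() - self.t0) to get throughput.
        self.t0 = time.time()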