Example #1
  def load(self, checkpoint_path, fast=True):
    # Presets
    if hparams.preset is not None and hparams.preset != "":
        preset = hparams.presets[hparams.preset]
        import json
        hparams.parse_json(json.dumps(preset))
        print("Override hyper parameters with preset \"{}\": {}".format(
            hparams.preset, json.dumps(preset, indent=4)))

    self._frontend = getattr(frontend, hparams.frontend)
    import train
    train._frontend = self._frontend
    from train import build_model

    # Model
    self.model = build_model()

    # Load checkpoint (map to CPU storage so CPU-only machines can read
    # CUDA-saved weights; the model is moved back to GPU below)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    self.model.load_state_dict(checkpoint["state_dict"])
    #model.seq2seq.decoder.max_decoder_steps = max_decoder_steps

    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.model = self.model.cuda()
    self.model.eval()
    if fast:
        self.model.make_generation_fast_()
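All of these examples revolve around the same `hparams.parse_json` / `hparams.parse` pair. As a point of reference, here is a minimal stand-in sketch of that interface (the real object comes from `tensorflow.contrib.training.HParams` or a compatible shim; this simplified version only illustrates the override semantics assumed above):

import json

class MiniHParams:
    """Illustrative stand-in for the HParams object used in these examples."""

    def __init__(self, **defaults):
        self.__dict__.update(defaults)

    def parse_json(self, json_string):
        # Apply a JSON object string on top of the current values.
        for key, value in json.loads(json_string).items():
            setattr(self, key, value)
        return self

    def parse(self, csv_string):
        # Apply "name=value,name=value" overrides, coercing each value
        # to the type of the existing default.
        if csv_string:
            for pair in csv_string.split(","):
                key, value = pair.split("=", 1)
                setattr(self, key, type(getattr(self, key))(value))
        return self

hp = MiniHParams(name="wavenet_vocoder", sample_rate=22050)
hp.parse_json('{"sample_rate": 16000}')
hp.parse("sample_rate=24000")
print(hp.sample_rate)  # 24000 -- the last override wins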
Example #2
def main():
    parser = argparse.ArgumentParser(description='Train FFTNet')
    parser.add_argument('--base_dir', default='')
    parser.add_argument(
        '--hparams',
        default='',
        help='Hyper parameter overrides as a comma-separated list of '
        'name=value pairs')
    parser.add_argument('--train_file', default='training_data/train.txt')
    parser.add_argument('--val_file', default='training_data/val.txt')
    parser.add_argument('--name', help='Name of logging directory.')
    parser.add_argument('--model', default='fftnet')
    parser.add_argument('--preset',
                        default=None,
                        type=str,
                        help='the preset config json file')
    parser.add_argument('--output_dir',
                        default='output/',
                        help='folder to contain synthesized mel spectrograms')

    parser.add_argument('--restore_step',
                        default=None,
                        type=int,
                        help='the restore step')

    parser.add_argument('--summary_interval',
                        type=int,
                        default=200,
                        help='Steps between running summary ops')
    parser.add_argument('--summary_val_interval',
                        type=int,
                        default=10,
                        help='Steps between running validation summary ops')
    parser.add_argument('--eval_interval',
                        type=int,
                        default=100,
                        help='Steps between train eval ops')
    parser.add_argument('--checkpoint_interval',
                        type=int,
                        default=2000,
                        help='Steps between writing checkpoints')
    parser.add_argument('--epochs',
                        type=int,
                        default=2000,
                        help='Total number of training epochs')
    parser.add_argument('--tf_log_level',
                        type=int,
                        default=2,
                        help='TensorFlow C++ log level.')
    args = parser.parse_args()

    # Load the preset config so you don't need to edit hparams by hand
    if args.preset is not None:
        with open(args.preset) as f:
            hparams.parse_json(f.read())

    log_dir, hp = prepare_run(args)
    train(log_dir, args, hp)
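A typical invocation of this trainer, for reference (paths and values are illustrative, not taken from the source); the preset JSON is applied first, and the --hparams pairs then override individual values:

# python train.py --base_dir=. --model=fftnet \
#     --preset=presets/fftnet.json \
#     --hparams="batch_size=8,epochs=500" \
#     --name=fftnet_run1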
Example #3
def load_hparams_from_preset(preset):
    hparams_json_string = ""
    with open(preset) as f:
        for line in f:
            # JSON has no comment syntax, so skip "//"-prefixed lines
            if line.strip().startswith("//"):
                continue
            hparams_json_string += line
        wavenet_hparams.parse_json(hparams_json_string)
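Because JSON itself has no comment syntax, the loader above strips `//`-prefixed lines before parsing. A usage sketch under that assumption (the file name and keys are made up for illustration, and the example's `wavenet_hparams` object is assumed to be in scope):

preset_text = """\
// WaveNet preset -- this comment line is stripped by the loader
{
    "sample_rate": 22050,
    "quantize_channels": 256
}
"""
with open("wavenet_preset.json", "w") as f:
    f.write(preset_text)

# Only full lines starting with "//" are skipped; a trailing comment
# after a value would still break json parsing.
load_hparams_from_preset("wavenet_preset.json")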
Example #4
def main():
    args = docopt(__doc__)
    print("Command line args:\n", args)
    checkpoint_dir = args["--checkpoint-dir"]
    source_data_root = args["--source-data-root"]
    target_data_root = args["--target-data-root"]
    selected_list_dir = args["--selected-list-dir"]
    use_multi_gpu = args["--multi-gpus"]

    if args["--hparam-json-file"]:
        with open(args["--hparam-json-file"]) as f:
            json = "".join(f.readlines())
            hparams.parse_json(json)

    hparams.parse(args["--hparams"])

    training_list = list(load_key_list("train.csv", selected_list_dir))
    validation_list = list(load_key_list("validation.csv", selected_list_dir))

    training_source_files = [
        os.path.join(source_data_root,
                     f"{key}.{hparams.source_file_extension}")
        for key in training_list
    ]
    training_target_files = [
        os.path.join(target_data_root,
                     f"{key}.{hparams.target_file_extension}")
        for key in training_list
    ]
    validation_source_files = [
        os.path.join(source_data_root,
                     f"{key}.{hparams.source_file_extension}")
        for key in validation_list
    ]
    validation_target_files = [
        os.path.join(target_data_root,
                     f"{key}.{hparams.target_file_extension}")
        for key in validation_list
    ]

    print("training source", len(training_source_files))
    print("training target", len(training_target_files))

    log = logging.getLogger("tensorflow")
    log.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh = logging.FileHandler(hparams.logfile)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    log.addHandler(fh)
    tf.logging.set_verbosity(tf.logging.INFO)

    tf.logging.info(hparams_debug_string())

    train_and_evaluate(hparams, checkpoint_dir, training_source_files,
                       training_target_files, validation_source_files,
                       validation_target_files, use_multi_gpu)
Example #5
def main():
    args = get_args()
    if args.preset is not None:
        with open(args.preset) as f:
            hparams.parse_json(f.read())

    modified_hp = hparams.parse(args.hparams)
    print(hparams_debug_string())
    synthesis(args.checkpoint_path, args.local_path, args.global_id,
              args.output_dir, modified_hp)
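Note the order of application in examples like this one: defaults first, then the preset file, then the --hparams string. A dict-based sketch of that precedence (values illustrative); unlike `hparams.parse`, this naive version does no type coercion, so the CLI value stays a string:

import json

defaults = {"sample_rate": 22050, "batch_size": 32}
preset = json.loads('{"sample_rate": 16000}')                     # --preset
cli = dict(kv.split("=", 1) for kv in "batch_size=8".split(","))  # --hparams

merged = {**defaults, **preset, **cli}
print(merged)  # {'sample_rate': 16000, 'batch_size': '8'}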
Example #6
def prepare_run(args):
    if args.hparams_fp is not None:
        with open(args.hparams_fp, 'r') as f:
            modified_hp = hparams.parse_json(f.read())
    else:
        modified_hp = hparams.parse(args.hparams)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)
    run_name = args.name or args.model
    log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name))
    os.makedirs(log_dir, exist_ok=True)
    infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name,
                 args.slack_url)
    return log_dir, modified_hp
Example #7
def main():
    args = docopt(__doc__)
    print("Command line args:\n", args)
    checkpoint_dir = args["--checkpoint-dir"]
    checkpoint_path = args["--checkpoint"]
    source_data_root = args["--source-data-root"]
    target_data_root = args["--target-data-root"]
    selected_list_dir = args["--selected-list-dir"]
    output_dir = args["--output-dir"]
    selected_list_filename = args["--selected-list-filename"] or "test.csv"

    tf.logging.set_verbosity(tf.logging.INFO)

    if args["--hparam-json-file"]:
        with open(args["--hparam-json-file"]) as f:
            json = "".join(f.readlines())
            hparams.parse_json(json)

    hparams.parse(args["--hparams"])
    tf.logging.info(hparams_debug_string())

    tf.logging.info(
        f"A selected list file to use: {os.path.join(selected_list_dir, selected_list_filename)}"
    )

    test_list = list(load_key_list(selected_list_filename, selected_list_dir))

    test_source_files = [
        os.path.join(source_data_root,
                     f"{key}.{hparams.source_file_extension}")
        for key in test_list
    ]
    test_target_files = [
        os.path.join(target_data_root,
                     f"{key}.{hparams.target_file_extension}")
        for key in test_list
    ]

    predict(hparams, checkpoint_dir, checkpoint_path, output_dir,
            test_source_files, test_target_files)
Example #8
def get_model(checkpoint_path):
    max_decoder_steps = 500

    assert hparams.name == "deepvoice3"

    preset = join(dirname(__file__), 'presets/nyanko_ljspeech.json')
    with open(preset) as f:
        hparams.parse_json(f.read())

    global _frontend
    _frontend = getattr(frontend, hparams.frontend)

    train._frontend = _frontend
    # Model
    model = build_model()

    checkpoint = _load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])
    checkpoint_name = splitext(basename(checkpoint_path))[0]

    model.seq2seq.decoder.max_decoder_steps = max_decoder_steps

    return model
Example #9
        out0 = np.clip(out0, -1, 1)
        sf.write(join(writing_dir, "out0_{}.wav".format(sigma)), out0,
                 hparams.sample_rate)

        out1 = inv_linear_quantize(x1[0].detach().cpu().numpy(),
                                   hparams.quantize_channels - 1)
        out1 = np.clip(out1, -1, 1)
        sf.write(join(writing_dir, "out1_{}.wav".format(sigma)), out1,
                 hparams.sample_rate)


if __name__ == "__main__":
    args = docopt(__doc__)

    # Load preset if specified
    if args["--preset"] is not None:
        with open(args["--preset"]) as f:
            hparams.parse_json(f.read())
    else:
        hparams_json = join(dirname(args["<checkpoint1>"]), "hparams.json")
        if exists(hparams_json):
            print("Loading hparams from {}".format(hparams_json))
            with open(hparams_json) as f:
                hparams.parse_json(f.read())

    # Override hyper parameters
    hparams.parse(args["--hparams"])
    assert hparams.name == "wavenet_vocoder"

    main(args)
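The `inv_linear_quantize` call in the snippet above maps integer class indices back to floats before the clip-and-write step. A sketch of what such an inverse is assumed to do (the exact formula in the source repo may differ slightly):

import numpy as np

def inv_linear_quantize(x, mu):
    # Assumed inverse of linear quantization: map {0, ..., mu} back
    # onto [-1, 1]; clipping afterwards guards against rounding drift.
    return x.astype(np.float32) / mu * 2.0 - 1.0

print(inv_linear_quantize(np.array([0, 127, 255]), 255))
# -> [-1.         -0.00392157  1.        ]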
Example #10
from pyspark import SparkContext
from docopt import docopt
from hparams import hparams, hparams_debug_string
from preprocess.vctk import VCTK

if __name__ == "__main__":
    args = docopt(__doc__)
    in_dir = args["<in_dir>"]
    out_dir = args["<out_dir>"]
    source_only = args["--source-only"]
    target_only = args["--target-only"]

    if args["--hparam-json-file"]:
        with open(args["--hparam-json-file"]) as f:
            hparams_json = "".join(f.readlines())
            hparams.parse_json(hparams_json)

    hparams.parse(args["--hparams"])
    print(hparams_debug_string())

    if source_only:
        process_source = True
        process_target = False
    elif target_only:
        process_source = False
        process_target = True
    else:
        process_source = True
        process_target = True

    instance = VCTK(in_dir, out_dir, hparams)
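Assuming --source-only and --target-only are mutually exclusive (docopt enforces this when the usage pattern declares them as alternatives), the flag ladder above collapses to two lines:

# Equivalent, assuming the two flags are never both set:
process_source = not target_only
process_target = not source_only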
Example #11
    if data_root is None:
        data_root = join(dirname(__file__), "data", "ljspeech")

    log_event_path = args["--log-event-path"]
    reset_optimizer = args["--reset-optimizer"]

    # Override hyper parameters
    hparams.parse(args["--hparams"])
    print(hparams_debug_string())
    assert hparams.name == "wavenet_vocoder"

    # Presets
    if hparams.preset is not None and hparams.preset != "":
        preset = hparams.presets[hparams.preset]
        import json
        hparams.parse_json(json.dumps(preset))
        print("Override hyper parameters with preset \"{}\": {}".format(
            hparams.preset, json.dumps(preset, indent=4)))

    os.makedirs(checkpoint_dir, exist_ok=True)

    # Dataloader setup
    data_loaders = get_data_loaders(data_root, speaker_id, test_shuffle=True)

    # Model
    model = build_model()
    print(model)
    if use_cuda:
        model = model.cuda()

    receptive_field = model.receptive_field
Example #12
from pyspark import SparkContext
from docopt import docopt
from hparams import hparams, hparams_debug_string
from preprocess.vctk import VCTK

if __name__ == "__main__":
    args = docopt(__doc__)
    in_dir = args["<in_dir>"]
    out_dir = args["<out_dir>"]
    source_only = args["--source-only"]
    target_only = args["--target-only"]

    if args["--hparam-json-file"]:
        with open(args["--hparam-json-file"]) as f:
            json = "".join(f.readlines())
            hparams.parse_json(json)

    hparams.parse(args["--hparams"])
    print(hparams_debug_string())

    if source_only:
        process_source = True
        process_target = False
    elif target_only:
        process_source = False
        process_target = True
    else:
        process_source = True
        process_target = True

    instance = VCTK(in_dir, out_dir, hparams)
Example #13
def synthesis(checkpoint_path, preset, dst_dir, srt_path, face_path):
    global _frontend
    checkpoint_seq2seq_path = None
    checkpoint_postnet_path = None
    max_decoder_steps = 500
    file_name_suffix = ""
    replace_pronunciation_prob = 0.0

    # Load preset if specified
    if preset is not None:
        with open(preset) as f:
            hparams.parse_json(f.read())
    # Override hyper parameters
    hparams.parse("")
    assert hparams.name == "deepvoice3"

    _frontend = getattr(frontend, hparams.frontend)
    print(_frontend)
    import train
    train._frontend = _frontend
    from train import plot_alignment, build_model

    # Model
    model = build_model()

    # Load checkpoints separately
    if checkpoint_postnet_path is not None and checkpoint_seq2seq_path is not None:
        checkpoint = _load(checkpoint_seq2seq_path)
        model.seq2seq.load_state_dict(checkpoint["state_dict"])
        checkpoint = _load(checkpoint_postnet_path)
        model.postnet.load_state_dict(checkpoint["state_dict"])
        checkpoint_name = splitext(basename(checkpoint_seq2seq_path))[0]
    else:
        checkpoint = _load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])
        checkpoint_name = splitext(basename(checkpoint_path))[0]

    model.seq2seq.decoder.max_decoder_steps = max_decoder_steps

    os.makedirs(dst_dir, exist_ok=True)

    task = load_srt(srt_path, face_path)
    for idx, item in enumerate(task):
        speaker_id = item[3]
        text = item[4]

        words = nltk.word_tokenize(text)
        file_name = "{} speaker_{} {}-{}".format(idx, speaker_id, item[1],
                                                 item[2])
        print(text)
        waveform, alignment, _, _ = tts(model,
                                        text,
                                        p=replace_pronunciation_prob,
                                        speaker_id=speaker_id,
                                        fast=True)
        dst_wav_path = join(dst_dir, "{}.wav".format(file_name))
        dst_alignment_path = join(dst_dir,
                                  "{}_alignment.png".format(file_name))
        plot_alignment(alignment.T,
                       dst_alignment_path,
                       info="{}, {}".format(hparams.builder,
                                            basename(checkpoint_path)))
        audio.save_wav(waveform, dst_wav_path)
        print(
            idx, ": {}\n ({} chars, {} words)".format(text, len(text),
                                                      len(words)))

    print(
        "Finished! Check out {} for generated audio samples.".format(dst_dir))
Example #14
    name = args["<name>"]
    in_dir = args["<in_dir>"]
    out_dir = args["<out_dir>"]
    num_workers = args["--num_workers"]
    num_workers = cpu_count() // 2 if num_workers is None else int(num_workers)
    preset = args["--preset"]

    # Load preset if specified
    if preset is not None:
        hparams_json_string = ""
        with open(preset) as f:
            for line in f:
                if line.strip().startswith("//"):
                    continue
                hparams_json_string += line
            hparams.parse_json(hparams_json_string)
    # Override hyper parameters
    hparams.parse(args["--hparams"])
    assert hparams.name == "wavenet_vocoder"

    print("Using name: '%s'" % name)
    print("Sampling frequency: {}".format(hparams.sample_rate))
    if name in ["cmu_arctic", "jsut", "librivox"]:
        print("""warn!: {} is no longer explicitly supported!

Please use a generic dataest 'wavallin' instead.
All you need to do is to put all wav files in a single directory.""".format(
            name))
        sys.exit(1)

    if name == "ljspeech":
Example #15
def main():
    args = docopt(__doc__)
    print("Command line args:\n", args)
    run_name = args["--run-name"]  # dataset root
    device = args["--device"]
    phase = args["--phase"]  # train or synthesis
    data_root = args["--data-root"]  # dataset root
    checkpoint_name = args["--checkpoint-name"]
    speaker_id = args["--speaker-id"]
    log_event_path = args["--log-event-path"]
    reset_optimizer = args["--reset-optimizer"]
    text_list_file_path = args["--text-list-file"]

    preset = args["--preset"]

    speaker_id = int(speaker_id) if speaker_id is not None else None

    if run_name is None:
        run_name = "Tacotron2" + time_string()
    log_dir = prepare_run(run_name)

    if data_root is None:
        data_root = os.path.join(dirname(__file__), "data", "mandarin")

    # Load preset if specified
    if preset is not None:
        with open(preset) as f:
            hparams.parse_json(f.read())
    # Override hyper parameters
    hparams.parse(args["--hparams"])

    assert hparams.builder == "Tacotron2"

    if device is not None:
        hparams.device = device

    print(hparams_debug_string())

    train_path = os.path.join(log_dir, "train")
    val_path = os.path.join(log_dir, "val")
    checkpoint_path = os.path.join(log_dir, "pretrained")
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(val_path, exist_ok=True)
    os.makedirs(checkpoint_path, exist_ok=True)

    best_loss = float("inf")  # lower is better, so start at +inf
    global global_epoch
    global_epoch = 0
    global global_step
    global_step = 0

    if hparams.seed is not None:
        random.seed(hparams.seed)
        torch.manual_seed(hparams.seed)
        cudnn.deterministic = hparams.cudnn_deterministic
        cudnn.benchmark = hparams.cudnn_benchmark
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

        log("The system set the random number to:{}".format(hparams.seed))

    if hparams.device > -1:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    distributed = hparams.world_size > 1
    if distributed:
        dist.init_process_group(backend=hparams.dist_backend,
                                init_method=hparams.dist_url,
                                world_size=hparams.world_size)
    model = build_model()
    print(model)

    if hparams.device > -1:
        model = model.cuda(hparams.device)
    elif distributed:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    mels_criterion = MaskedMSELoss()
    stop_criterion = MaskedBCELoss()

    optimizer = torch.optim.Adam(model.get_trainable_parameters(),
                                 lr=hparams.init_learning_rate,
                                 betas=(hparams.adam_beta1,
                                        hparams.adam_beta2),
                                 eps=hparams.adam_epsilon,
                                 weight_decay=hparams.weight_decay)
    scheduler = ExpLRDecay(init_learning_rate=hparams.init_learning_rate,
                           decay_rate=hparams.decay_rate,
                           start_step=hparams.start_decay,
                           decay_steps=hparams.decay_step)

    # optionally resume from a checkpoint
    if checkpoint_name is not None:
        if os.path.isfile(checkpoint_name):
            load_checkpoint(checkpoint_name, model, hparams.device, optimizer,
                            reset_optimizer)
        else:
            file_full_path = os.path.join(checkpoint_path, checkpoint_name)
            if os.path.isfile(file_full_path):
                load_checkpoint(file_full_path, model, hparams.device,
                                optimizer, reset_optimizer)
            else:
                log("=> no checkpoint found at '{}'".format(checkpoint_name))

    # synthesis
    if phase == "synthesis":
        if text_list_file_path is None:
            test_lines = [
                "yun2cong2ke1ji4cheng2li4yu2er4ling2yi1wu3nian2si4yue4",
                "shi4yi1jia1fu1hua4yu2zhong1guo2ke1xue2yuan4chong2qing4yan2jiu1yuan4de0gao1ke1ji4qi3ye4"
                "zhuan1zhu4yu2ji4suan4ji1shi4jue2yu3ren2gong1zhi4neng2",
                "yi2ge4hao3zheng4quan2zhi1de2yi3bao3chi2da4bu4fen4zai4yu2bu4tong2de0zheng4jian4",
                "he2li3de0fa1hui1qi2gong1yong4"
            ]
        else:
            test_lines = []
            with open(text_list_file_path, "rb") as f:
                lines = f.readlines()
                for line in lines:
                    text = line.decode("utf-8")[:-1]
                    test_lines.append(text)
        synthesis(test_lines, model, device, log_dir)
        return

    # Setup summary writer for tensorboard
    if log_event_path is None:
        log_event_path = os.path.join(log_dir, "log_event_path")
    print("Los event path: {}".format(log_event_path))
    writer = SummaryWriter(log_dir=log_event_path)

    # Prepare dataset
    dataset_dir = os.path.join(dirname(__file__), data_root)
    texts_list, mels_list, mels_length_list, speaker_ids_list = get_item_list(
        dataset_dir, "train.txt")

    #indices = np.arange(256*16)
    indices = np.arange(len(texts_list) - len(texts_list) % hparams.batch_size)
    test_size = hparams.test_batches * hparams.batch_size
    train_indices, val_indices = train_test_split(indices,
                                                  test_size=test_size,
                                                  random_state=hparams.seed)
    collate_fn = AudioCollate(padding_mels=hparams.padding_mels)

    # prepare train dataset
    train_dataset_text_ids = [texts_list[i] for i in train_indices]
    train_dataset_mels_ids = [mels_list[i] for i in train_indices]
    train_dataset_mels_length_ids = [
        mels_length_list[i] for i in train_indices
    ]
    if speaker_ids_list is not None:
        train_dataset_speaker_ids = [
            speaker_ids_list[i] for i in train_indices
        ]
    else:
        train_dataset_speaker_ids = None
    train_dataset = AudiobookDataset(train_dataset_text_ids,
                                     train_dataset_mels_ids,
                                     train_dataset_speaker_ids, dataset_dir)

    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        # the DistributedSampler handles shuffling, so pass it as the
        # sampler and keep shuffle=False
        train_loader = DataLoader(train_dataset,
                                  collate_fn=collate_fn,
                                  batch_size=hparams.batch_size,
                                  sampler=train_sampler,
                                  num_workers=2,
                                  shuffle=False,
                                  pin_memory=hparams.pin_memory)
    else:
        if hparams.dynamical_batch_size:
            train_sampler = DynamicalSimilarTimeLengthSampler(
                train_dataset_mels_length_ids,
                batch_size_min=hparams.batch_size,
                batch_expand_level=hparams.batch_size_level,
                batch_group=hparams.batch_group,
                permutate=hparams.permutate)
            train_batch_sampler = DynamicalBatchSampler(train_sampler)
            # batch_sampler is mutually exclusive with batch_size,
            # shuffle and sampler in DataLoader
            train_loader = DataLoader(train_dataset,
                                      collate_fn=collate_fn,
                                      batch_sampler=train_batch_sampler,
                                      num_workers=2,
                                      pin_memory=True)
        else:
            train_sampler = SimilarTimeLengthSampler(
                train_dataset_mels_length_ids,
                descending=True,
                batch_size=hparams.batch_size,
                batch_group_size=hparams.batch_group_size,
                permutate=hparams.permutate)
            train_loader = DataLoader(train_dataset,
                                      collate_fn=collate_fn,
                                      batch_size=hparams.batch_size,
                                      sampler=train_sampler,
                                      num_workers=2,
                                      shuffle=False,
                                      pin_memory=True)

    # prepare val dataset
    val_dataset_text_ids = [texts_list[i] for i in val_indices]
    val_dataset_mels_ids = [mels_list[i] for i in val_indices]
    val_dataset_mels_length_ids = [mels_length_list[i] for i in val_indices]
    if speaker_ids_list is not None:
        val_dataset_speaker_ids = [speaker_ids_list[i] for i in val_indices]
    else:
        val_dataset_speaker_ids = None

    val_dataset = AudiobookDataset(val_dataset_text_ids, val_dataset_mels_ids,
                                   val_dataset_speaker_ids, dataset_dir)
    val_loader = DataLoader(val_dataset,
                            collate_fn=collate_fn,
                            batch_size=hparams.batch_size,
                            num_workers=2,
                            shuffle=True,
                            pin_memory=True)

    for epoch in range(global_epoch, hparams.nepochs):
        # train for one epoch
        train(train_loader, model, hparams.device, mels_criterion,
              stop_criterion, optimizer, scheduler, writer, train_path)

        # evaluate on validation set
        loss = validate(val_loader, model, hparams.device, mels_criterion,
                        stop_criterion, writer, val_path)

        # remember best prec@1 and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        save_checkpoint(model, optimizer, checkpoint_path)
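One detail worth calling out in the loop above: because lower validation loss is better, `best_loss` has to start at +inf (as fixed above), otherwise no epoch ever registers as an improvement. A minimal sketch of the tracking logic:

best_loss = float("inf")
for loss in [2.3, 1.9, 2.1]:
    is_best = loss < best_loss
    best_loss = min(loss, best_loss)
    print(loss, is_best)  # 2.3 True / 1.9 True / 2.1 False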
Example #16
    conditional_path = args["--conditional"]
    # From https://github.com/Rayhane-mamah/Tacotron-2
    symmetric_mels = args["--symmetric-mels"]
    max_abs_value = float(args["--max-abs-value"])

    file_name_suffix = args["--file-name-suffix"]

    output_html = args["--output-html"]
    speaker_id = args["--speaker-id"]
    speaker_id = None if speaker_id is None else int(speaker_id)
    preset = args["--preset"]

    # Load preset if specified
    if preset is not None:
        with open(preset) as f:
            hparams.parse_json(f.read())
    # Override hyper parameters
    hparams.parse(args["--hparams"])
    assert hparams.name == "wavenet_vocoder"

    # Load conditional features
    if conditional_path is not None:
        c = np.load(conditional_path)
        if c.shape[1] != hparams.num_mels:
            c = np.swapaxes(c, 0, 1)  # swapaxes returns a view; assign it
        if max_abs_value > 0:
            min_, max_ = 0, max_abs_value
            if symmetric_mels:
                min_ = -max_
            print("Normalize features to desired range [0, 1] from [{}, {}]".
                  format(min_, max_))
Example #17
def wavsynthesis():
    args = docopt(__doc__)
    print("Command line args:\n", args)
    checkpoint_path = args["<checkpoint>"]
    dst_dir = args["<dst_dir>"]

    length = int(args["--length"])
    initial_value = args["--initial-value"]
    initial_value = None if initial_value is None else float(initial_value)
    conditional_path = args["--conditional"]

    file_name_suffix = args["--file-name-suffix"]
    output_html = args["--output-html"]
    speaker_id = args["--speaker-id"]
    speaker_id = None if speaker_id is None else int(speaker_id)
    preset = args["--preset"]

    # Force CPU synthesis mode if required; otherwise use CUDA when available
    if args["--force-cpu"]:
        use_cuda = False
        device = torch.device("cpu")
    else:
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")

    # Load preset if specified
    if preset is not None:
        with open(preset) as f:
            hparams.parse_json(f.read())
    # Override hyper parameters
    hparams.parse(args["--hparams"])
    assert hparams.name == "wavenet_vocoder"

    # Load conditional features
    if conditional_path is not None:
        c = np.load(conditional_path)
        if c.shape[1] != hparams.num_mels:
            c = np.swapaxes(c, 0, 1)
    else:
        c = None

    from train import build_model

    # Model
    model = build_model().to(device)

    # Load checkpoint
    print("Load checkpoint from {}".format(checkpoint_path))
    if use_cuda:
        checkpoint = torch.load(checkpoint_path)
    else:
        checkpoint = torch.load(checkpoint_path,
                                map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])
    checkpoint_name = splitext(basename(checkpoint_path))[0]

    os.makedirs(dst_dir, exist_ok=True)
    if not file_name_suffix:
        file_name_suffix = splitext(basename(conditional_path))[0]

    # Prepare mel spectrogram condition
    C = FileSourceDataset(SingleFileDataSource(conditional_path))
    data_loader = data_utils.DataLoader(C,
                                        batch_size=hparams.batch_size,
                                        drop_last=False,
                                        num_workers=hparams.num_workers,
                                        sampler=None,
                                        shuffle=False,
                                        collate_fn=dummy_collate,
                                        pin_memory=hparams.pin_memory)

    cin_pad = hparams.cin_pad

    for idx, (x, y, c, g, input_lengths) in enumerate(data_loader):
        if cin_pad > 0:
            c = F.pad(c, pad=(cin_pad, cin_pad), mode="replicate")

        # B x 1 x T
        if x[0] is not None:
            B, _, T = x.shape
        else:
            B, _, Tn = c.shape
            T = Tn * audio.get_hop_size()

        # DO generate
        y_hats = batch_wavegen(model, c=c, g=g, fast=True, tqdm=tqdm)

        for i, (gen, length) in enumerate(zip(y_hats, input_lengths)):
            gen = gen[:length]
            gen = np.clip(gen, -1.0, 1.0)

            # index the path so successive samples don't overwrite each other
            dst_wav_path = join(
                dst_dir, "{}_{}_{}.wav".format(file_name_suffix, idx, i))
            wavfile.write(dst_wav_path, hparams.sample_rate, to_int16(gen))

    print(
        "Finished! Check out {} for generated audio samples.".format(dst_dir))
    sys.exit(0)
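The --force-cpu fix above follows a pattern worth keeping handy: always define `use_cuda` and `device` on every path, then load checkpoints through `map_location` so GPU-saved weights open on CPU-only hosts. A compact sketch (the checkpoint path is illustrative):

import torch

force_cpu = False  # stand-in for args["--force-cpu"]
use_cuda = torch.cuda.is_available() and not force_cpu
device = torch.device("cuda" if use_cuda else "cpu")

# map_location makes CUDA-saved checkpoints loadable anywhere
checkpoint = torch.load("checkpoint.pth", map_location=device)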
Example #18
def load_model(name: str, device="cpu"):
    if name.lower() == 'uniglow':
        from nemo.collections.tts.models import UniGlowModel
        return UniGlowModel.from_pretrained(model_name="tts_uniglow",
                                            map_location=device)
    elif name.lower() == 'tacotron':
        import nemo.collections.tts as nemo_tts
        return nemo_tts.models.Tacotron2Model.from_pretrained(
            model_name="Tacotron2-22050Hz", map_location=device)
    elif name.lower() == 'quartznet':
        import nemo.collections.asr as nemo_asr
        return nemo_asr.models.EncDecCTCModel.from_pretrained(
            model_name="QuartzNet15x5Base-En", map_location=device)
    elif name.lower() == 'speakerverification_speakernet':
        import nemo.collections.asr as nemo_asr
        stt = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
            model_name="speakerverification_speakernet", map_location=device)
        return stt
    elif name.lower() == 'speakerrecognition_speakernet':
        import nemo.collections.asr as nemo_asr
        stt = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
            model_name="speakerrecognition_speakernet", map_location=device)
        return stt
    elif name.lower() == 'jasper':
        import nemo.collections.asr as nemo_asr
        stt = nemo_asr.models.ASRModel.from_pretrained(
            model_name="stt_en_jasper10x5dr")
        return stt
    elif name.lower() == 'quartznet_de':
        import nemo.collections.asr as nemo_asr
        stt = nemo_asr.models.ASRModel.from_pretrained(
            model_name="stt_de_quartznet15x5")
        return stt
    elif name.lower() == 'deepspeech2':
        from .deep_speech import DeepSpeechEncoderWrapper
        if os.path.isfile('weights/an4_pretrained_v2.pth'):
            return DeepSpeechEncoderWrapper("weights/an4_pretrained_v2.pth",
                                            device=device)
        else:
            if not os.path.exists("weights"):
                os.makedirs("weights")
            wget.download(
                "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/an4_pretrained_v2.pth",
                out="weights")
            return DeepSpeechEncoderWrapper("weights/an4_pretrained_v2.pth",
                                            device=device)
    elif name.lower() == 'wav2vec2':
        from .wav2vec2 import Wav2Vec2FullEncoder
        return Wav2Vec2FullEncoder(device)
    elif name.lower() == 'wav2vec2_conv':
        from .wav2vec2 import Wav2Vec2ConvEncoder
        return Wav2Vec2ConvEncoder(device)
    elif name.lower() == 'melgan':
        import torch
        import gdown
        os.makedirs("weights", exist_ok=True)
        # wget.download('https://github.com/descriptinc/melgan-neurips/archive/master.zip', out="weights")
        url = 'https://drive.google.com/uc?id=' + '1vNp5ZsfEBZQBXqsUOJZUYTkTedk6HZQS'
        gdown.download(url, 'weights/melgan-neurips-master.zip', quiet=True)
        os.system('unzip weights/melgan-neurips-master.zip -d weights/')
        vocoder = torch.hub.load('weights/melgan-neurips-master',
                                 'load_melgan',
                                 source='local')
        return vocoder
    elif name.lower() == 'waveglow':
        from .waveglow import Vocoder
        vocoder = Vocoder().to(device)
        return vocoder
    elif name.lower() == 'wavenet':
        wn_preset = "weights/20180510_mixture_lj_checkpoint_step000320000_ema.json"
        wn_checkpoint_path = "weights/20180510_mixture_lj_checkpoint_step000320000_ema.pth"

        if not os.path.exists(wn_preset):
            os.makedirs("weights", exist_ok=True)
            # wget.download(
            #     'https://www.dropbox.com/s/0vsd7973w20eskz/20180510_mixture_lj_checkpoint_step000320000_ema.json',
            #     out="weights"
            # )
            os.system(
                'curl -L "https://www.dropbox.com/s/0vsd7973w20eskz/20180510_mixture_lj_checkpoint_step000320000_ema.json" -o weights/20180510_mixture_lj_checkpoint_step000320000_ema.json'
            )
        if not os.path.exists(wn_checkpoint_path):
            os.makedirs("weights", exist_ok=True)
            # wget.download(
            #     'https://www.dropbox.com/s/zdbfprugbagfp2w/20180510_mixture_lj_checkpoint_step000320000_ema.pth',
            #     out="weights"
            # )
            os.system(
                'curl -L "https://www.dropbox.com/s/zdbfprugbagfp2w/20180510_mixture_lj_checkpoint_step000320000_ema.pth" -o weights/20180510_mixture_lj_checkpoint_step000320000_ema.pth'
            )

        from hparams import hparams
        with open(wn_preset) as f:
            hparams.parse_json(f.read())

        import sys
        sys.path.append('thirdparty/wavenet_vocoder')

        from train import build_model
        from synthesis import wavegen
        import torch

        model = build_model().to(device)

        print("Load checkpoint from {}".format(wn_checkpoint_path))
        checkpoint = torch.load(wn_checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint["state_dict"])

        return (hparams, model)

    elif name.lower() in ['hifigan', 'hifigan_v1', 'hifigan_v2', 'hifigan_v3']:
        import gdown

        name = name.lower()
        header = "https://drive.google.com/uc?id="

        if name in ['hifigan', 'hifigan_v1']:
            name = 'hifigan_v1'
            model_url = header + "1QEBKespXTmsMzsSRBXWdpIT0Ve7nnaRZ"
            config_url = header + "1l5EUVBKM0SK7ec4HWf_wZvEITAsdOLFC"
        elif name == 'hifigan_v2':
            model_url = header + "1I415g2Cdx5FWy6ECma0zEc9GhX_TnbFv"
            config_url = header + "11LnhSum3EAeo5zag-tpU8HKk0MdbrQxF"
        else:
            model_url = header + "1fnkOteyRdPq4Gh2cfso3gqqrC6inLWsF"
            config_url = header + "1mke75axgO2sdJ41GL2HTrcb4KyAl0i45"

        if not os.path.exists(f'pretrained/{name}'):
            os.makedirs(f'pretrained/{name}', exist_ok=True)

        model_output = f'pretrained/{name}/model.pth'
        config_output = f'pretrained/{name}/config.json'
        gdown.download(model_url, model_output, quiet=True)
        gdown.download(config_url, config_output, quiet=True)
        # NOTE: the original snippet fell through and returned None here;
        # returning the downloaded paths is an assumption so the caller
        # can build the generator from them.
        return model_output, config_output

    elif name.lower() == "wave2vec_mos":
        from .wav2vec2 import Wav2Vec2MOS
        import gdown
        if not os.path.isfile('weights/wave2vec2mos.pth'):
            if not os.path.exists("weights"):
                os.makedirs("weights")
            gdown.download(
                "https://drive.google.com/uc?id=18kMTxj2VbRDrs_CBcCmZT-kGTvFvbVmm",
                output="weights/wave2vec2mos.pth")
        return Wav2Vec2MOS('weights/wave2vec2mos.pth')
    else:
        raise NotImplementedError(name)
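As a design note, the long `elif` ladder in `load_model` can be collapsed into a registry of loader callables; a hedged sketch of that refactor (loader bodies elided, only the waveglow branch shown):

from typing import Callable, Dict

_LOADERS: Dict[str, Callable] = {}

def register(name: str):
    def deco(fn: Callable) -> Callable:
        _LOADERS[name.lower()] = fn
        return fn
    return deco

@register("waveglow")
def _load_waveglow(device="cpu"):
    from .waveglow import Vocoder  # as in the branch above
    return Vocoder().to(device)

def load_model(name: str, device="cpu"):
    try:
        return _LOADERS[name.lower()](device=device)
    except KeyError:
        raise NotImplementedError(name)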