word_num = section["word_num"]
letter_num = section["letter_num"]
print("Done!")

#%%
print("Loading results....")
word_results = np.load(results_dir / "word_stateseq.npz")
letter_results = np.load(results_dir / "letter_stateseq.npz")
duration_results = np.load(results_dir / "word_durations.npz")
keys = sorted(list(word_results.keys()))
train_iter = word_results[keys[0]].shape[0]

if args.speaker_id is not None:
    speaker, spkind_keys = separate_speaker(np.load(args.speaker_id))
    speaker_N = len(speaker)
    spkind_phn_labels = get_separated_values(phn_labels, spkind_keys)
    spkind_wrd_labels = get_separated_values(wrd_labels, spkind_keys)
    spkind_letter_results = get_separated_values(letter_results, spkind_keys)
    spkind_word_results = get_separated_values(word_results, spkind_keys)
    spkind_letter_ARI = np.zeros((speaker_N, train_iter))
    spkind_letter_macro_f1_score = np.zeros((speaker_N, train_iter))
    spkind_letter_micro_f1_score = np.zeros((speaker_N, train_iter))
    spkind_word_ARI = np.zeros((speaker_N, train_iter))
    spkind_word_macro_f1_score = np.zeros((speaker_N, train_iter))
    spkind_word_micro_f1_score = np.zeros((speaker_N, train_iter))
    spkind_letter_confusion_matrix = np.zeros(
        (speaker_N, train_iter, phn_label_N, letter_num), dtype=int)
    spkind_word_confusion_matrix = np.zeros(
        (speaker_N, train_iter, wrd_label_N, word_num), dtype=int)
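
# --- Hedged sketch (illustration only, not this repository's code): one way
# the per-speaker buffers above could be filled. It assumes
# gold_label_seqs[s] is a list of per-utterance frame-level gold label
# arrays and letter_state_seqs[s] a matching list of
# (train_iter, frames) decoded state arrays; sklearn is an assumed
# dependency. The rectangular gold-by-state confusion matrix is built
# with np.add.at because sklearn's confusion_matrix expects a shared
# label space.
from sklearn.metrics import adjusted_rand_score

def fill_letter_metrics_sketch(gold_label_seqs, letter_state_seqs,
                               ari_out, cm_out):
    for s in range(ari_out.shape[0]):
        gold = np.concatenate(gold_label_seqs[s]).astype(int)
        for it in range(ari_out.shape[1]):
            pred = np.concatenate(
                [seq[it] for seq in letter_state_seqs[s]]).astype(int)
            ari_out[s, it] = adjusted_rand_score(gold, pred)
            cm = np.zeros(cm_out.shape[2:], dtype=int)
            np.add.at(cm, (gold, pred), 1)  # count (gold, state) pairs
            cm_out[s, it] = cm

# Hypothetical usage with the buffers allocated above:
# fill_letter_metrics_sketch(spkind_phn_labels, spkind_letter_results,
#                            spkind_letter_ARI,
#                            spkind_letter_confusion_matrix)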
gen_path = args.generator or (args.snapshot_dir / args.snapshot_name).with_suffix(gen_suffix)
dis_path = args.discriminator or (args.snapshot_dir / args.snapshot_name).with_suffix(dis_suffix)
cls_path = args.classifier or (args.snapshot_dir / args.snapshot_name).with_suffix(cls_suffix)

# Set up model
num_mels = 36
zdim = 5
hdim = 32
cdim = 8
adim = 32

speakers, speaker_individual_keys = separate_speaker(np.load(args.speaker_id))
speaker_num = len(speakers)
identity = np.identity(speaker_num, dtype=np.float32)
spkind_mcep = get_separated_values(np.load(args.mcep), speaker_individual_keys)
spkind_f0 = get_separated_values(np.load(args.f0), speaker_individual_keys)
spkind_ap = get_separated_values(np.load(args.ap), speaker_individual_keys)
mcep_mean = np.load(args.mcep_norm_param[0])
mcep_std = np.load(args.mcep_norm_param[1])
logf0_mean = np.load(args.logf0_norm_param[0])
logf0_std = np.load(args.logf0_norm_param[1])

generator = generator_class(speaker_num)
adverserial_discriminator = discriminator_class(num_mels, speaker_num, adim)
serializers.load_npz(gen_path, generator)
serializers.load_npz(dis_path, adverserial_discriminator)

spkind_kmfa = [speaker_individual_keys, spkind_mcep, spkind_f0, spkind_ap]
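
# --- Hedged sketch (illustration only): the normalization these statistics
# are presumably used for. Mel-cepstra are z-scored per dimension, and F0 is
# mapped between speakers by matching log-F0 Gaussians, a standard trick in
# voice conversion. The *_sketch names are hypothetical, and the sketch
# assumes logf0_mean / logf0_std hold one scalar per speaker.
def normalize_mcep_sketch(mcep):
    # mcep: (frames, num_mels); broadcasts against per-dimension stats
    return (mcep - mcep_mean) / mcep_std

def convert_f0_sketch(f0, src_idx, tgt_idx):
    converted = np.zeros_like(f0)
    voiced = f0 > 0  # leave unvoiced frames (f0 == 0) untouched
    z = (np.log(f0[voiced]) - logf0_mean[src_idx]) / logf0_std[src_idx]
    converted[voiced] = np.exp(z * logf0_std[tgt_idx] + logf0_mean[tgt_idx])
    return converted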
parser.add_argument("--size", type=int, default=1) parser.add_argument("--mode", choices=["ML", "RND"], default="ML") parser.add_argument("--LM", choices=["LSTM", "Bigram", "Unigram"]) parser.add_argument("--unique", action="store_true") parser.add_argument("--LSTM_model", type=Path) args = parser.parse_args() speakers, spkind_keys = separate_speaker(np.load(args.speaker_id)) speaker_num = len(speakers) target_idx = speakers.index(args.target_speaker) src_letter_stateseq = get_separated_values(np.load(args.letter_stateseq), spkind_keys)[target_idx] src_f0 = get_separated_values(np.load(args.f0), spkind_keys)[target_idx] src_ap = get_separated_values(np.load(args.ap), spkind_keys)[target_idx] mcep_min = np.load(args.mcep_norm_param[0]) mcep_max = np.load(args.mcep_norm_param[1]) if args.sentences is None: if args.LM == "Unigram": snt_generator = Unigram_generator(args.sentences_file) elif args.LM == "Bigram": snt_generator = Bigram_generator(args.sentences_file, args.parameter) elif args.LM == "LSTM": snt_generator = LSTMLM_generator(args.LSTM_model, args.sentences_file) ap_generator = AP_generator(args.letter_num, src_ap,
def main():
    parser = argparse.ArgumentParser(
        description='Train StarGAN voice converter')
    parser.add_argument('--gpu', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument("--train_data", type=Path, required=True,
                        help="training data")
    parser.add_argument("--speaker_id", type=Path, required=True,
                        help="speaker_id file")
    parser.add_argument("--output_file", type=Path, required=True)
    parser.add_argument('--epoch', default=6000, type=int,
                        help='number of epochs to learn')
    parser.add_argument("--epoch_start", type=int, default=0)
    parser.add_argument('--snapshot', default=100, type=int,
                        help='interval of snapshot')
    parser.add_argument('--batchsize', type=int, default=4,
                        help='batch size')
    parser.add_argument('--optimizer', default='Adam',
                        choices=["Adam", "MomentumSGD", "RMSprop"], type=str,
                        help='optimizer to use: Adam, MomentumSGD, RMSprop')
    parser.add_argument('--lrate', default=0.00001, type=float,
                        help='learning rate for Adam, MomentumSGD or RMSprop')
    parser.add_argument('--genpath', type=str,
                        help='path to a pretrained generator')
    parser.add_argument('--clspath', type=str,
                        help='path to a pretrained classifier')
    parser.add_argument('--advdispath', type=str,
                        help='path to a pretrained real/fake discriminator')
    args = parser.parse_args()

    epsi = sys.float_info.epsilon
    output_file = args.output_file
    output_dir = output_file.with_suffix("")
    output_dir.mkdir(exist_ok=True, parents=True)

    all_source = np.load(args.train_data)
    Speakers, SpeakerIndividualKeys = separate_speaker(
        np.load(args.speaker_id))
    NormalizedAllData = get_separated_values(all_source,
                                             SpeakerIndividualKeys)
    SpeakerNum = len(Speakers)

    # Set input directories
    EpochNum = args.epoch
    BatchSize = args.batchsize
    SentenceNum = [len(SpeakerIndividualKeys[s]) for s in range(SpeakerNum)]
    MaxSentenceNum = max(SentenceNum)

    print('#GPU: {}'.format(args.gpu))
    print('#epoch: {}'.format(EpochNum))
    print('Optimizer: {}'.format(args.optimizer))
    print('Learning rate: {}'.format(args.lrate))
    print('Snapshot: {}'.format(args.snapshot))

    # Set up model
    num_mels = 36  # num_mels = data.shape[0] (36 dim)
    zdim = 5
    hdim = 32
    cdim = 8
    adim = 32
    generator_class = net.Generator_new
    classifier_class = net.Classifier1
    discriminator_class = net.AdvDiscriminator1
    loss_class = net.Loss_new

    generator = generator_class(SpeakerNum)
    paranum = sum(p.data.size for p in generator.params())
    print('Parameter #: {}'.format(paranum))

    classifier = classifier_class(num_mels, SpeakerNum, cdim)
    paranum = sum(p.data.size for p in classifier.params())
    print('Parameter #: {}'.format(paranum))

    adverserial_discriminator = discriminator_class(num_mels, SpeakerNum, adim)
    # adverserial_discriminator = net.AdvDiscriminator_noactive(num_mels, SpeakerNum, adim)
    paranum = sum(p.data.size for p in adverserial_discriminator.params())
    print('Parameter #: {}'.format(paranum))

    if args.genpath is not None:
        try:
            serializers.load_npz(args.genpath, generator)
        except Exception:
            print('Could not load generator.')
    if args.clspath is not None:
        try:
            serializers.load_npz(args.clspath, classifier)
        except Exception:
            print('Could not load domain classifier.')
    if args.advdispath is not None:
        try:
            serializers.load_npz(args.advdispath, adverserial_discriminator)
        except Exception:
            print('Could not load real/fake discriminator.')

    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        generator.to_gpu()
        classifier.to_gpu()
        adverserial_discriminator.to_gpu()
    xp = np if args.gpu < 0 else cuda.cupy

    # Set up optimizers
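    # Note (added): the generator objective below combines four terms,
    # weighted by w_adv / w_cls / w_cyc / w_rec — an adversarial term, a
    # domain-classification term, a cycle-consistency term, and a
    # reconstruction term, as in StarGAN-style voice conversion. The
    # real/fake discriminator and the domain classifier are updated with
    # their own losses (AdvLoss_d and ClsLoss_r).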
    # loss = net.Loss1(generator, classifier, adverserial_discriminator)
    loss = loss_class(generator, classifier, adverserial_discriminator)

    w_adv = 1.0
    w_cls = 1.0
    w_cyc = 1.0
    w_rec = 1.0

    if args.optimizer == 'MomentumSGD':
        opt_gen = optimizers.MomentumSGD(lr=args.lrate, momentum=0.9)
        opt_cls = optimizers.MomentumSGD(lr=args.lrate, momentum=0.9)
        opt_advdis = optimizers.MomentumSGD(lr=args.lrate, momentum=0.9)
    elif args.optimizer == 'Adam':
        opt_gen = optimizers.Adam(alpha=0.001, beta1=0.9)
        opt_cls = optimizers.Adam(alpha=0.00005, beta1=0.5)
        opt_advdis = optimizers.Adam(alpha=0.00001, beta1=0.5)
    elif args.optimizer == 'RMSprop':
        opt_gen = optimizers.RMSprop(lr=args.lrate)
        opt_cls = optimizers.RMSprop(lr=args.lrate)
        opt_advdis = optimizers.RMSprop(lr=args.lrate)
    opt_gen.setup(generator)
    opt_cls.setup(classifier)
    opt_advdis.setup(adverserial_discriminator)

    AllCombinationPairs = list(itertools.combinations(range(SpeakerNum), 2))

    # train
    for epoch in trange(args.epoch_start, EpochNum + 1):
        # shuffled_indexes[speaker_idx][idx] is an index into
        # NormalizedAllData[speaker_idx]
        shuffled_indexes = [
            myperm(SentenceNum[s], MaxSentenceNum) for s in range(SpeakerNum)
        ]
        for n in range(MaxSentenceNum // BatchSize):
            # batchlist_mcep[speaker_idx][sentence_idx_in_batch]
            batchlist_mcep = []
            begin_idx = n * BatchSize
            end_idx = begin_idx + BatchSize  # end_idx is exclusive
            for s in range(SpeakerNum):
                batch_tmp = []
                for idx in shuffled_indexes[s][begin_idx:end_idx]:
                    batch_tmp.append(
                        NormalizedAllData[s][idx].T)  # transpose here!
                batchlist_mcep.append(batch_tmp)

            # Convert each batch list into a single array
            X = [batchlist2array(batchlist) for batchlist in batchlist_mcep]
            xin = [
                chainer.Variable(xp.asarray(Xs, dtype=np.float32))
                for Xs in X
            ]

            # Iterate through all speaker pairs in random order
            random.shuffle(AllCombinationPairs)
            for s0, s1 in AllCombinationPairs:
                AdvLoss_d, AdvLoss_g, ClsLoss_r, ClsLoss_f, CycLoss, RecLoss \
                    = loss.calc_loss(xin[s0], xin[s1], s0, s1, SpeakerNum)
                gen_loss = (w_adv * AdvLoss_g + w_cls * ClsLoss_f +
                            w_cyc * CycLoss + w_rec * RecLoss)
                cls_loss = ClsLoss_r
                advdis_loss = AdvLoss_d

                generator.cleargrads()
                gen_loss.backward()
                opt_gen.update()

                classifier.cleargrads()
                cls_loss.backward()
                opt_cls.update()

                adverserial_discriminator.cleargrads()
                advdis_loss.backward()
                opt_advdis.update()

            print('epoch {}, mini-batch {}:'.format(epoch, n + 1))
            print('AdvLoss_d={}, AdvLoss_g={}, ClsLoss_r={}, ClsLoss_f={}'
                  .format(AdvLoss_d.data, AdvLoss_g.data, ClsLoss_r.data,
                          ClsLoss_f.data))
            print('CycLoss={}, RecLoss={}'.format(CycLoss.data, RecLoss.data))
            save_loss(output_dir, AdvLoss_d.data, AdvLoss_g.data,
                      ClsLoss_r.data, ClsLoss_f.data, CycLoss.data,
                      RecLoss.data)

        if epoch % args.snapshot == 0:
            snapshot_dir = output_dir / "snapshot"
            snapshot_dir.mkdir(exist_ok=True)
            snapshot(snapshot_dir, epoch, generator, classifier,
                     adverserial_discriminator)

            snapshot_feature_dir = output_dir / "snapshot_feature"
            snapshot_feature_dir.mkdir(exist_ok=True)
            output = {}
            with chainer.no_backprop_mode():
                identity = np.identity(SpeakerNum)
                for s in range(SpeakerNum):
                    speaker_vec = chainer.Variable(
                        xp.asarray(identity[s], dtype=np.float32))
                    for key, mcep in zip(SpeakerIndividualKeys[s],
                                         NormalizedAllData[s]):
                        mcep_T = mcep.T
                        out = generator.hidden_layer(
                            chainer.Variable(
                                xp.asarray(mcep_T[np.newaxis, :, :],
                                           dtype=np.float32)),
                            speaker_vec)
                        out = np.squeeze(cuda.to_cpu(out.data))
                        output[key] = out.T
            np.savez(
                snapshot_feature_dir /
                f"{output_file.stem}_epoch_{epoch:05}.npz", **output)

    # output final result
    output = {}
    with chainer.no_backprop_mode():
        identity = np.identity(SpeakerNum)
        for s in range(SpeakerNum):
            speaker_vec = chainer.Variable(
                xp.asarray(identity[s], dtype=np.float32))
            for key, mcep in zip(SpeakerIndividualKeys[s],
                                 NormalizedAllData[s]):
                mcep_T = mcep.T
                out = generator.hidden_layer(
                    chainer.Variable(
                        xp.asarray(mcep_T[np.newaxis, :, :],
                                   dtype=np.float32)),
                    speaker_vec)
                out = np.squeeze(cuda.to_cpu(out.data))
                output[key] = out.T
    np.savez(output_file, **output)
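
# Example invocation (added; the script name and paths are hypothetical,
# the flags are the ones defined in main() above):
#   python train_stargan_vc.py --gpu 0 \
#       --train_data data/normalized_mcep.npz \
#       --speaker_id data/speaker_id.npz \
#       --output_file results/hidden_features.npz \
#       --epoch 6000 --snapshot 100 --batchsize 4 --optimizer Adam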
parser.add_argument("--output_prefix", type=Path) parser.add_argument("--key_of_pickuped_sentences", nargs="+", required=True) parser.add_argument("--mode", choices=["ML", "RND"], default="ML") args = parser.parse_args() speakers, spkind_keys = separate_speaker(np.load(args.speaker_id)) speaker_num = len(speakers) target_idx = speakers.index(args.target_speaker) phn = np.load(args.phn) phn_N = int(max(map(np.max, phn.values()))) + 1 gold_transcription = get_separated_values(phn, spkind_keys)[target_idx] src_f0 = get_separated_values(np.load(args.f0), spkind_keys)[target_idx] src_ap = get_separated_values(np.load(args.ap), spkind_keys)[target_idx] src_mcep = get_separated_values(np.load(args.mcep), spkind_keys)[target_idx] ap_generator = AP_generator(phn_N, src_ap, letter_stateseq=gold_transcription, flat=args.flat_ap, mode=args.mode) f0_generator = F0_generator(phn_N, src_f0, letter_stateseq=gold_transcription, flat=args.flat_f0, mode=args.mode) mcep_generator = MCEP_generator(phn_N,