def train(train_A_dir, train_B_dir, model_dir, model_name, random_seed, validation_A_dir, validation_B_dir, output_dir, tensorboard_log_dir):
    """Train a CycleGAN voice-conversion model on WORLD-encoded MCEP features.

    Loads raw wavs from train_A_dir/train_B_dir, extracts WORLD features,
    normalizes them (saving the normalization stats under model_dir), then runs
    the adversarial training loop, periodically converting validation audio in
    both directions into output_dir.

    Note: tensorboard_log_dir is accepted but not used in this function body.
    """
    np.random.seed(random_seed)
    # Hyperparameters.
    num_epochs = 5000
    mini_batch_size = 1  # mini_batch_size = 1 is better
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 16000
    num_mcep = 24          # number of mel-cepstral coefficients per frame
    frame_period = 5.0     # WORLD analysis frame period in ms
    n_frames = 128         # frames per training sample
    lambda_cycle = 10      # weight of the cycle-consistency loss
    lambda_identity = 5    # weight of the identity-mapping loss (zeroed later)

    # ---------------- Data preprocessing ----------------
    print('Preprocessing Data...')
    start_time = time.time()

    wavs_A = load_wavs(wav_dir = train_A_dir, sr = sampling_rate)
    wavs_B = load_wavs(wav_dir = train_B_dir, sr = sampling_rate)

    f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = world_encode_data(wavs = wavs_A, fs = sampling_rate, frame_period = frame_period, coded_dim = num_mcep)
    f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = world_encode_data(wavs = wavs_B, fs = sampling_rate, frame_period = frame_period, coded_dim = num_mcep)

    # Log-F0 statistics are used later for pitch conversion between speakers.
    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)

    print('Log Pitch A')
    print('Mean: %f, Std: %f' %(log_f0s_mean_A, log_f0s_std_A))
    print('Log Pitch B')
    print('Mean: %f, Std: %f' %(log_f0s_mean_B, log_f0s_std_B))

    coded_sps_A_transposed = transpose_in_list(lst = coded_sps_A)
    coded_sps_B_transposed = transpose_in_list(lst = coded_sps_B)

    # Fit per-speaker normalization; means/stds are needed again at conversion time.
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std = coded_sps_normalization_fit_transoform(coded_sps = coded_sps_A_transposed)
    print("Input data fixed.")
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std = coded_sps_normalization_fit_transoform(coded_sps = coded_sps_B_transposed)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    # Persist normalization statistics so inference can reproduce them.
    np.savez(os.path.join(model_dir, 'logf0s_normalization.npz'), mean_A = log_f0s_mean_A, std_A = log_f0s_std_A, mean_B = log_f0s_mean_B, std_B = log_f0s_std_B)
    np.savez(os.path.join(model_dir, 'mcep_normalization.npz'), mean_A = coded_sps_A_mean, std_A = coded_sps_A_std, mean_B = coded_sps_B_mean, std_B = coded_sps_B_std)

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)

    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    end_time = time.time()
    time_elapsed = end_time - start_time

    print('Preprocessing Done.')
    print('Time Elapsed for Data Preprocessing: %02d:%02d:%02d' % (time_elapsed // 3600, (time_elapsed % 3600 // 60), (time_elapsed % 60 // 1)))

    model = CycleGAN(num_features = num_mcep)

    # ---------------- Training loop ----------------
    for epoch in range(num_epochs):
        print('Epoch: %d' % epoch)
        '''
        if epoch > 60:
            lambda_identity = 0
        if epoch > 1250:
            generator_learning_rate = max(0, generator_learning_rate - 0.0000002)
            discriminator_learning_rate = max(0, discriminator_learning_rate - 0.0000001)
        '''
        start_time_epoch = time.time()

        # Fresh random (non-parallel) pairing of A/B segments each epoch.
        dataset_A, dataset_B = sample_train_data(dataset_A = coded_sps_A_norm, dataset_B = coded_sps_B_norm, n_frames = n_frames)

        n_samples = dataset_A.shape[0]

        for i in range(n_samples // mini_batch_size):
            num_iterations = n_samples // mini_batch_size * epoch + i

            # Schedule from the CycleGAN-VC paper: drop the identity loss after
            # 10k iterations, then linearly decay the learning rates after 200k.
            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(0, discriminator_learning_rate - discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss = model.train(input_A = dataset_A[start:end], input_B = dataset_B[start:end], lambda_cycle = lambda_cycle, lambda_identity = lambda_identity, generator_learning_rate = generator_learning_rate, discriminator_learning_rate = discriminator_learning_rate)

            if i % 50 == 0:
                # print('Iteration: %d, Generator Loss : %f, Discriminator Loss : %f' % (num_iterations, generator_loss, discriminator_loss))
                print('Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'.format(num_iterations, generator_learning_rate, discriminator_learning_rate, generator_loss, discriminator_loss))

        # Checkpoint every epoch (same filename, so only the latest is kept).
        model.save(directory = model_dir, filename = model_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        print('Time Elapsed for This Epoch: %02d:%02d:%02d' % (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60), (time_elapsed_epoch % 60 // 1)))

        # ---------------- Validation: convert A -> B every 50 epochs ----------------
        if validation_A_dir is not None:
            if epoch % 50 == 0:
                print('Generating Validation Data B from A...')
                for file in os.listdir(validation_A_dir):
                    filepath = os.path.join(validation_A_dir, file)
                    wav, _ = librosa.load(filepath, sr = sampling_rate, mono = True)
                    # Pad so the frame count is a multiple of 4 (presumably required
                    # by the generator's downsampling — TODO confirm against model).
                    wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
                    f0, timeaxis, sp, ap = world_decompose(wav = wav, fs = sampling_rate, frame_period = frame_period)
                    # Gaussian-normalized log-F0 transform from speaker A to B.
                    f0_converted = pitch_conversion(f0 = f0, mean_log_src = log_f0s_mean_A, std_log_src = log_f0s_std_A, mean_log_target = log_f0s_mean_B, std_log_target = log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(sp = sp, fs = sampling_rate, dim = num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm = model.test(inputs = np.array([coded_sp_norm]), direction = 'A2B')[0]
                    # De-normalize with the *target* speaker's statistics.
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(coded_sp = coded_sp_converted, fs = sampling_rate)
                    wav_transformed = world_speech_synthesis(f0 = f0_converted, decoded_sp = decoded_sp_converted, ap = ap, fs = sampling_rate, frame_period = frame_period)
                    librosa.output.write_wav(os.path.join(validation_A_output_dir, os.path.basename(file)), wav_transformed, sampling_rate)

        # ---------------- Validation: convert B -> A every 50 epochs ----------------
        if validation_B_dir is not None:
            if epoch % 50 == 0:
                print('Generating Validation Data A from B...')
                for file in os.listdir(validation_B_dir):
                    filepath = os.path.join(validation_B_dir, file)
                    wav, _ = librosa.load(filepath, sr = sampling_rate, mono = True)
                    wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
                    f0, timeaxis, sp, ap = world_decompose(wav = wav, fs = sampling_rate, frame_period = frame_period)
                    f0_converted = pitch_conversion(f0 = f0, mean_log_src = log_f0s_mean_B, std_log_src = log_f0s_std_B, mean_log_target = log_f0s_mean_A, std_log_target = log_f0s_std_A)
                    coded_sp = world_encode_spectral_envelop(sp = sp, fs = sampling_rate, dim = num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_B_mean) / coded_sps_B_std
                    coded_sp_converted_norm = model.test(inputs = np.array([coded_sp_norm]), direction = 'B2A')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_A_std + coded_sps_A_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(coded_sp = coded_sp_converted, fs = sampling_rate)
                    wav_transformed = world_speech_synthesis(f0 = f0_converted, decoded_sp = decoded_sp_converted, ap = ap, fs = sampling_rate, frame_period = frame_period)
                    librosa.output.write_wav(os.path.join(validation_B_output_dir, os.path.basename(file)), wav_transformed, sampling_rate)
def train(img_A_dir, img_B_dir, model_dir, model_name, random_seed, batch_size_maximum, validation_A_dir, validation_B_dir, output_dir, tensorboard_log_dir):
    """Train an image-to-image CycleGAN and convert validation images each epoch.

    Training images are read from img_A_dir / img_B_dir; after every epoch the
    model is checkpointed to model_dir and any validation images are converted
    in both directions into subfolders of output_dir.
    """
    np.random.seed(random_seed)

    # Hyperparameters.
    num_epochs = 1000
    mini_batch_size = 1  # mini_batch_size = 1 is better
    learning_rate = 0.0002
    input_size = [256, 256, 3]
    num_filters = 64  # Tried num_filters = 8 still not good for 200 epochs

    def _prepare_output_dir(subdir_name):
        # Build (and create if missing) the output subdirectory for converted images.
        target = os.path.join(output_dir, subdir_name)
        if not os.path.exists(target):
            os.makedirs(target)
        return target

    validation_A_output_dir = _prepare_output_dir('converted_A') if validation_A_dir is not None else None
    validation_B_output_dir = _prepare_output_dir('converted_B') if validation_B_dir is not None else None

    model = CycleGAN(input_size=input_size, num_filters=num_filters, mode='train', log_dir=tensorboard_log_dir)

    dataset_A_raw = load_data(img_dir=img_A_dir, load_size=256)
    dataset_B_raw = load_data(img_dir=img_B_dir, load_size=256)

    def _convert_directory(source_dir, target_dir, direction):
        # Run every image in source_dir through the model and write the result
        # (resized back to its original dimensions) into target_dir.
        for filename in os.listdir(source_dir):
            source_path = os.path.join(source_dir, filename)
            original = cv2.imread(source_path)
            orig_height, orig_width, _channels = original.shape
            resized = cv2.resize(original, (input_size[1], input_size[0]))
            scaled = image_scaling(imgs=resized)
            converted = model.test(inputs=np.array([scaled]), direction=direction)[0]
            converted = image_scaling_inverse(imgs=converted)
            converted = cv2.resize(converted, (orig_width, orig_height))
            cv2.imwrite(os.path.join(target_dir, os.path.basename(filename)), converted)

    for epoch_idx in range(num_epochs):
        print('Epoch: %d' % epoch_idx)
        epoch_started_at = time.time()

        # Fresh random crops / pairings each epoch.
        dataset_A, dataset_B = sample_train_data(
            dataset_A_raw, dataset_B_raw,
            load_size=286, output_size=256,
            batch_size_maximum=batch_size_maximum)

        n_samples = dataset_A.shape[0]
        total_batches = n_samples // mini_batch_size

        for batch_idx in range(total_batches):
            lo = batch_idx * mini_batch_size
            hi = lo + mini_batch_size
            generator_loss, discriminator_loss = model.train(
                input_A=dataset_A[lo:hi],
                input_B=dataset_B[lo:hi],
                learning_rate=learning_rate)
            if batch_idx % 50 == 0:
                print(
                    'Minibatch: %d, Generator Loss : %f, Discriminator Loss : %f'
                    % (batch_idx, generator_loss, discriminator_loss))

        model.save(directory=model_dir, filename=model_name)

        if validation_A_dir is not None:
            _convert_directory(validation_A_dir, validation_A_output_dir, 'A2B')
        if validation_B_dir is not None:
            _convert_directory(validation_B_dir, validation_B_output_dir, 'B2A')

        epoch_elapsed = time.time() - epoch_started_at
        print('Time Elapsed for This Epoch: %02d:%02d:%02d' % (
            epoch_elapsed // 3600,
            (epoch_elapsed % 3600 // 60),
            (epoch_elapsed % 60 // 1)))
def train(train_A_dir, train_B_dir, training_data_dir, model_dir, model_name, random_seed, validation_A_dir, validation_B_dir, output_dir):
    """Train CycleGAN-VC from cached features, resuming from the last checkpoint.

    Preprocessed normalized MCEPs and normalization statistics are loaded from
    training_data_dir. If model_dir contains a TensorFlow `checkpoint` index
    file, training resumes from the epoch encoded in the latest checkpoint
    name; otherwise it starts fresh. Validation audio is converted in both
    directions every 300 epochs.

    Note: train_A_dir / train_B_dir are accepted but unused here — all training
    data comes from training_data_dir.
    """
    np.random.seed(random_seed)
    # Hyperparameters.
    num_epochs = 2000
    mini_batch_size = 1
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 16000
    num_mcep = 24
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = 10
    lambda_identity = 5

    # ****************************************************************
    # *************************Loading DATA***************************
    # ****************************************************************
    with open(os.path.join(training_data_dir, 'A_coded_norm.pk'), "rb") as fa:
        coded_sps_A_norm = pickle.load(fa)
    with open(os.path.join(training_data_dir, 'B_coded_norm.pk'), "rb") as fb:
        coded_sps_B_norm = pickle.load(fb)

    # Normalization statistics, needed to (de)normalize validation features.
    mcep_normalization_params = np.load(
        os.path.join(training_data_dir, 'mcep_normalization.npz'))
    coded_sps_A_mean = mcep_normalization_params['mean_A']
    coded_sps_A_std = mcep_normalization_params['std_A']
    coded_sps_B_mean = mcep_normalization_params['mean_B']
    coded_sps_B_std = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(
        os.path.join(training_data_dir, 'logf0s_normalization.npz'))
    log_f0s_mean_A = logf0s_normalization_params['mean_A']
    log_f0s_std_A = logf0s_normalization_params['std_A']
    log_f0s_mean_B = logf0s_normalization_params['mean_B']
    log_f0s_std_B = logf0s_normalization_params['std_B']

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)
    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    print("****************************************************************")
    print("*************************Start Training*************************")
    print("****************************************************************")
    model = CycleGAN(num_features=num_mcep)

    epoch = 0
    # Resume: parse the latest checkpoint name out of TF's `checkpoint` file
    # (its last line looks like:  all_model_checkpoint_paths: "name-<epoch>.ckpt").
    checkpoint_state_file = os.path.join(model_dir, "checkpoint")
    if os.path.exists(checkpoint_state_file):
        with open(checkpoint_state_file, "r") as f:
            all_ckpt = f.readlines()
        pretrain_ckpt = all_ckpt[-1].split("\n")[0].split("\"")[1]
        epoch = int(pretrain_ckpt.split("-")[1].split(".")[0])
        if os.path.exists(os.path.join(model_dir, pretrain_ckpt + ".index")):
            model.load(filepath=os.path.join(model_dir, pretrain_ckpt))
            print("Loading pretrained model {}".format(pretrain_ckpt))
        else:
            print("Training model from 0 epoch")

    for k in range(epoch + 1, num_epochs):
        print('Epoch: %d' % k)
        start_time_epoch = time.time()

        pool_A, pool_B = list(coded_sps_A_norm), list(coded_sps_B_norm)
        dataset_A, dataset_B = sample_train_data(dataset_A=pool_A,
                                                 dataset_B=pool_B,
                                                 n_frames=n_frames)
        print('dataset_A', np.shape(dataset_A), 'dataset_B', np.shape(dataset_B))

        n_samples = dataset_A.shape[0]
        # BUG FIX: guard against modulo-by-zero when n_samples < 2.
        log_interval = max(1, n_samples // 2)

        for i in trange(n_samples // mini_batch_size):
            # BUG FIX: the global iteration count must advance with the current
            # epoch `k`; the original used the stale resume variable `epoch`,
            # freezing the lambda_identity / learning-rate schedules.
            num_iterations = n_samples // mini_batch_size * k + i

            # Schedule: drop identity loss after 10k iterations; decay both
            # learning rates linearly after 200k.
            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(
                    0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(
                    0, discriminator_learning_rate - discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss = model.train(
                input_A=dataset_A[start:end],
                input_B=dataset_B[start:end],
                lambda_cycle=lambda_cycle,
                lambda_identity=lambda_identity,
                generator_learning_rate=generator_learning_rate,
                discriminator_learning_rate=discriminator_learning_rate)

            if i % log_interval == 0:
                print(
                    'Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                    .format(num_iterations, generator_learning_rate,
                            discriminator_learning_rate, generator_loss,
                            discriminator_loss))

        # Checkpoint at epoch 1 and every 100 epochs thereafter.
        if k == 1 or k % 100 == 0:
            print("Saving Epoch {}".format(k))
            ckpt_name = model_name + "-" + str(k) + ".ckpt"
            model.save(directory=model_dir, filename=ckpt_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch
        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))

        # -------- Validation: convert A -> B every 300 epochs --------
        if validation_A_dir is not None:
            if k % 300 == 0:
                print('Generating Validation Data B from A...')
                # Hoisted: the original called os.listdir on every iteration.
                validation_A_files = os.listdir(validation_A_dir)
                for i in trange(len(validation_A_files)):
                    file = validation_A_files[i]
                    filepath = os.path.join(validation_A_dir, file)
                    wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_A,
                        std_log_src=log_f0s_std_A,
                        mean_log_target=log_f0s_mean_B,
                        std_log_target=log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm = model.test(inputs=np.array(
                        [coded_sp_norm]), direction='A2B')[0]
                    # De-normalize with the target speaker's statistics.
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_A_output_dir,
                                     os.path.basename(file)),
                        wav_transformed, sampling_rate)

        # -------- Validation: convert B -> A every 300 epochs --------
        if validation_B_dir is not None:
            if k % 300 == 0:
                print('Generating Validation Data A from B...')
                # BUG FIX: the original indexed os.listdir(validation_A_dir)
                # while reading files from validation_B_dir.
                validation_B_files = os.listdir(validation_B_dir)
                for i in trange(len(validation_B_files)):
                    file = validation_B_files[i]
                    filepath = os.path.join(validation_B_dir, file)
                    wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_B,
                        std_log_src=log_f0s_std_B,
                        mean_log_target=log_f0s_mean_A,
                        std_log_target=log_f0s_std_A)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_B_mean) / coded_sps_B_std
                    coded_sp_converted_norm = model.test(inputs=np.array(
                        [coded_sp_norm]), direction='B2A')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_A_std + coded_sps_A_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_B_output_dir,
                                     os.path.basename(file)),
                        wav_transformed, sampling_rate)
def train(train_A_dir, train_B_dir, model_dir, model_name, random_seed, validation_A_dir, validation_B_dir, output_dir, tensorboard_log_dir, gen_model, MCEPs_dim, lambda_list, processed_data_dir):
    """Train a CycleGAN voice-conversion model with cached WORLD features.

    If cached feature archives exist in processed_data_dir they are loaded;
    otherwise the raw training wavs are encoded, the archives are written, and
    the function RETURNS EARLY (the caller is expected to rerun to train).
    Outlier-pitch samples are removed before normalization. Validation audio
    is converted in both directions every 10 epochs.
    """
    # Threshold for the (currently commented-out) best-A2B-model saving logic.
    gen_loss_thres = 100.0
    np.random.seed(random_seed)
    # Hyperparameters.
    num_epochs = 5000
    mini_batch_size = 1
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 44000
    num_mcep = MCEPs_dim
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = lambda_list[0]      # cycle-consistency loss weight
    lambda_identity = lambda_list[1]   # identity-mapping loss weight

    # Cached per-speaker feature archives.
    Speaker_A_features = os.path.join(processed_data_dir, 'wav_A.npz')
    Speaker_B_features = os.path.join(processed_data_dir, 'wav_B.npz')

    start_time = time.time()
    print('lookiong for preprocessed data in:{}'.format(processed_data_dir))
    if os.path.exists(Speaker_A_features) and os.path.exists(
            Speaker_B_features):
        print('#### loading processed data #######')
        f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = load_speaker_features(
            Speaker_A_features)
        f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = load_speaker_features(
            Speaker_B_features)
    else:
        # First run: encode raw wavs, cache the features, and exit early.
        print('Preprocessing Data...')
        if not os.path.exists(processed_data_dir):
            os.makedirs(processed_data_dir)
        wavs_A = load_wavs(wav_dir=train_A_dir, sr=sampling_rate)
        f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = world_encode_data(
            wavs=wavs_A,
            fs=sampling_rate,
            frame_period=frame_period,
            coded_dim=num_mcep)
        np.savez(Speaker_A_features,
                 f0s=f0s_A,
                 timeaxes=timeaxes_A,
                 sps=sps_A,
                 aps=aps_A,
                 coded_sps=coded_sps_A)
        del wavs_A  # free the raw audio before loading speaker B
        wavs_B = load_wavs(wav_dir=train_B_dir, sr=sampling_rate)
        f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = world_encode_data(
            wavs=wavs_B,
            fs=sampling_rate,
            frame_period=frame_period,
            coded_dim=num_mcep)
        np.savez(Speaker_B_features,
                 f0s=f0s_B,
                 timeaxes=timeaxes_B,
                 sps=sps_B,
                 aps=aps_B,
                 coded_sps=coded_sps_B)
        del wavs_B
        print('Data preprocessing finished !')
        # NOTE: deliberately stops here after caching; rerun to train.
        return

    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)
    print('Log Pitch A')
    print('Mean: %f, Std: %f' % (log_f0s_mean_A, log_f0s_std_A))
    print('Log Pitch B')
    print('Mean: %f, Std: %f' % (log_f0s_mean_B, log_f0s_std_B))

    # Drop samples with outlier pitch, then recompute the log-F0 statistics.
    coded_sps_A, f0s_A = remove_radical_pitch_samples(f0s_A, coded_sps_A,
                                                      log_f0s_mean_A,
                                                      log_f0s_std_A)
    coded_sps_B, f0s_B = remove_radical_pitch_samples(f0s_B, coded_sps_B,
                                                      log_f0s_mean_B,
                                                      log_f0s_std_B)
    print('recalculating mean and std of radical cleared f0s')
    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)

    coded_sps_A_transposed = transpose_in_list(lst=coded_sps_A)
    coded_sps_B_transposed = transpose_in_list(lst=coded_sps_B)
    print("Input data fixed.")
    # Fit per-speaker normalization; means/stds are reused at conversion time.
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std = coded_sps_normalization_fit_transoform(
        coded_sps=coded_sps_A_transposed)
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std = coded_sps_normalization_fit_transoform(
        coded_sps=coded_sps_B_transposed)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    # Persist normalization statistics for later inference.
    np.savez(os.path.join(model_dir, 'logf0s_normalization.npz'),
             mean_A=log_f0s_mean_A,
             std_A=log_f0s_std_A,
             mean_B=log_f0s_mean_B,
             std_B=log_f0s_std_B)
    np.savez(os.path.join(model_dir, 'mcep_normalization.npz'),
             mean_A=coded_sps_A_mean,
             std_A=coded_sps_A_std,
             mean_B=coded_sps_B_mean,
             std_B=coded_sps_B_std)

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)
    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    end_time = time.time()
    time_elapsed = end_time - start_time
    print('Preprocessing Done.')
    print('Time Elapsed for Data Preprocessing: %02d:%02d:%02d' %
          (time_elapsed // 3600, (time_elapsed % 3600 // 60),
           (time_elapsed % 60 // 1)))
    # ---------------------------------------------- Data preprocessing ---------------------------------------------- #

    # Model define
    model = CycleGAN(num_features=num_mcep,
                     log_dir=tensorboard_log_dir,
                     model_name=model_name,
                     gen_model=gen_model)
    # load model (resume if a saved checkpoint index exists)
    if os.path.exists(os.path.join(model_dir, (model_name + ".index"))) == True:
        model.load(filepath=os.path.join(model_dir, model_name))

    # =================================================== Training =================================================== #
    for epoch in range(num_epochs):
        print('Epoch: %d' % epoch)
        start_time_epoch = time.time()

        # Fresh random (non-parallel) pairing of A/B segments each epoch.
        dataset_A, dataset_B = sample_train_data(dataset_A=coded_sps_A_norm,
                                                 dataset_B=coded_sps_B_norm,
                                                 n_frames=n_frames)
        n_samples = dataset_A.shape[0]
        # -------------------------------------------- one epoch learning -------------------------------------------- #
        for i in tqdm.tqdm(range(n_samples // mini_batch_size)):
            num_iterations = n_samples // mini_batch_size * epoch + i

            # Schedule: drop identity loss after 10k iterations; linearly decay
            # both learning rates after 200k.
            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(
                    0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(
                    0, discriminator_learning_rate - discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss, generator_loss_A2B = model.train\
                (input_A = dataset_A[start:end], input_B = dataset_B[start:end], lambda_cycle = lambda_cycle,
                 lambda_identity = lambda_identity, generator_learning_rate = generator_learning_rate,
                 discriminator_learning_rate = discriminator_learning_rate)  # issue #4,
            # model.summary()

            # Minimum AtoB loss model save
            # if gen_loss_thres > generator_loss_A2B:
            #     gen_loss_thres = generator_loss_A2B
            #     best_model_name = 'Bestmodel' + model_name
            #     model.save(directory=model_dir, filename=best_model_name)
            #     print("generator loss / generator A2B loss ", generator_loss, generator_loss_A2B)

            if i % 50 == 0:
                print(
                    'Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                    .format(num_iterations, generator_learning_rate,
                            discriminator_learning_rate, generator_loss,
                            discriminator_loss))

        # Last model save
        if epoch % 10 == 0:
            model.save(directory=model_dir, filename=model_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch
        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))
        # -------------------------------------------- one epoch learning -------------------------------------------- #

        # ------------------------------------------- validation inference ------------------------------------------- #
        if validation_A_dir is not None:
            # if epoch % 50 == 0:
            if epoch % 10 == 0:
                print('Generating Validation Data B from A...')
                for file in os.listdir(validation_A_dir):
                    filepath = os.path.join(validation_A_dir, file)
                    wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
                    # Pad so the frame count is a multiple of 4 (presumably
                    # required by the generator's downsampling — TODO confirm).
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    # Gaussian-normalized log-F0 transform from speaker A to B.
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_A,
                        std_log_src=log_f0s_std_A,
                        mean_log_target=log_f0s_mean_B,
                        std_log_target=log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm = model.test(inputs=np.array(
                        [coded_sp_norm]), direction='A2B')[0]
                    # De-normalize with the target speaker's statistics.
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_A_output_dir,
                                     os.path.basename(file)), wav_transformed,
                        sampling_rate)
                    # break
        if validation_B_dir is not None:
            # if epoch % 50 == 0:
            if epoch % 10 == 0:
                print('Generating Validation Data A from B...')
                for file in os.listdir(validation_B_dir):
                    filepath = os.path.join(validation_B_dir, file)
                    wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_B,
                        std_log_src=log_f0s_std_B,
                        mean_log_target=log_f0s_mean_A,
                        std_log_target=log_f0s_std_A)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_B_mean) / coded_sps_B_std
                    coded_sp_converted_norm = model.test(inputs=np.array(
                        [coded_sp_norm]), direction='B2A')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_A_std + coded_sps_A_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_B_output_dir,
                                     os.path.basename(file)), wav_transformed,
                        sampling_rate)
def train(train_dir, model_dir, model_name, random_seed, \
        validation_dir, output_dir, pre_train=None, \
        lambda_cycle=0, lambda_momenta=0):
    """Train a pitch/MFC CycleGAN from .mat feature files.

    Reads parallel src/tar features from train.mat and valid.mat in train_dir,
    shuffles A and B independently to make the pairing non-parallel, then runs
    the training loop. Every 100 epochs the model is checkpointed and
    validation pitch contours are plotted to ./generated_pitch/.

    Parameters mirror the original script; lambda_cycle / lambda_momenta are
    the loss weights passed straight through to model.train().
    """
    np.random.seed(random_seed)

    # Hyperparameters.
    num_epochs = 500
    mini_batch_size = 1
    generator_learning_rate = 0.0001
    discriminator_learning_rate = 0.0000001
    num_mcep = 23
    n_frames = 128
    # (Removed the original no-op self-assignments of lambda_cycle/lambda_momenta.)

    # Tag used for the log file and output directory names.
    lc_lm = "lc_"+str(lambda_cycle) \
            +"_lm_"+str(lambda_momenta)

    # Fresh per-run log file.
    logger_file = './log/' + lc_lm + '.log'
    if not os.path.exists('./log'):
        os.mkdir('./log')
    if os.path.exists(logger_file):
        os.remove(logger_file)
    logging.basicConfig(filename=logger_file, \
                        level=logging.DEBUG)
    logging.info("lambda_cycle - {}".format(lambda_cycle))
    logging.info("lambda_momenta - {}".format(lambda_momenta))

    if not os.path.isdir("./generated_pitch/" + lc_lm):
        # BUG FIX: os.mkdir fails if ./generated_pitch itself is missing;
        # makedirs creates the full path.
        os.makedirs("./generated_pitch/" + lc_lm)
    else:
        # Clear stale plots. NOTE(review): this clears ./generated_pitch (the
        # parent), which matches where savefig below writes; the lc_lm
        # subdirectory created above is never written to — confirm intent.
        for f in glob(os.path.join("./generated_pitch/", "*.png")):
            os.remove(f)

    start_time = time.time()

    # Load features: src/tar F0 contours (expanded to a trailing channel dim)
    # and MFC matrices for train and validation.
    data_train = scio.loadmat(os.path.join(train_dir, 'train.mat'))
    data_valid = scio.loadmat(os.path.join(train_dir, 'valid.mat'))

    pitch_A_train = np.expand_dims(data_train['src_f0_feat'], axis=-1)
    pitch_B_train = np.expand_dims(data_train['tar_f0_feat'], axis=-1)
    mfc_A_train = data_train['src_mfc_feat']
    mfc_B_train = data_train['tar_mfc_feat']

    pitch_A_valid = np.expand_dims(data_valid['src_f0_feat'], axis=-1)
    pitch_B_valid = np.expand_dims(data_valid['tar_f0_feat'], axis=-1)
    mfc_A_valid = data_valid['src_mfc_feat']
    mfc_B_valid = data_valid['tar_mfc_feat']

    # Shuffle to get non-parallel training data (A and B are permuted with
    # two independent shuffles of the same index array).
    indices_train = np.arange(0, pitch_A_train.shape[0])
    np.random.shuffle(indices_train)
    pitch_A_train = pitch_A_train[indices_train]
    mfc_A_train = mfc_A_train[indices_train]
    np.random.shuffle(indices_train)
    pitch_B_train = pitch_B_train[indices_train]
    mfc_B_train = mfc_B_train[indices_train]

    mfc_A_valid, pitch_A_valid, \
        mfc_B_valid, pitch_B_valid = preproc.sample_data(mfc_A=mfc_A_valid, \
                            mfc_B=mfc_B_valid, pitch_A=pitch_A_valid, \
                            pitch_B=pitch_B_valid)

    if validation_dir is not None:
        validation_output_dir = os.path.join(output_dir, lc_lm)
        if not os.path.exists(validation_output_dir):
            os.makedirs(validation_output_dir)

    end_time = time.time()
    time_elapsed = end_time - start_time
    print('Time Elapsed for Data Preprocessing: %02d:%02d:%02d' % (time_elapsed // 3600, \
            (time_elapsed % 3600 // 60), \
            (time_elapsed % 60 // 1)))

    # use pre_train arg to provide trained model
    model = CycleGAN(dim_pitch=1, dim_mfc=num_mcep, \
                n_frames=n_frames, pre_train=pre_train)

    for epoch in range(1, num_epochs + 1):
        print('Epoch: %d' % epoch)
        logging.info('Epoch: %d' % epoch)
        start_time_epoch = time.time()

        # Fresh sampling of training segments each epoch.
        mfc_A, pitch_A, \
            mfc_B, pitch_B = preproc.sample_data(mfc_A=mfc_A_train, \
                                mfc_B=mfc_B_train, pitch_A=pitch_A_train, \
                                pitch_B=pitch_B_train)

        n_samples = mfc_A.shape[0]

        train_gen_loss = []
        train_disc_loss = []

        for i in range(n_samples // mini_batch_size):
            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss, \
                gen_A, gen_B, \
                mom_A, mom_B = model.train(mfc_A=mfc_A[start:end], \
                                mfc_B=mfc_B[start:end], \
                                pitch_A=pitch_A[start:end], \
                                pitch_B=pitch_B[start:end], \
                                lambda_cycle=lambda_cycle, \
                                lambda_momenta=lambda_momenta, \
                                generator_learning_rate=generator_learning_rate, \
                                discriminator_learning_rate=discriminator_learning_rate)

            train_gen_loss.append(generator_loss)
            train_disc_loss.append(discriminator_loss)

        logging.info("Train Generator Loss- {}".format(
            np.mean(train_gen_loss)))
        logging.info("Train Discriminator Loss- {}".format(
            np.mean(train_disc_loss)))

        # Every 100 epochs: plot converted validation pitch contours.
        if epoch % 100 == 0:
            for i in range(mfc_A_valid.shape[0]):
                gen_A, gen_B, mom_A, mom_B \
                    = model.test_gen(mfc_A=mfc_A_valid[i:i+1], \
                        mfc_B=mfc_B_valid[i:i+1], \
                        pitch_A=pitch_A_valid[i:i+1], \
                        pitch_B=pitch_B_valid[i:i+1])

                pylab.figure(figsize=(12, 12))
                pylab.subplot(121)
                pylab.plot(pitch_A_valid[i].reshape(-1, ), label='Input A')
                pylab.plot(gen_B.reshape(-1, ), label='Generated B')
                pylab.plot(mom_B.reshape(-1, ), label='Generated momenta')
                pylab.legend(loc=2)
                pylab.subplot(122)
                pylab.plot(pitch_B_valid[i].reshape(-1, ), label='Input B')
                pylab.plot(gen_A.reshape(-1, ), label='Generated A')
                pylab.plot(mom_A.reshape(-1, ), label='Generated momenta')
                pylab.legend(loc=2)
                pylab.title('Epoch ' + str(epoch) + ' example ' + str(i + 1))
                pylab.savefig('./generated_pitch/' + str(epoch) + '_' +
                              str(i + 1) + '.png')
                pylab.close()

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        logging.info('Time Elapsed for This Epoch: %02d:%02d:%02d' % (time_elapsed_epoch // 3600, \
                (time_elapsed_epoch % 3600 // 60), (time_elapsed_epoch % 60 // 1)))

        # Checkpoint every 100 epochs with the epoch number in the filename.
        if epoch % 100 == 0:
            cur_model_name = model_name + "_" + str(epoch) + ".ckpt"
            model.save(directory=model_dir, filename=cur_model_name)