def generate_world_features(filenames, data_dir):
    """Create and save WORLD features, f0 contours and labels per sample.

    For each name in ``filenames`` up to three ``.npy`` files are written.
    Each is named after the input with its last four characters (the
    ``.wav`` suffix) replaced by ``.npy``:

    * ``<data_dir>/world``  -- ``coded_sp`` as returned by ``cal_mcep``
    * ``<data_dir>/labels`` -- labels as returned by ``get_wav_and_labels``
    * ``<data_dir>/f0``     -- ``f0`` as returned by ``cal_mcep``

    A sample is skipped when its first label is ``-1`` or when
    ``coded_sp.shape[1]`` is not below ``MAX_LENGTH``. Samples whose three
    output files already exist are counted as made without being reloaded
    or recomputed.

    Args:
        filenames: Iterable of audio file names, each passed to
            ``get_wav_and_labels`` together with ``data_dir``.
        data_dir: Dataset root directory; the three output directories are
            created beneath it (including missing parents).
    """
    MAX_LENGTH = 1719

    world_dir = os.path.join(data_dir, 'world')
    f0_dir = os.path.join(data_dir, 'f0')
    labels_dir = os.path.join(data_dir, "labels")

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    for out_dir in (world_dir, f0_dir, labels_dir):
        os.makedirs(out_dir, exist_ok=True)

    worlds_made = 0

    for i, f in enumerate(filenames):
        # f[:-4] strips the 4-character ".wav" suffix.
        stem = f[:-4]
        coded_sp_name = os.path.join(world_dir, stem + ".npy")
        label_name = os.path.join(labels_dir, stem + ".npy")
        f0_name = os.path.join(f0_dir, stem + ".npy")

        # Check for existing outputs BEFORE loading audio so that re-runs
        # do not pay the cost of reading every wav file again.
        if (os.path.exists(coded_sp_name) and os.path.exists(label_name)
                and os.path.exists(f0_name)):
            worlds_made += 1
            continue

        wav, labels = get_wav_and_labels(f, data_dir)
        labels = np.array(labels)

        # Ignore the sample if it has the wrong emotion (label -1).
        if labels[0] != -1:
            wav = np.array(wav, dtype=np.float64)
            f0, _ap, _sp, coded_sp = cal_mcep(wav)

            # Ignore the sample if it is too long.
            if coded_sp.shape[1] < MAX_LENGTH:
                np.save(coded_sp_name, coded_sp)
                np.save(label_name, labels)
                np.save(f0_name, f0)
                worlds_made += 1

        if i % 10 == 0:
            print(i, " complete.")
            print(worlds_made, "worlds made.")
def _single_conversion(filename, model, one_hot_emo):
    '''
    Convert one sample to the target emotion and save it as a wav file.

    NOTE: this function was flagged by its author as not working; the
    WORLD conversion loop in main was the recommended path. This version
    fixes two definite errors (the ``filename`` argument was ignored in
    favour of a global list entry, and the output name concatenated a str
    with an integer), but the trimming logic below is unverified.

    Call only from __main__ section in this module: it reads the
    module-level ``config`` and ``device``.

    (str) filename - name.wav file to be converted
    (StarGAN-emo-VC1) model - pretrained model to perform conversion
    (torch.Tensor(long)) one_hot_emo - one hot encoding of emotion to convert to
    '''
    wav, labels = pp.get_wav_and_labels(filename,
                                        config['data']['dataset_dir'])
    wav = np.array(wav, dtype=np.double)

    f0, ap, sp, coded_sp = pw.cal_mcep(wav)

    # The transposed features get two leading singleton axes before being
    # handed to the generator.
    coded_sp = coded_sp.T
    coded_sp_torch = torch.Tensor(coded_sp).unsqueeze(0).unsqueeze(0).to(
        device=device)

    fake = model.G(coded_sp_torch, one_hot_emo.unsqueeze(0))
    fake = fake.squeeze()

    print("Sampled size = ", fake.size())

    converted_sp = fake.cpu().detach().numpy()
    converted_sp = np.array(converted_sp, dtype=np.float64)

    # Truncate ap / f0 to the generator's output length when they differ.
    sample_length = converted_sp.shape[0]
    if sample_length != ap.shape[0]:
        ap = np.ascontiguousarray(ap[0:sample_length, :], dtype=np.float64)
        f0 = np.ascontiguousarray(f0[0:sample_length], dtype=np.float64)

    # NOTE(review): f0 / ap are trimmed by 20 frames per side while
    # converted_sp is trimmed by 40, and sp is passed on untrimmed --
    # confirm the intended alignment before relying on the output.
    f0 = np.ascontiguousarray(f0[20:-20], dtype=np.float64)
    ap = np.ascontiguousarray(ap[20:-20, :], dtype=np.float64)
    converted_sp = np.ascontiguousarray(converted_sp[40:-40, :],
                                        dtype=np.float64)

    # torch.argmax works regardless of the tensor's device; int() gives a
    # plain integer that can be formatted into the file name.
    target = int(torch.argmax(one_hot_emo))

    # NOTE(review): labels[1] is used as the source tag here -- confirm it
    # is the intended label index.
    out_name = filename[:-4] + str(labels[1]) + "to" + str(target) + ".wav"

    audio_utils.save_world_wav([f0, ap, sp, converted_sp], out_name)
def generate_f0_stats(filenames, data_dir, num_emotions=4, num_speakers=10):
    """Generate absolute and relative f0 dictionaries and pickle them.

    Absolute stats: for every (emotion, speaker) pair, the saved f0 arrays
    under ``<data_dir>/f0`` of all matching files are passed to
    ``get_f0_stats``; the returned pair is stored as
    ``emo_stats[emotion][speaker]`` and the dictionary is written to
    ``f0_dict.pkl`` in the current working directory.

    Relative stats: for every ordered emotion pair ``(e1, e2)``, the
    per-speaker differences ``stats[e2] - stats[e1]`` of both values are
    averaged over speakers and stored as ``emo2emo_dict[e1][e2]``, which
    is written to ``f0_relative_dict.pkl``.

    Args:
        filenames: Iterable of audio file names, each passed to
            ``get_wav_and_labels`` together with ``data_dir``.
        data_dir: Dataset root containing the ``f0`` directory.
        num_emotions: Number of emotion classes (labels ``0..n-1``).
        num_speakers: Number of speakers (labels ``0..n-1``).
    """
    f0_dir = os.path.join(data_dir, 'f0')

    # Read each file's labels exactly once and group the f0 paths by
    # (emotion, speaker). The original re-read every audio file once per
    # (emotion, speaker) pair.
    f0_files_by_label = {}
    for f in filenames:
        _, labels = get_wav_and_labels(f, data_dir)
        labels = np.array(labels)
        f0_file = os.path.join(f0_dir, f[:-4] + ".npy")
        f0_files_by_label.setdefault((labels[0], labels[1]), []).append(
            f0_file)

    # CALCULATE ABSOLUTE F0 STATS
    emo_stats = {}
    for e in range(num_emotions):
        spk_dict = {}
        for s in range(num_speakers):
            # Samples without a saved f0 file are silently left out.
            f0s = [
                np.load(f0_file)
                for f0_file in f0_files_by_label.get((e, s), [])
                if os.path.exists(f0_file)
            ]

            log_f0_mean, f0_std = get_f0_stats(f0s)
            spk_dict[s] = (log_f0_mean, f0_std)
            print(f"Done emotion {e}, speaker {s}.")
        emo_stats[e] = spk_dict

    with open('f0_dict.pkl', 'wb') as absolute_file:
        pickle.dump(emo_stats, absolute_file, pickle.HIGHEST_PROTOCOL)

    print(" ---- Absolute f0 stats completed ----")

    for tag, val in emo_stats.items():
        print(f'Emotion {tag} stats:')
        for tag2, val2 in val.items():
            print(f'{tag2} = {val2[0]}, {val2[1]}')

    # CALCULATE RELATIVE F0 STATS
    emo2emo_dict = {}
    for e1 in range(num_emotions):
        emo2emo_dict[e1] = {}
        for e2 in range(num_emotions):
            mean_list = [
                emo_stats[e2][s][0] - emo_stats[e1][s][0]
                for s in range(num_speakers)
            ]
            std_list = [
                emo_stats[e2][s][1] - emo_stats[e1][s][1]
                for s in range(num_speakers)
            ]
            emo2emo_dict[e1][e2] = (np.mean(mean_list), np.mean(std_list))

    print(" ---- Relative f0 stats completed ----")
    for tag, val in emo2emo_dict.items():
        print(f'Emotion {tag} stats:')
        for tag2, val2 in val.items():
            print(f'{tag2} = {val2[0]}, {val2[1]}')

    with open('f0_relative_dict.pkl', 'wb') as relative_file:
        pickle.dump(emo2emo_dict, relative_file, pickle.HIGHEST_PROTOCOL)
filenames.append(f) print("Converting sample set.") else: data_dir = os.path.join(config['data']['dataset_dir'], "audio") print("Data directory = ", data_dir) files = find_files(data_dir, ext='.wav') label_dir = os.path.join(config['data']['dataset_dir'], 'labels') num_emos = config['model']['num_classes'] # filenames = [f + ".wav" for f in files] filenames = [ f for f in files if -1 < pp.get_wav_and_labels( f, config['data']['dataset_dir'])[1][0] < num_emos ] filenames = [ os.path.join(config['data']['dataset_dir'], f) for f in filenames ][:10] files = my_dataset.shuffle(files) train_test_split = config['data']['train_test_split'] split_index = int(len(files) * train_test_split) filenames = files[split_index:] print("Converting 10 random test set samples.") print(filenames) # for one_hot in emo_targets: # _single_conversion(filenames[0], model, one_hot)
#
# n, bins, patches = plt.hist(lengths, bins = 22)
# plt.xlabel('Sequence length')
# plt.ylabel('Count')
# plt.title(r'New histogram of sequence lengths for 4 emotional categories')
# plt.show()

############################################
#           Generate f0_dict               #
############################################
# Builds emo_stats[emotion][speaker] = (log_f0_mean, f0_std) from the saved
# f0 arrays and pickles it to f0_dict.pkl in the current working directory.

# Read each file's labels once and group the f0 paths by (emotion, speaker);
# the audio does not need to be re-read for every (emotion, speaker) pair.
f0_paths_by_label = {}
for f in filenames:
    _, labels = pp.get_wav_and_labels(f, data_dir)
    labels = np.array(labels)
    f0_path = data_dir + "/f0/" + f[:-4] + ".npy"
    f0_paths_by_label.setdefault((labels[0], labels[1]), []).append(f0_path)

emo_stats = {}
for e in range(4):
    spk_dict = {}
    for s in range(10):
        # np.load raises if a matching sample has no saved f0 file.
        f0s = [np.load(f0_path)
               for f0_path in f0_paths_by_label.get((e, s), [])]

        log_f0_mean, f0_std = get_f0_stats(f0s)
        spk_dict[s] = (log_f0_mean, f0_std)
        print(f"Done emotion {e}, speaker {s}.")
    emo_stats[e] = spk_dict

with open('f0_dict.pkl', 'wb') as f0_dict_file:
    pickle.dump(emo_stats, f0_dict_file, pickle.HIGHEST_PROTOCOL)