import logging
import pickle
import random
import shutil
from pathlib import Path

import numpy as np
import torch
import torchaudio
from tqdm import tqdm

# project-local modules assumed from the surrounding repo:
# tools, fe (feature extraction), HMM, Dataset, init_model, targeted


def adv_ex(model, x_in, parameters, sampling_rate, target, eps, n_adv, sess,
           multi_model, attack):
    if parameters['feature_type'] == 'raw':
        hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                                sampling_rate)
        x, _ = torchaudio.load(x_in)
        num_frames = np.floor(x.shape[1] / hop_size_samples)
        x = x[:, :int(num_frames * hop_size_samples) - 1]
    else:
        x = fe.compute_features_with_context(x_in, **parameters)
        x = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2]), order='C')

    signal_length = x.shape[1]
    window_size_samples = tools.next_pow2_samples(parameters['window_size'],
                                                  sampling_rate)
    hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                            sampling_rate)
    num_frames = tools.get_num_frames(signal_length, window_size_samples,
                                      hop_size_samples) + 1

    # if the target length does not fit the signal length, drop one hop so
    # that the frame count matches the number of target states
    if target.shape[0] != num_frames:
        x = x[:, :-hop_size_samples]
        signal_length = x.shape[1]
        num_frames = tools.get_num_frames(signal_length, window_size_samples,
                                          hop_size_samples) + 1

    adv, single_advs = targeted(model, x.shape, sess, x, target, eps, n_adv,
                                attack, multi_model)
    return adv, single_advs
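# Hedged usage sketch for adv_ex: `model`, `sess`, the target file, and the
# 'fgsm' attack name are placeholders, not confirmed parts of this repo. The
# point is the calling convention: `target` holds one HMM state per frame,
# and its length must match the frame count derived from the trimmed signal.
#
#   parameters = {'feature_type': 'raw', 'window_size': 25e-3,
#                 'hop_size': 12.5e-3}
#   target = np.load('target_states.npy')  # hypothetical per-frame targets
#   adv, single_advs = adv_ex(model, 'sample.wav', parameters, 16000, target,
#                             eps=0.05, n_adv=10, sess=sess,
#                             multi_model=False, attack='fgsm')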
def generator(model, hmm, x_dirs, y_dirs, sampling_rate, parameters,
              viterbi_training=False):
    """
    Creates feature-target pairs out of file lists for training.

    :param model: trained dnn model
    :param hmm: hmm class instance
    :param x_dirs: *.wav file list
    :param y_dirs: *.TextGrid file list
    :param sampling_rate: sampling frequency in hz
    :param parameters: parameters for feature extraction
    :param viterbi_training: flag for viterbi training
    :return: yields (x, y, target_dir) tuples
    """
    # set random seed
    random.seed(42)

    # init A for viterbi training
    hmm.A_count = np.ceil(hmm.A)

    # same values for all utterances
    window_size_samples = tools.next_pow2_samples(parameters['window_size'],
                                                  sampling_rate)
    hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                            sampling_rate)

    # endless generator; the file lists are reshuffled after each full pass
    while True:
        x_dirs, y_dirs = tools.shuffle_list(x_dirs, y_dirs)
        for audio_file, target_dir in zip(x_dirs, y_dirs):
            # get features and target
            y = tools.praat_file_to_word_target(target_dir, sampling_rate,
                                                window_size_samples,
                                                hop_size_samples, hmm)
            x, _ = torchaudio.load(audio_file)

            # trim x so it has the same number of frames as the targets
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples) - 1]

            yield x, y, target_dir
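# Hedged consumption sketch: generator() cycles forever, so the training loop
# has to bound the iteration itself. `model`, `hmm`, the file lists, and
# `num_steps` are placeholders here.
#
#   gen = generator(model, hmm, x_dirs, y_dirs, sampling_rate=16000,
#                   parameters={'window_size': 25e-3, 'hop_size': 12.5e-3})
#   for _ in range(num_steps):
#       x, y, target_file = next(gen)
#       ...  # one training step on the pair (x, y)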
def preprocess_dataset(model_type, data_dir, feature_parameters, device='cuda'):
    """
    Creates two datasets:
    - plain is simply a pre-processed version of TIDIGITS
    - aligned replaces the targets Y with more precise targets
      (obtained via viterbi training)
    """

    def load_raw_data_dir(dataset_dir, device='cuda'):
        dataset_dir = dataset_dir.resolve()  # to resolve symlinks!

        # find raw data
        wav_files = sorted(dataset_dir.joinpath('wav').resolve().glob('*.wav'))
        praat_files = sorted(
            dataset_dir.joinpath('TextGrid').resolve().glob('*.TextGrid'))
        lab_files = sorted(dataset_dir.joinpath('lab').resolve().glob('*.lab'))

        # load raw data
        X = []
        Y = []
        texts = []
        for wav_file, praat_file, lab_file in tqdm(
                zip(wav_files, praat_files, lab_files),
                total=len(wav_files),
                bar_format='    load raw {l_bar}{bar:30}{r_bar}'):
            # sanity check: all three files must belong to the same utterance
            assert wav_file.stem == praat_file.stem == lab_file.stem

            ## load x
            x, _ = torchaudio.load(wav_file)
            # round down to the last full frame
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples)].to(device)
            X.append(x)

            ## load y
            # optional: convert praat files into jsons
            # dataset_dir.joinpath('align').mkdir(parents=True, exist_ok=True)
            # tg = tgio.openTextgrid(praat_file)
            # align_dict = tools.textgrid_to_dict(tg)
            # json_file = Path(str(praat_file).replace('TextGrid', 'align')).with_suffix('.json')
            # json_file.write_text(json.dumps(align_dict, indent=4))
            # y = tools.json_file_to_target(json_file, sampling_rate, window_size_samples, hop_size_samples, hmm)
            y = tools.praat_file_to_target(praat_file, sampling_rate,
                                           window_size_samples,
                                           hop_size_samples, hmm)
            y = torch.from_numpy(y).to(device)
            Y.append(y)

            ## load text
            text = lab_file.read_text().strip()
            texts.append(text)

        return wav_files, X, Y, texts

    # check if data dir exists
    raw_data_dir = Path(data_dir).joinpath('raw')
    assert raw_data_dir.is_dir()

    # data config
    sampling_rate = feature_parameters['sampling_rate']
    window_size_samples = tools.next_pow2_samples(
        feature_parameters['window_size'], sampling_rate)
    hop_size_samples = tools.sec_to_samples(feature_parameters['hop_size'],
                                            sampling_rate)

    # check if the dataset is already pre-processed
    plain_out_dir = Path(data_dir).joinpath(model_type, 'plain')
    aligned_out_dir = Path(data_dir).joinpath(model_type, 'aligned')
    if plain_out_dir.joinpath('hmm.h5').is_file() \
            and aligned_out_dir.joinpath('hmm.h5').is_file():
        logging.info("[+] Dataset already pre-processed")
        return
    shutil.rmtree(plain_out_dir, ignore_errors=True)
    plain_out_dir.mkdir(parents=True)
    shutil.rmtree(aligned_out_dir, ignore_errors=True)
    aligned_out_dir.mkdir(parents=True)

    # Step 1: plain data
    # -> wavs are split into individual frames (the Xs)
    # -> each frame is mapped to the corresponding target state
    #    of the hmm (the Ys)
    #
    # As these targets always depend on a particular hmm,
    # we save the hmm alongside the data.
    hmm = HMM.HMM('word')
    pickle.dump(hmm, plain_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process plain data
    dataset_names = [d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()]
    for dataset_name in dataset_names:
        logging.info(f"[+] Pre-process {dataset_name}")
        wav_files, X, Y, texts = load_raw_data_dir(
            raw_data_dir.joinpath(dataset_name))

        ## dump plain
        X_out_dir = plain_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = plain_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = plain_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = plain_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)
        for wav_file, x, y, text in tqdm(
                zip(wav_files, X, Y, texts),
                total=len(wav_files),
                bar_format='    dump plain {l_bar}{bar:30}{r_bar}'):
            filename = wav_file.stem
            torch.save(y, Y_out_dir.joinpath(filename).with_suffix('.pt'))
            torch.save(x, X_out_dir.joinpath(filename).with_suffix('.pt'))
            text_out_dir.joinpath(filename).with_suffix('.txt').write_text(text)
            shutil.copyfile(wav_file,
                            wav_out_dir.joinpath(filename).with_suffix('.wav'))

    # Step 2: aligned data
    # -> for the plain data we only used relatively vague alignments between
    #    input frame and target
    # -> to improve this, we create a second dataset that uses an hmm
    #    trained via viterbi training to obtain more precise alignments

    # first we need to train the hmm with viterbi training
    dataset = Dataset(plain_out_dir.joinpath('TRAIN'), feature_parameters)
    model = init_model(model_type, feature_parameters, hmm)
    model.train_model(dataset, epochs=12, batch_size=32)
    model.train_model(dataset, epochs=1, batch_size=32, viterbi_training=True)
    model.hmm.A = hmm.modifyTransitions(model.hmm.A_count)
    model.train_model(dataset, epochs=2, batch_size=32, viterbi_training=True)

    # again, save the hmm alongside the data
    pickle.dump(hmm, aligned_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process aligned data
    dataset_names = [d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()]
    for dataset_name in dataset_names:
        logging.info(f"[+] Pre-process {dataset_name}")
        dst_path = plain_out_dir.joinpath(dataset_name)
        dataset = Dataset(dst_path, feature_parameters)

        # number of utterances in this dataset (drives the progress bar)
        num_files = len(list(dst_path.joinpath('wavs').glob('*.wav')))

        ## dump aligned
        X_out_dir = aligned_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = aligned_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = aligned_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = aligned_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)
        with tqdm(total=num_files,
                  bar_format='    dump aligned {l_bar}{bar:30}{r_bar}') as pbar:
            for X_batch, Y_batch, texts_batch, y_true_length, x_true_length, \
                    filenames in dataset.generator(return_filename=True,
                                                   batch_size=32,
                                                   return_x_length=True):
                # re-align the targets with the viterbi-trained hmm
                posteriors = model.features_to_posteriors(X_batch)
                Y_batch = hmm.viterbi_train(posteriors, y_true_length, Y_batch,
                                            texts_batch)
                for filename, x, y, y_length, x_length, text in zip(
                        filenames, X_batch, Y_batch, y_true_length,
                        x_true_length, texts_batch):
                    torch.save(y.clone()[:y_length],
                               Y_out_dir.joinpath(filename).with_suffix('.pt'))
                    torch.save(x.clone()[:x_length].unsqueeze(dim=0),
                               X_out_dir.joinpath(filename).with_suffix('.pt'))
                    text_out_dir.joinpath(filename).with_suffix(
                        '.txt').write_text(text)
                    shutil.copyfile(
                        dst_path.joinpath('wavs', filename).with_suffix('.wav'),
                        wav_out_dir.joinpath(filename).with_suffix('.wav'))
                    pbar.update(1)
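# Hedged usage sketch for preprocess_dataset. The directory layout below is
# derived from the code above; the model_type 'DNN' and the 'data' root are
# placeholders, not confirmed names.
#
#   feature_parameters = {'window_size': 25e-3, 'hop_size': 12.5e-3,
#                         'sampling_rate': 16000}
#   preprocess_dataset('DNN', 'data', feature_parameters, device='cuda')
#
# resulting layout:
#   data/DNN/plain/<SET>/{X,Y,text,wavs}/   plus data/DNN/plain/hmm.h5
#   data/DNN/aligned/<SET>/{X,Y,text,wavs}/ plus data/DNN/aligned/hmm.h5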
def preprocess(data_dir, feature_parameters):

    def load_raw_data_dir(dataset_dir, device='cuda'):
        dataset_dir = dataset_dir.resolve()  # to resolve symlinks!

        # find raw data
        wav_files = sorted(dataset_dir.joinpath('wav').resolve().glob('*.wav'))
        praat_files = sorted(
            dataset_dir.joinpath('TextGrid').resolve().glob('*.TextGrid'))
        lab_files = sorted(dataset_dir.joinpath('lab').resolve().glob('*.lab'))

        # load raw data
        X = []
        Y = []
        texts = []
        for wav_file, praat_file, lab_file in tqdm(
                zip(wav_files, praat_files, lab_files),
                total=len(wav_files),
                bar_format='    load raw {l_bar}{bar:30}{r_bar}'):
            # sanity check: all three files must belong to the same utterance
            assert wav_file.stem == praat_file.stem == lab_file.stem, \
                f'{wav_file.stem} {praat_file.stem} {lab_file.stem}'

            ## load x
            x, _ = torchaudio.load(wav_file)
            # round down to the last full frame
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples)].to(device)
            X.append(x)

            ## load y
            y = tools.praat_file_to_target(praat_file, sampling_rate,
                                           window_size_samples,
                                           hop_size_samples, hmm)
            y = torch.from_numpy(y).to(device)
            Y.append(y)

            ## load text
            text = lab_file.read_text().strip()
            texts.append(text)

        return wav_files, X, Y, texts

    # check if data dir exists
    raw_data_dir = Path(data_dir).joinpath('raw')
    assert raw_data_dir.is_dir()

    # data config
    sampling_rate = feature_parameters['sampling_rate']
    window_size_samples = tools.next_pow2_samples(
        feature_parameters['window_size'], sampling_rate)
    hop_size_samples = tools.sec_to_samples(feature_parameters['hop_size'],
                                            sampling_rate)

    plain_out_dir = Path(data_dir).joinpath('plain')
    plain_out_dir.mkdir()

    # the targets depend on this particular hmm, so save it alongside the data
    hmm = HMM.HMM('word')
    pickle.dump(hmm, plain_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process plain data
    dataset_names = [d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()]
    for dataset_name in dataset_names:
        wav_files, X, Y, texts = load_raw_data_dir(
            raw_data_dir.joinpath(dataset_name))

        ## dump plain
        X_out_dir = plain_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = plain_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = plain_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = plain_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)
        for wav_file, x, y, text in tqdm(
                zip(wav_files, X, Y, texts),
                total=len(wav_files),
                bar_format='    dump plain {l_bar}{bar:30}{r_bar}'):
            filename = wav_file.stem
            torch.save(y, Y_out_dir.joinpath(filename).with_suffix('.pt'))
            torch.save(x, X_out_dir.joinpath(filename).with_suffix('.pt'))
            text_out_dir.joinpath(filename).with_suffix('.txt').write_text(text)
            shutil.copyfile(wav_file,
                            wav_out_dir.joinpath(filename).with_suffix('.wav'))
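# Hedged round-trip sketch for the plain dump above (paths and the utterance
# name are placeholders): the saved tensors and the pickled hmm restore as-is.
#
#   hmm = pickle.load(Path('data/plain/hmm.h5').open('rb'))
#   x = torch.load('data/plain/TRAIN/X/<utterance>.pt')
#   y = torch.load('data/plain/TRAIN/Y/<utterance>.pt')
#   text = Path('data/plain/TRAIN/text/<utterance>.txt').read_text()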
assert params.model_type in params.attack_dir, \
    "It seems you are trying to evaluate results generated for a different " \
    "model type"
# assert params.model_type in str(params.data_dir), "You are using the wrong hmm (and aligned data)!"

feature_parameters = {
    'window_size': 25e-3,
    'hop_size': 12.5e-3,
    'feature_type': 'raw',
    'num_ceps': 13,
    'left_context': 4,
    'right_context': 4,
    'sampling_rate': tools.get_sampling_rate(params.data_dir.parent)
}
feature_parameters['hop_size_samples'] = tools.sec_to_samples(
    feature_parameters['hop_size'], feature_parameters['sampling_rate'])
feature_parameters['window_size_samples'] = tools.next_pow2_samples(
    feature_parameters['window_size'], feature_parameters['sampling_rate'])

tools.set_seed(params.seed)

attack_dir = Path(params.attack_dir)
assert attack_dir.exists()
# if the attack dir itself contains no log, it must hold exactly one
# attack instance; descend into it
if not attack_dir.joinpath('log.txt').is_file():
    assert len(list(attack_dir.iterdir())) == 1, \
        "more than one instance of the attack exists!"
    attack_dir = list(attack_dir.iterdir())[0]

# attack steps are directories named by step number; sort them numerically
# (lexicographic order would put '10' before '2') and pick the last one
attack_step_dirs = [s for s in attack_dir.iterdir() if s.is_dir()]
attack_step_dirs = sorted(attack_step_dirs, key=lambda s: int(s.name))
attack_last_step_dir = attack_step_dirs[-1]
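# Hedged continuation sketch: with attack_last_step_dir resolved, a typical
# next step is to load the adversarial wavs from the final attack step. Only
# torchaudio.load is confirmed by the code above; the decoding step is a
# placeholder for this repo's evaluation routine.
#
#   for wav_file in sorted(attack_last_step_dir.glob('**/*.wav')):
#       x, sr = torchaudio.load(wav_file)
#       ...  # decode x with the model/hmm to check the attack's success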