def adv_ex(model, x_in, parameters, sampling_rate, target, eps, n_adv, sess,
           multi_model, attack):
    if parameters['feature_type'] == 'raw':
        hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                                sampling_rate)
        x, _ = torchaudio.load(x_in)
        num_frames = np.floor(x.shape[1] / hop_size_samples)
        x = x[:, :int(num_frames * hop_size_samples) - 1]
    else:
        x = fe.compute_features_with_context(x_in, **parameters)
        x = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2]), order='C')

    signal_length = x.shape[1]
    window_size_samples = tools.next_pow2_samples(parameters['window_size'],
                                                  sampling_rate)
    hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                            sampling_rate)
    num_frames = tools.get_num_frames(signal_length, window_size_samples,
                                      hop_size_samples) + 1

    # if the target length does not fit the signal length, drop one hop
    if target.shape[0] != num_frames:
        x = x[:, :-hop_size_samples]
        signal_length = x.shape[1]
        num_frames = tools.get_num_frames(signal_length, window_size_samples,
                                          hop_size_samples) + 1

    adv, single_advs = targeted(model, x.shape, sess, x, target, eps, n_adv,
                                attack, multi_model)  # x.cpu().numpy(),

    return adv, single_advs
def generator(x_dirs, y_dirs, hmm, sampling_rate, parameters):
    feats_list = []
    target_list = []
    length_feats = 0
    length_target = 0
    number_features = parameters['num_ceps'] * 3
    number_context = parameters['left_context'] + parameters['right_context'] + 1

    for i in range(len(x_dirs)):
        # get path to audio file and compute the feature matrix
        audio_file = x_dirs[i]
        feats = fe.compute_features_with_context(audio_file, **parameters)

        # get label
        target_dir = y_dirs[i]

        # calculate window size and hop size
        window_size_samples = tools.sec_to_samples(parameters['window_size'],
                                                   sampling_rate)
        window_size_samples = 2**tools.next_pow2(window_size_samples)
        hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                                sampling_rate)

        # calculate target
        target = tools.praat_file_to_target(target_dir, sampling_rate,
                                            window_size_samples,
                                            hop_size_samples, hmm)

        # append to lists with features and targets
        length_feats += len(feats)
        length_target += len(target)
        feats_list.append(feats)
        target_list.append(target)

    target_list = list(chain.from_iterable(target_list))
    feats_list = list(chain.from_iterable(feats_list))

    feats_list_new = np.reshape(np.array(feats_list),
                                newshape=(length_feats, number_features,
                                          number_context))
    target_list_new = np.reshape(np.array(target_list),
                                 newshape=(length_feats,
                                           hmm.get_num_states()))

    return feats_list_new, target_list_new
def return_start_words(praat_file, sampling_rate=16000, window_size=25e-3,
                       hop_size=12.5e-3):
    # :param praat_file: *.TextGrid file.
    window_size_samples = tools.sec_to_samples(window_size, sampling_rate)
    hop_size_samples = tools.sec_to_samples(hop_size, sampling_rate)

    intervals, min_time, max_time = tools.praat_to_word_Interval(praat_file)

    # parse intervals
    starts = []
    ends = []
    for interval in intervals:
        start_frame = tools.sec_to_frame(interval.start, sampling_rate,
                                         window_size_samples, hop_size_samples)
        end_frame = tools.sec_to_frame(interval.end, sampling_rate,
                                       window_size_samples, hop_size_samples)
        starts.append(start_frame)
        ends.append(end_frame)

    return starts, ends
def get_mel_filters(sampling_rate, window_size_sec, n_filters, f_min=0, f_max=8000):
    """
    Returns a mel filterbank for a given set of specifications.

    :param sampling_rate: sampling rate in Hz.
    :param window_size_sec: window size in seconds.
    :param n_filters: number of filters.
    :param f_min: minimum frequency covered by mel filterbank in Hz (default: 0).
    :param f_max: maximum frequency covered by mel filterbank in Hz (default: 8000).
    :return: m x d array representing the mel filterbank, where m is the number
        of FFT frequency bins and d is the number of mel filters.
    """
    # calculate max and min frequency in mel
    f_min_mel = tools.hz_to_mel(f_min)
    f_max_mel = tools.hz_to_mel(f_max)

    # create vector with equidistant frequency points for the filterbank in mel scale
    freq_points_mel = np.linspace(f_min_mel, f_max_mel, n_filters + 2)

    # transform it back to the Hertz scale
    freq_points_hz = tools.mel_to_hz(freq_points_mel)

    # calculate number of FFT frequency points
    fft_samples = int((2**tools.next_pow2(
        tools.sec_to_samples(window_size_sec, sampling_rate)) / 2) + 1)

    # find the corresponding FFT bin indices for the filter edges
    f = []
    for i in range(n_filters + 2):
        f.append(np.round(fft_samples * freq_points_hz[i] / f_max))

    # initialize filterbank matrix H
    H = np.zeros((fft_samples, n_filters))

    # calculate the triangular filters of the filterbank matrix H
    for m in range(1, n_filters + 1):
        for k in range(fft_samples):
            if k < f[m - 1]:
                H[k, m - 1] = 0
            elif f[m - 1] <= k < f[m]:
                H[k, m - 1] = (2 * (k - f[m - 1])) / ((f[m + 1] - f[m - 1]) *
                                                      (f[m] - f[m - 1]))
            elif f[m] <= k <= f[m + 1]:
                H[k, m - 1] = (2 * (f[m + 1] - k)) / ((f[m + 1] - f[m - 1]) *
                                                      (f[m + 1] - f[m]))
            elif k > f[m + 1]:
                H[k, m - 1] = 0

    return H
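# Illustrative usage sketch (not part of the original module): builds the mel
# filterbank above for a 16 kHz signal with the 25 ms window used elsewhere in
# this code base and projects a dummy power spectrum onto the mel scale. The
# random spectrum and the helper name `_demo_mel_filterbank` are assumptions
# added for illustration only.
def _demo_mel_filterbank():
    import numpy as np
    H = get_mel_filters(sampling_rate=16000, window_size_sec=25e-3, n_filters=24)
    # H has shape (n_fft_bins, n_filters); any power spectrum with a matching
    # number of frequency bins can be mapped to mel bands per frame.
    power_spec = np.abs(np.random.randn(10, H.shape[0]))**2  # 10 dummy frames
    mel_spec = power_spec @ H
    return mel_spec.shape  # (10, 24)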
def make_frames(audio_data, sampling_rate, window_size, hop_size):
    """
    Splits an audio signal into subsequent frames.

    :param audio_data: array representing the audio signal.
    :param sampling_rate: sampling rate in Hz.
    :param window_size: window size in seconds.
    :param hop_size: hop size (frame shift) in seconds.
    :return: n x m array of signal frames, where n is the number of frames
        and m is the window size in samples.
    """
    # transform window size in seconds to samples and round up to the next power of two
    window_size_samples = tools.sec_to_samples(window_size, sampling_rate)
    window_size_samples = 2**tools.next_pow2(window_size_samples)

    # assign hamming window
    hamming_window = np.hamming(window_size_samples)

    # transform hop size in seconds to samples
    hop_size_samples = tools.sec_to_samples(hop_size, sampling_rate)

    # get number of frames from function in tools.py
    n_frames = tools.get_num_frames(len(audio_data), window_size_samples,
                                    hop_size_samples)

    # initialize n x m matrix (n is number of frames, m is window size);
    # initializing with zeros implicitly zero-pads the last (partial) frame
    frames = np.zeros([n_frames, window_size_samples], dtype=float)

    # write frames into the matrix and apply the hamming window
    for i in range(n_frames):
        start = i * hop_size_samples
        end = i * hop_size_samples + window_size_samples
        frames[i, 0:len(audio_data[start:end])] = audio_data[start:end]
        frames[i, :] = frames[i, :] * hamming_window

    return frames
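# Illustrative usage sketch (not part of the original module): loads a wav file
# with torchaudio and frames it with the 25 ms / 12.5 ms setup used throughout
# this code base. The file path and the helper name `_demo_make_frames` are
# placeholder assumptions.
def _demo_make_frames(wav_path='example.wav'):
    import torchaudio
    x, sampling_rate = torchaudio.load(wav_path)
    audio_data = x[0].numpy()  # first channel as a 1-d array
    frames = make_frames(audio_data, sampling_rate,
                         window_size=25e-3, hop_size=12.5e-3)
    return frames.shape  # (n_frames, window_size_samples)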
def generator(model, hmm, x_dirs, y_dirs, sampling_rate, parameters,
              viterbi_training=False):
    """
    Creates feature-target pairs out of file lists for training.

    :param model: trained dnn model
    :param hmm: hmm class instance
    :param x_dirs: *.wav file list
    :param y_dirs: *.TextGrid file list
    :param sampling_rate: sampling frequency in hz
    :param parameters: parameters for feature extraction
    :param viterbi_training: flag for viterbi training
    :return: x, y: feature-target pair
    """
    # set random seed
    random.seed(42)

    # init A for viterbi training
    hmm.A_count = np.ceil(hmm.A)

    # same values for all utterances
    window_size_samples = tools.next_pow2_samples(parameters['window_size'],
                                                  sampling_rate)
    hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                            sampling_rate)

    # generator
    while True:
        x_dirs, y_dirs = tools.shuffle_list(x_dirs, y_dirs)
        for audio_file, target_dir in zip(x_dirs, y_dirs):
            # get features and target
            y = tools.praat_file_to_word_target(target_dir, sampling_rate,
                                                window_size_samples,
                                                hop_size_samples, hmm)
            x, _ = torchaudio.load(audio_file)

            # truncate to have the same number of frames as the targets
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples) - 1]

            yield x, y, target_dir
def __init__(self, feature_parameters, hmm, dropout=0.0,
             test_dropout_enabled=False):
    super(BaseModel, self).__init__()
    self.feature_parameters = feature_parameters
    self.hop_size_samples = tools.sec_to_samples(
        self.feature_parameters['hop_size'],
        self.feature_parameters['sampling_rate'])
    self.left_context = feature_parameters['left_context']
    self.right_context = feature_parameters['right_context']
    self.n_mfcc = feature_parameters['num_ceps']
    self.dropout = dropout
    self.hmm = hmm
    self.test_dropout_enabled = test_dropout_enabled

    # mfcc
    self.mfcc = torchaudio.transforms.MFCC(n_mfcc=self.n_mfcc)

    # delta and deltadeltas
    self.deltas = torchaudio.transforms.ComputeDeltas()
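# Sketch of how the transforms initialized above are commonly composed (the
# actual forward pass of BaseModel is not part of this excerpt, so this is an
# illustration under that assumption, not the model's implementation): MFCCs
# plus their first and second derivatives yield the num_ceps * 3 features used
# elsewhere in the code. The helper name `_demo_mfcc_with_deltas` is hypothetical.
def _demo_mfcc_with_deltas(waveform, n_mfcc=13):
    import torch
    import torchaudio
    mfcc = torchaudio.transforms.MFCC(n_mfcc=n_mfcc)(waveform)   # (1, n_mfcc, n_frames)
    deltas = torchaudio.transforms.ComputeDeltas()(mfcc)         # first derivative
    deltadeltas = torchaudio.transforms.ComputeDeltas()(deltas)  # second derivative
    return torch.cat([mfcc, deltas, deltadeltas], dim=1)         # (1, 3 * n_mfcc, n_frames)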
def preprocess_dataset(model_type, data_dir, feature_parameters, device='cuda'):
    """
    Creates two datasets:
        - plain is simply a pre-processed version of TIDIGITS
        - aligned replaces the targets Y with more precise targets
          (obtained via viterbi training)
    """

    def load_raw_data_dir(dataset_dir, device='cuda'):
        dataset_dir = dataset_dir.resolve()  # To resolve symlinks!

        # find raw data
        wav_files = [
            f for f in sorted(
                dataset_dir.joinpath('wav').resolve().glob('*.wav'))
        ]
        praat_files = [
            f for f in sorted(
                dataset_dir.joinpath('TextGrid').resolve().glob('*.TextGrid'))
        ]
        lab_files = [
            f for f in sorted(
                dataset_dir.joinpath('lab').resolve().glob('*.lab'))
        ]

        # load raw data
        X = []
        Y = []
        texts = []
        for wav_file, praat_file, lab_file in tqdm(
                zip(wav_files, praat_files, lab_files),
                total=len(wav_files),
                bar_format='    load raw {l_bar}{bar:30}{r_bar}'):

            # sanity check
            assert wav_file.stem == praat_file.stem == lab_file.stem

            ## load x
            x, _ = torchaudio.load(wav_file)
            # round to the next `full` frame
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples)].to(device)
            X.append(x)

            ## load y
            # optional: convert praats into jsons
            # dataset_dir.joinpath('align').mkdir(parents=True, exist_ok=True)
            # tg = tgio.openTextgrid(praat_file)
            # align_dict = tools.textgrid_to_dict(tg)
            # json_file = Path(str(praat_file).replace('TextGrid', 'align')).with_suffix('.json')
            # json_file.write_text(json.dumps(align_dict, indent=4))
            # y = tools.json_file_to_target(json_file, sampling_rate, window_size_samples, hop_size_samples, hmm)
            y = tools.praat_file_to_target(praat_file, sampling_rate,
                                           window_size_samples,
                                           hop_size_samples, hmm)
            y = torch.from_numpy(y).to(device)
            Y.append(y)

            ## load text
            text = lab_file.read_text().strip()
            texts.append(text)

        return wav_files, X, Y, texts

    # check if data dir exists
    raw_data_dir = Path(data_dir).joinpath('raw')
    assert raw_data_dir.is_dir()

    # data config
    sampling_rate = feature_parameters['sampling_rate']
    window_size_samples = tools.next_pow2_samples(
        feature_parameters['window_size'], sampling_rate)
    hop_size_samples = tools.sec_to_samples(feature_parameters['hop_size'],
                                            sampling_rate)

    # check if dataset is already pre-processed
    plain_out_dir = Path(data_dir).joinpath(model_type, 'plain')
    aligend_out_dir = Path(data_dir).joinpath(model_type, 'aligned')
    if plain_out_dir.joinpath('hmm.h5').is_file() and aligend_out_dir.joinpath(
            'hmm.h5').is_file():
        logging.info(f"[+] Dataset already pre-processed")
        return

    shutil.rmtree(plain_out_dir, ignore_errors=True)
    plain_out_dir.mkdir(parents=True)
    shutil.rmtree(aligend_out_dir, ignore_errors=True)
    aligend_out_dir.mkdir(parents=True)

    # Step 1: plain data
    # -> wavs are split into individual frames (the Xs)
    # -> each frame is mapped to the corresponding target state
    #    of the hmm (the Ys)
    #
    # As these targets always depend on a particular hmm,
    # we save the hmm alongside the data
    hmm = HMM.HMM('word')
    pickle.dump(hmm, plain_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process plain data
    dataset_names = [
        d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()
    ]
    for dataset_name in dataset_names:
        logging.info(f"[+] Pre-process {dataset_name}")
        wav_files, X, Y, texts = load_raw_data_dir(
            raw_data_dir.joinpath(dataset_name))

        ## dump plain
        X_out_dir = plain_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = plain_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = plain_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = plain_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)

        for wav_file, x, y, text in tqdm(
                zip(wav_files, X, Y, texts),
                total=len(wav_files),
                bar_format='    dump plain {l_bar}{bar:30}{r_bar}'):
            filename = wav_file.stem
            torch.save(y, Y_out_dir.joinpath(filename).with_suffix('.pt'))
            torch.save(x, X_out_dir.joinpath(filename).with_suffix('.pt'))
            text_out_dir.joinpath(filename).with_suffix('.txt').write_text(
                text)
            shutil.copyfile(wav_file,
                            wav_out_dir.joinpath(filename).with_suffix('.wav'))

    # Step 2: align data
    # -> for the plain data we only used relatively vague alignments between
    #    input frame and target
    # -> to improve this, we create a second dataset that uses an hmm
    #    trained with viterbi training to obtain more precise alignments

    # first we need to train the hmm with viterbi training
    dataset = Dataset(plain_out_dir.joinpath('TRAIN'), feature_parameters)
    model = init_model(model_type, feature_parameters, hmm)
    model.train_model(dataset, epochs=12, batch_size=32)
    model.train_model(dataset, epochs=1, batch_size=32, viterbi_training=True)
    model.hmm.A = hmm.modifyTransitions(model.hmm.A_count)
    model.train_model(dataset, epochs=2, batch_size=32, viterbi_training=True)

    # again, save the hmm alongside the data
    pickle.dump(hmm, aligend_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process aligned data
    dataset_names = [
        d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()
    ]
    for dataset_name in dataset_names:
        logging.info(f"[+] Pre-process {dataset_name}")
        # wav_files, X, Y, texts = load_raw_data_dir(raw_data_dir.joinpath(dataset_name), device=device)
        dst_path = plain_out_dir.joinpath(dataset_name)
        dataset = Dataset(dst_path, feature_parameters)

        ## dump aligned
        X_out_dir = aligend_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = aligend_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = aligend_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = aligend_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)

        with tqdm(total=len(wav_files),
                  bar_format='    dump aligned {l_bar}{bar:30}{r_bar}') as pbar:
            for X_batch, Y_batch, texts_batch, y_true_length, x_true_length, filenames in dataset.generator(
                    return_filename=True, batch_size=32, return_x_length=True):
                posteriors = model.features_to_posteriors(X_batch)
                Y_batch = hmm.viterbi_train(posteriors, y_true_length, Y_batch,
                                            texts_batch)
                for filename, x, y, y_length, x_length, text in zip(
                        filenames, X_batch, Y_batch, y_true_length,
                        x_true_length, texts_batch):
                    torch.save(y.clone()[:y_length],
                               Y_out_dir.joinpath(filename).with_suffix('.pt'))
                    torch.save(x.clone()[:x_length].unsqueeze(dim=0),
                               X_out_dir.joinpath(filename).with_suffix('.pt'))
                    text_out_dir.joinpath(filename).with_suffix(
                        '.txt').write_text(text)
                    shutil.copyfile(
                        dst_path.joinpath('wavs',
                                          filename).with_suffix('.wav'),
                        wav_out_dir.joinpath(filename).with_suffix('.wav'))
                    pbar.update(1)
def preprocess(data_dir, feature_parameters):

    def load_raw_data_dir(dataset_dir, device='cuda'):
        dataset_dir = dataset_dir.resolve()  # To resolve symlinks!

        # find raw data
        wav_files = [
            f for f in sorted(
                dataset_dir.joinpath('wav').resolve().glob('*.wav'))
        ]
        praat_files = [
            f for f in sorted(
                dataset_dir.joinpath('TextGrid').resolve().glob('*.TextGrid'))
        ]
        lab_files = [
            f for f in sorted(
                dataset_dir.joinpath('lab').resolve().glob('*.lab'))
        ]

        # load raw data
        X = []
        Y = []
        texts = []
        for wav_file, praat_file, lab_file in tqdm(
                zip(wav_files, praat_files, lab_files),
                total=len(wav_files),
                bar_format='    load raw {l_bar}{bar:30}{r_bar}'):

            # sanity check
            assert wav_file.stem == praat_file.stem == lab_file.stem, \
                f'{wav_file.stem} {praat_file.stem} {lab_file.stem}'

            ## load x
            x, _ = torchaudio.load(wav_file)
            # round to the next `full` frame
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples)].to(device)
            X.append(x)

            ## load y
            # optional: convert praats into jsons
            # dataset_dir.joinpath('align').mkdir(parents=True, exist_ok=True)
            # tg = tgio.openTextgrid(praat_file)
            # align_dict = tools.textgrid_to_dict(tg)
            # json_file = Path(str(praat_file).replace('TextGrid', 'align')).with_suffix('.json')
            # json_file.write_text(json.dumps(align_dict, indent=4))
            # y = tools.json_file_to_target(json_file, sampling_rate, window_size_samples, hop_size_samples, hmm)
            y = tools.praat_file_to_target(praat_file, sampling_rate,
                                           window_size_samples,
                                           hop_size_samples, hmm)
            y = torch.from_numpy(y).to(device)
            Y.append(y)

            ## load text
            text = lab_file.read_text().strip()
            texts.append(text)

        return wav_files, X, Y, texts

    raw_data_dir = Path(data_dir).joinpath('raw')
    assert raw_data_dir.is_dir()

    # data config
    sampling_rate = feature_parameters['sampling_rate']
    window_size_samples = tools.next_pow2_samples(
        feature_parameters['window_size'], sampling_rate)
    hop_size_samples = tools.sec_to_samples(feature_parameters['hop_size'],
                                            sampling_rate)

    plain_out_dir = Path(data_dir).joinpath('plain')
    plain_out_dir.mkdir()

    hmm = HMM.HMM('word')
    pickle.dump(hmm, plain_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process plain data
    dataset_names = [
        d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()
    ]
    for dataset_name in dataset_names:
        wav_files, X, Y, texts = load_raw_data_dir(
            raw_data_dir.joinpath(dataset_name))

        ## dump plain
        X_out_dir = plain_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = plain_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = plain_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = plain_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)

        for wav_file, x, y, text in tqdm(
                zip(wav_files, X, Y, texts),
                total=len(wav_files),
                bar_format='    dump plain {l_bar}{bar:30}{r_bar}'):
            filename = wav_file.stem
            torch.save(y, Y_out_dir.joinpath(filename).with_suffix('.pt'))
            torch.save(x, X_out_dir.joinpath(filename).with_suffix('.pt'))
            text_out_dir.joinpath(filename).with_suffix('.txt').write_text(
                text)
            shutil.copyfile(wav_file,
                            wav_out_dir.joinpath(filename).with_suffix('.wav'))
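# Illustrative usage sketch (not part of the original script): runs the plain
# pre-processing with the feature configuration used in the evaluation script
# below. The data directory and the fixed 16 kHz sampling rate are assumptions;
# the original code reads the rate via tools.get_sampling_rate instead.
def _demo_preprocess(data_dir='data'):
    feature_parameters = {
        'window_size': 25e-3,
        'hop_size': 12.5e-3,
        'feature_type': 'raw',
        'num_ceps': 13,
        'left_context': 4,
        'right_context': 4,
        'sampling_rate': 16000,
    }
    preprocess(data_dir, feature_parameters)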
params.data_dir = params.data_dir.joinpath(params.model_type)
assert params.model_type in params.attack_dir, \
    "It seems you are trying to evaluate " \
    "results generated for a different model type"
# assert params.model_type in str(params.data_dir), "You are using the wrong hmm (and aligned data)!"

feature_parameters = {
    'window_size': 25e-3,
    'hop_size': 12.5e-3,
    'feature_type': 'raw',
    'num_ceps': 13,
    'left_context': 4,
    'right_context': 4,
    'sampling_rate': tools.get_sampling_rate(params.data_dir.parent)
}
feature_parameters['hop_size_samples'] = tools.sec_to_samples(
    feature_parameters['hop_size'], feature_parameters['sampling_rate'])
feature_parameters['window_size_samples'] = tools.next_pow2_samples(
    feature_parameters['window_size'], feature_parameters['sampling_rate'])

tools.set_seed(params.seed)

attack_dir = Path(params.attack_dir)
assert os.path.exists(attack_dir)

if not attack_dir.joinpath('log.txt').is_file():
    assert len(list(attack_dir.iterdir())) == 1, \
        "more than one instance of attack exists!"
    attack_dir = list(attack_dir.iterdir())[0]

attack_step_dirs = [s for s in attack_dir.iterdir() if s.is_dir()]