def run(model_path, dataset_json, batch_size=8, tag="best", out_file=None):
    """Evaluate a saved speech model on a dataset and print the CER.

    Args:
        model_path: directory containing the saved checkpoint to load.
        dataset_json: path to the dataset manifest consumed by the loader.
        batch_size: utterances per evaluation batch.
        tag: checkpoint tag passed to ``speech.load`` (e.g. "best").
        out_file: optional path; when given, one JSON object per example
            ({"prediction": ..., "label": ...}) is written per line.
    """
    use_cuda = torch.cuda.is_available()

    model, preproc = speech.load(model_path, tag=tag)
    ldr = loader.make_loader(dataset_json, preproc, batch_size)

    # .cuda()/.cpu() move parameters in place; an explicit if/else is clearer
    # than the original conditional expression used only for its side effect.
    if use_cuda:
        model.cuda()
    else:
        model.cpu()
    model.set_eval()

    results = eval_loop(model, ldr)
    # Decode integer label / prediction sequences back into text.
    results = [(preproc.decode(label), preproc.decode(pred))
               for label, pred in results]
    cer = speech.compute_cer(results)
    print("CER {:.3f}".format(cer))

    if out_file is not None:
        # One JSON record per line (JSONL-style output).
        with open(out_file, 'w') as fid:
            for label, pred in results:
                res = {'prediction' : pred,
                       'label' : label}
                json.dump(res, fid)
                fid.write("\n")
def run(config, use_cuda):
    """Train a CTC speech model according to *config*.

    Args:
        config: dict with "optimizer", "data", "model", "audio" and
            "save_path" sections.
        use_cuda: move the model to GPU when True.

    Side effects: saves checkpoints via ``speech.save`` and logs
    dev loss/CER to tensorboard (``tb``).
    """
    opt_cfg = config["optimizer"]
    data_cfg = config["data"]
    model_cfg = config["model"]
    aud_cfg = config['audio']
    batch_size = opt_cfg["batch_size"]

    load_pre = True
    if load_pre:
        # Todo: add code for checking if pretrained actually exists. If not, init model and rest
        model, _, preproc = speech.load("ctc_best", tag="best")
    else:
        preproc = loader.Preprocessor(data_cfg["train_set"], aud_cfg,
                                      start_and_end=data_cfg["start_and_end"])
        # Resolve the model class named in the config. getattr replaces the
        # original eval("models." + ...): same lookup, but it cannot execute
        # arbitrary code if the config string is attacker-controlled.
        model_class = getattr(models, model_cfg["class"])
        # define model
        model = model_class(preproc.input_dim, preproc.vocab_size, model_cfg)

    model = model.cuda() if use_cuda else model.cpu()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=opt_cfg["learning_rate"],
                                momentum=opt_cfg["momentum"])

    # Dataloader is a subclass of pytorch.utils.dataloader. Can iterate
    train_ldr = loader.make_loader(data_cfg["train_set"], preproc, batch_size)
    dev_ldr = loader.make_loader(data_cfg["dev_set"], preproc, batch_size)

    print("Epochs to train:", opt_cfg["epochs"])
    run_state = (0, 0)
    best_so_far = float("inf")
    for e in range(opt_cfg["epochs"]):
        start = time.time()
        # run_state threads iteration/averaging state between epochs.
        run_state = run_epoch(model, optimizer, train_ldr, *run_state)
        msg = "Epoch {} completed in {:.2f} (s)."
        print(msg.format(e, time.time() - start))

        # Evaluate every 10 epochs and on the final epoch.
        if (e % 10 == 0) or (e == (opt_cfg["epochs"] - 1)):
            dev_loss, dev_cer = eval_dev(model, dev_ldr, preproc)

            # Log for tensorboard
            tb.log_value("dev_loss", dev_loss, e)
            tb.log_value("dev_cer", dev_cer, e)

            speech.save(model, optimizer, preproc, config["save_path"])

            # Save the best model on the dev set
            if dev_cer < best_so_far:
                best_so_far = dev_cer
                speech.save(model, optimizer, preproc,
                            config["save_path"], tag="best")
def main(model_path: str, json_path: str, use_cuda: bool, log_name: str,
         use_augmentation: bool):
    """
    runs the eval_dev loop in train continually while saving relevant
    date to a log file
    """
    # Set up a dedicated debug-level logger backed by a file handler.
    logger = logging.getLogger("eval-dev_log")
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(log_name + ".log")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                          "%Y-%m-%d %H:%M:%S"))
    logger.addHandler(file_handler)

    # Load the best-tagged checkpoint and place it on the requested device.
    model, preproc = speech.load(model_path, tag="best")
    model.cuda() if use_cuda else model.cpu()
    print(f"spec_aug status:{preproc.spec_augment}")

    # Single-utterance batches for the dev loop.
    dev_ldr = loader.make_loader(json_path, preproc, batch_size=1)
    iterations = 500

    # Record the trial configuration once up front.
    logger.info("============= Trial info ============")
    for line in (f"model path: {model_path}",
                 f"json path: {json_path}",
                 f"use_augmentation: {use_augmentation}",
                 f"preproc: {preproc}",
                 f"model: {model}"):
        logger.info(line)

    # Repeatedly evaluate on the dev set, logging each pass.
    for i in range(iterations):
        logger.info(f"\n=================================================\n")
        logger.info(f"Iteration: {i}")
        loss, cer = eval_dev(model, dev_ldr, preproc, logger,
                             use_augmentation)
def stego_spectro(config, audio_pth='tests/test1.wav', target_text=('aa', 'jh')):
    """Optimize an additive spectrogram perturbation so the frozen CTC model
    decodes *target_text* from the perturbed input.

    Args:
        config: full config dict; only config['audio'] is used here.
        audio_pth: path to the source .wav file.
        target_text: token sequence to force the model to output.

    Side effects: writes intermediate/best reconstructed audio via
    ``inverse_delta`` and dumps (losses, logs, edit_dists) to a pickle.
    """
    config = config['audio']
    hypo_path = audio_pth[:-4] + "_stego_hypo.wav"

    # load model
    model, _, preproc = speech.load("ctc_best", tag="best")
    # Freeze model parameters; only delta is optimized below.
    for parameter in model.parameters():
        parameter.requires_grad = False
    # NOTE(review): use_cuda is a module-level flag here — confirm it is
    # defined at import time.
    model = model.cuda() if use_cuda else model.cpu()

    # load audio file as a spectrogram
    orig, fs = array_from_wave(audio_pth)
    print(fs)
    assert config['fs'] == fs
    orig_spec, _ = preproc.preprocess(pth=audio_pth)
    print("orig shape:", orig.shape, "\norig_spec shape:", orig_spec.shape)

    # pass through model to get original text
    out = model.infer_recording(orig_spec.unsqueeze(0))[0]
    print("Decoded text in audio:", preproc.decode(out))

    delta = nn.Parameter(torch.zeros_like(orig_spec), requires_grad=True)
    print("delta shape:", delta.shape)
    target = torch.tensor(preproc.encode(target_text)).unsqueeze(0)
    target_list = target[0].tolist()
    print("Encoded target:", target[0])
    # FIX: store (iteration, cer) like every later append (and like
    # stego_audio); the list was previously heterogeneous.
    edit_dists = [(0, compute_cer([(target_list, out)]))]

    # parameters
    thresh = orig.max() / 3
    thresh_decay = 0.75
    check_every = 50
    num_iter = 30000

    # Optimizer
    optimizer = torch.optim.Adam([delta], lr=0.01)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                          gamma=0.995,
                                                          verbose=True)
    lr_step = 50

    logs, losses = [], []
    # FIX: the loop previously hard-coded range(1, 10000), silently ignoring
    # num_iter; stego_audio already iterates range(1, num_iter).
    for e in range(1, num_iter):
        to_feed = orig_spec + delta
        inp = to_feed.unsqueeze(0)
        batch = (inp, target)
        loss = model.loss(batch)

        if e % check_every == 0:
            cur_out = list(model.infer_batch(batch)[0][0])
            global start_time
            print(start_time)
            print("Time taken for", e, "iterations:", time.time() - start_time)
            print("Delta: {}".format(torch.abs(orig_spec - delta).sum()))
            d_max = delta.max().item()
            print(
                "Iteration: {}, Loss: {}, cur_out: {}, d_max: {}, thresh: {}".
                format(e, loss, cur_out, d_max, thresh))
            edit_dists.append((e, compute_cer([(target_list, cur_out)])))
            print(edit_dists)
            inverse_delta(config, to_feed, preproc,
                          name=audio_pth[:-4] + '_spectro_hypo_' + str(e))
            if cur_out == target[0].tolist():
                print("Got target output")
                audio_encrypted = inverse_delta(
                    config, to_feed, preproc,
                    name=audio_pth[:-4] + '_spectro_best_' + str(e))
                # Tighten the perturbation budget once the target decodes.
                thresh = thresh_decay * min(thresh, d_max)
                pesq_nb = pesq_score(orig.numpy(), audio_encrypted, fs, 'nb')
                pesq_wb = pesq_score(orig.numpy(), audio_encrypted, fs, 'wb')
                print('PESQ score: {} {}'.format(pesq_nb, pesq_wb))
                logs.append((e, pesq_nb, pesq_wb))
                # dump data
                pkl_path = audio_pth[:-4] + '_spectro_data.pkl'
                with open(pkl_path, 'wb') as f:
                    pickle.dump((losses, logs, edit_dists), f)

        loss.backward()
        losses.append(loss.item())
        optimizer.step()
        # FIX: zero the gradient each step; without this Adam accumulates
        # gradients across iterations (stego_audio already zeroes).
        optimizer.zero_grad()
        # Project delta into [-thresh, thresh] without detaching it from the
        # optimizer (in-place update under no_grad).
        with torch.no_grad():
            delta += delta.clamp_(min=-thresh, max=thresh) - delta
        if e % lr_step == 0:
            lr_scheduler.step()
def stego_audio(config_full, audio_pth, noise_snr, inp_target, is_text, dump_suffix=''):
    """Optimize an additive waveform perturbation (delta) so the frozen CTC
    model decodes *inp_target* from the perturbed audio.

    Args:
        config_full: full config dict; only config_full['audio'] is used.
        audio_pth: path to the source .wav file.
        noise_snr: when not None, Gaussian noise at this SNR (dB) is added
            to each forward pass to make the perturbation noise-robust.
        inp_target: text tokens (is_text=True) or an already-encoded
            integer sequence (is_text=False) to force as output.
        is_text: selects how inp_target is interpreted (see above).
        dump_suffix: suffix for the pickle dump filename.

    Returns:
        (losses, logs, edit_dists): per-iteration losses, PESQ logs for
        successful decodes, and (iteration, CER) history.
    """
    # config['audio'] contains hop, window and fs params
    if noise_snr is not None:
        dump_suffix += '_noise_' + str(noise_snr)
    config = config_full['audio']
    hypo_path = audio_pth[:-4] + '_hypo.wav'
    pkl_path = audio_pth[:-4] + '_{}.pkl'.format(dump_suffix)

    # load model
    model, _, preproc = speech.load("ctc_best", tag="best")
    # Freeze model params. Even if we don't, it doesnt matter since optimiser has only been passed delta as param
    for parameter in model.parameters():
        parameter.requires_grad = False
    # transfer to cuda
    # NOTE(review): use_cuda appears to be a module-level flag — confirm.
    model = model.cuda() if use_cuda else model.cpu()

    # load audio file as a spectrogram
    orig, fs = array_from_wave(audio_pth)
    assert config['fs'] == fs
    orig_spec, _ = preproc.preprocess(audio=orig)

    # pass through model to get original text
    out = model.infer_recording(orig_spec.unsqueeze(0))[0]
    print("Decoded text in audio: {}\nDecoded Sequence: {}".format(
        preproc.decode(out), out))

    # define delta
    delta = nn.Parameter(torch.zeros_like(orig), requires_grad=True).float()

    # Encode the target either from text or from a raw integer sequence.
    if is_text:
        target = torch.tensor(preproc.encode(inp_target)).unsqueeze(0)
        target_list = target[0].tolist()
    else:
        target = torch.tensor(inp_target).unsqueeze(0)
        target_list = list(inp_target)
    print("Target to encode:", target[0])
    # History of (iteration, CER-vs-target), seeded with the clean decode.
    edit_dists = [(0, compute_cer([(target_list, out)]))]

    # parameters
    thresh = orig.max() / 3          # initial per-sample perturbation bound
    thresh_decay = 0.75              # bound shrink factor on each success
    check_every = 50                 # decode/check interval (iterations)
    num_iter = 10000

    # RMS/3 ToDo: convert to SNR based calculation
    if noise_snr is not None:
        noise_std = (torch.mean(orig**2).item() / (10**(noise_snr / 10)))**0.5
        print("Noise std:", noise_std)

    # optimizer
    optimizer = torch.optim.Adam([delta], lr=0.01)
    optimizer.zero_grad()
    step_every = 5
    # if we are not adding noise, need to step at every tier because same computation if delta not updated
    if noise_snr is None:
        step_every = 1
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                          gamma=0.995,
                                                          verbose=True)
    lr_step = 50

    logs, losses = [], []
    for e in range(1, num_iter):
        # optimizer
        audio_to_feed = orig + delta
        # Optionally add fresh Gaussian noise so the attack survives noise.
        if noise_snr is not None:
            audio_to_feed += torch.normal(mean=0, std=noise_std,
                                          size=audio_to_feed.size())
        to_feed, _ = preproc.preprocess(audio=audio_to_feed)
        inp = to_feed.unsqueeze(0)
        batch = (inp, target)
        loss = model.loss(batch)

        # Periodically decode and check progress toward the target.
        if e % check_every == 0:
            cur_out = list(model.infer_batch(batch)[0][0])
            global start_time
            print(start_time)
            print("Time taken for", e, "iterations:", time.time() - start_time)
            print("Delta: {}".format(torch.abs(orig - delta).sum()))
            d_max = delta.max().item()
            print(
                "Iteration: {}, Loss: {}, cur_out: {}, d_max: {}, thresh: {}".
                format(e, loss, cur_out, d_max, thresh))
            to_write = orig + delta.clone().detach()
            wave_from_array(to_write, fs, hypo_path)
            # store edit distance
            edit_dists.append((e, compute_cer([(target_list, cur_out)])))
            if cur_out == target_list:
                print("Got target output")
                best_path = audio_pth[:-4] + '_' + '_'.join(
                    [str(e), dump_suffix]) + '.wav'
                wave_from_array(to_write, fs, best_path)
                # Tighten the perturbation bound after each success.
                thresh = thresh_decay * min(thresh, d_max)
                pesq_nb = pesq_score(orig.numpy(), to_write.numpy(), fs, 'nb')
                pesq_wb = pesq_score(orig.numpy(), to_write.numpy(), fs, 'wb')
                print('PESQ score: {} {}'.format(pesq_nb, pesq_wb))
                logs.append((e, pesq_nb, pesq_wb))
                # dump data
                with open(pkl_path, 'wb') as f:
                    pickle.dump((losses, logs, edit_dists), f)

        # Accumulate gradients; the optimizer only steps every step_every
        # iterations (gradient accumulation when noise is enabled).
        loss.backward()
        losses.append(loss.item())
        if e % step_every == 0:
            optimizer.step()
            optimizer.zero_grad()
        # don't know why this worked
        # (projects delta into [-thresh, thresh] in place without
        # detaching it from the optimizer)
        with torch.no_grad():
            delta += delta.clamp_(min=-thresh, max=thresh) - delta
        if e % lr_step == 0:
            lr_scheduler.step()
        # Periodic checkpoint of the experiment data.
        if e % 200 == 0:
            with open(pkl_path, 'wb') as f:
                pickle.dump((losses, logs, edit_dists), f)

    # dump data
    with open(pkl_path, 'wb') as f:
        pickle.dump((losses, logs, edit_dists), f)
    return losses, logs, edit_dists