Example #1

import json

import torch

# speech and loader are project modules; eval_loop is assumed to be
# defined alongside this function in the same script
import speech
import loader
def run(model_path, dataset_json,
        batch_size=8, tag="best",
        out_file=None):

    use_cuda = torch.cuda.is_available()

    model, preproc = speech.load(model_path, tag=tag)
    ldr = loader.make_loader(dataset_json,
            preproc, batch_size)

    model = model.cuda() if use_cuda else model.cpu()
    model.set_eval()

    results = eval_loop(model, ldr)
    results = [(preproc.decode(label), preproc.decode(pred))
               for label, pred in results]
    cer = speech.compute_cer(results)
    print("CER {:.3f}".format(cer))

    if out_file is not None:
        with open(out_file, 'w') as fid:
            for label, pred in results:
                res = {'prediction': pred,
                       'label': label}
                json.dump(res, fid)
                fid.write("\n")
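A minimal sketch of how this evaluation entry point might be exposed on the command line. The flag names simply mirror the signature of run above; the argparse wiring itself is an illustration, not part of the original script.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Evaluate a saved speech model on a dataset.")
    parser.add_argument("model_path",
                        help="directory containing the saved model")
    parser.add_argument("dataset_json",
                        help="path to the evaluation dataset JSON")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--tag", default="best",
                        help="checkpoint tag to load, e.g. 'best'")
    parser.add_argument("--out_file", default=None,
                        help="optional JSONL file for per-utterance results")
    args = parser.parse_args()
    run(args.model_path, args.dataset_json,
        batch_size=args.batch_size, tag=args.tag, out_file=args.out_file)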
Example #2

import time

import torch
import tensorboard_logger as tb  # provides tb.log_value(name, value, step)

# project modules; eval_dev and run_epoch are assumed to be defined
# alongside this function in the training script
import speech
import loader
import models
def run(config, use_cuda):
    opt_cfg = config["optimizer"]
    data_cfg = config["data"]
    model_cfg = config["model"]
    aud_cfg = config['audio']
    batch_size = opt_cfg["batch_size"]

    load_pre = True

    if load_pre:
        # TODO: check whether the pretrained checkpoint actually exists;
        # if not, fall back to initializing a fresh model below
        model, _, preproc = speech.load("ctc_best", tag="best")
    else:
        preproc = loader.Preprocessor(data_cfg["train_set"], aud_cfg, start_and_end=data_cfg["start_and_end"])
        # resolve the class-name string to the class object, e.g. "CTC" ->
        # models.CTC (getattr(models, model_cfg["class"]) would be safer)
        model_class = eval("models." + model_cfg["class"])
        # define model
        model = model_class(preproc.input_dim, preproc.vocab_size, model_cfg)

    model = model.cuda() if use_cuda else model.cpu()
    optimizer = torch.optim.SGD(model.parameters(), lr=opt_cfg["learning_rate"],
                                momentum=opt_cfg["momentum"])
    # loader.make_loader returns a torch.utils.data.DataLoader subclass,
    # so it can be iterated over directly
    train_ldr = loader.make_loader(data_cfg["train_set"], preproc, batch_size)
    dev_ldr = loader.make_loader(data_cfg["dev_set"], preproc, batch_size)

    print("Epochs to train:", opt_cfg["epochs"])
    run_state = (0, 0)
    best_so_far = float("inf")
    for e in range(opt_cfg["epochs"]):
        start = time.time()

        run_state = run_epoch(model, optimizer, train_ldr, *run_state)

        msg = "Epoch {} completed in {:.2f} (s)."
        print(msg.format(e, time.time() - start))
        if (e % 10 == 0) or (e == (opt_cfg["epochs"] - 1)):
            dev_loss, dev_cer = eval_dev(model, dev_ldr, preproc)

            # Log for tensorboard
            tb.log_value("dev_loss", dev_loss, e)
            tb.log_value("dev_cer", dev_cer, e)

            # Save the best model on the dev set; only epochs that were
            # actually evaluated can update the best checkpoint
            if dev_cer < best_so_far:
                best_so_far = dev_cer
                speech.save(model, optimizer, preproc,
                            config["save_path"], tag="best")

        speech.save(model, optimizer, preproc, config["save_path"])
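For orientation, here is a hypothetical config that supplies every key this training loop reads. The section and key names are taken from the code above; the concrete values and file paths are illustrative only.

config = {
    "optimizer": {
        "batch_size": 8,
        "learning_rate": 1e-3,
        "momentum": 0.9,
        "epochs": 100,
    },
    "data": {
        "train_set": "data/train.json",
        "dev_set": "data/dev.json",
        "start_and_end": True,   # forwarded to loader.Preprocessor
    },
    "model": {
        "class": "CTC",          # resolved against the models module
    },
    "audio": {},                 # audio params consumed by loader.Preprocessor
    "save_path": "ckpts/ctc",
}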
Example #3

import logging

# speech and loader are project modules; per the docstring, eval_dev is
# assumed to come from the training code
import speech
import loader
def main(model_path: str, json_path: str, use_cuda: bool, log_name: str,
         use_augmentation: bool):
    """
    runs the eval_dev loop in train continually while saving
    relevant date to a log file
    """

    # create logger
    logger = logging.getLogger("eval-dev_log")
    logger.setLevel(logging.DEBUG)
    # create file handler which logs even debug messages
    fh = logging.FileHandler(log_name + ".log")
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                                  "%Y-%m-%d %H:%M:%S")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # load model and preprocessor
    model, preproc = speech.load(model_path, tag="best")
    model = model.cuda() if use_cuda else model.cpu()
    print(f"spec_aug status: {preproc.spec_augment}")
    # creating loader
    dev_ldr = loader.make_loader(json_path, preproc, batch_size=1)

    iterations = 500

    logger.info("============= Trial info ============")
    logger.info(f"model path: {model_path}")
    logger.info(f"json path: {json_path}")
    logger.info(f"use_augmentation: {use_augmentation}")
    logger.info(f"preproc: {preproc}")
    logger.info(f"model: {model}")

    for i in range(iterations):
        logger.info("\n=================================================\n")
        logger.info(f"Iteration: {i}")

        loss, cer = eval_dev(model, dev_ldr, preproc, logger, use_augmentation)
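The logger above only writes to a file. A small sketch, using only the standard logging API, of the same setup factored into a helper that also mirrors messages to the console; the helper's name is ours, not the repo's.

import logging

def make_logger(log_name: str) -> logging.Logger:
    logger = logging.getLogger("eval-dev_log")
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                                  "%Y-%m-%d %H:%M:%S")
    # file handler, as in main above
    fh = logging.FileHandler(log_name + ".log")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console handler for interactive runs
    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    return logger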
Example #4

import time
import pickle

import torch
import torch.nn as nn

# speech is a project package; array_from_wave, inverse_delta, compute_cer,
# pesq_score, and the globals use_cuda / start_time are assumed to come from
# this repo's helper modules
import speech
def stego_spectro(config,
                  audio_pth='tests/test1.wav',
                  target_text=('aa', 'jh')):
    config = config['audio']
    hypo_path = audio_pth[:-4] + "_stego_hypo.wav"
    # load model
    model, _, preproc = speech.load("ctc_best", tag="best")
    # Freeze model parameters
    for parameter in model.parameters():
        parameter.requires_grad = False
    model = model.cuda() if use_cuda else model.cpu()
    # load audio file as a spectrogram
    orig, fs = array_from_wave(audio_pth)
    print("Sample rate:", fs)
    assert config['fs'] == fs
    orig_spec, _ = preproc.preprocess(pth=audio_pth)
    print("orig shape:", orig.shape, "\norig_spec shape:", orig_spec.shape)
    # pass through model to get original text
    out = model.infer_recording(orig_spec.unsqueeze(0))[0]
    print("Decoded text in audio:", preproc.decode(out))

    delta = nn.Parameter(torch.zeros_like(orig_spec), requires_grad=True)
    print("delta shape:", delta.shape)
    target = torch.tensor(preproc.encode(target_text)).unsqueeze(0)
    target_list = target[0].tolist()
    print("Encoded target:", target[0])
    edit_dists = [(0, compute_cer([(target_list, out)]))]
    # parameters
    thresh = orig.max() / 3
    thresh_decay = 0.75
    check_every = 50
    num_iter = 30000
    # Optimizer
    optimizer = torch.optim.Adam([delta], lr=0.01)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                          gamma=0.995,
                                                          verbose=True)
    lr_step = 50
    logs, losses = [], []

    for e in range(1, num_iter):
        to_feed = orig_spec + delta
        inp = to_feed.unsqueeze(0)
        batch = (inp, target)
        loss = model.loss(batch)

        if e % check_every == 0:
            cur_out = list(model.infer_batch(batch)[0][0])
            global start_time  # module-level timestamp recorded at startup
            print("Time taken for", e, "iterations:", time.time() - start_time)
            print("Delta: {}".format(torch.abs(orig_spec - delta).sum()))
            d_max = delta.max().item()
            # delta = torch.clamp(delta, max=d_max * 0.9).detach()
            print(
                "Iteration: {}, Loss: {}, cur_out: {}, d_max: {}, thresh: {}".
                format(e, loss, cur_out, d_max, thresh))
            edit_dists.append((e, compute_cer([(target_list, cur_out)])))
            print(edit_dists)
            inverse_delta(config,
                          to_feed,
                          preproc,
                          name=audio_pth[:-4] + '_spectro_hypo_' + str(e))
            # to_write = orig_spec + delta.clone().detach()
            # transpose to convert it from time x freq to freq x time
            # to_feed = to_feed.detach().squeeze().T
            if cur_out == target_list:
                print("Got target output")
                audio_encrypted = inverse_delta(config,
                                                to_feed,
                                                preproc,
                                                name=audio_pth[:-4] +
                                                '_spectro_best_' + str(e))
                thresh = thresh_decay * min(thresh, d_max)
                pesq_nb = pesq_score(orig.numpy(), audio_encrypted, fs, 'nb')
                pesq_wb = pesq_score(orig.numpy(), audio_encrypted, fs, 'wb')
                print('PESQ score: {} {}'.format(pesq_nb, pesq_wb))
                logs.append((e, pesq_nb, pesq_wb))

                # dump data
                pkl_path = audio_pth[:-4] + '_spectro_data.pkl'
                with open(pkl_path, 'wb') as f:
                    pickle.dump((losses, logs, edit_dists), f)

            # if e % 100 == 0:
            #     print("Writing audio")
            #     inverse_delta(config, to_feed, preproc, name='spectro_hypo')

        loss.backward()
        losses.append(loss.item())
        optimizer.step()
        optimizer.zero_grad()  # clear grads so they don't accumulate across iterations
        # clamp_ edits delta in place and returns it, so "+= ... - delta" adds
        # zero; doing it under no_grad keeps the in-place update off the graph
        with torch.no_grad():
            delta += delta.clamp_(min=-thresh, max=thresh) - delta

        if e % lr_step == 0:
            lr_scheduler.step()
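The in-place clamp at the end of the loop is easy to misread, so here is a self-contained sketch of the same trick (variable names illustrative). clamp_ edits the tensor in place and returns it, so the surrounding "+= ... - delta" adds zero; running it under no_grad keeps delta a leaf Parameter that autograd still tracks on the next forward pass.

import torch
import torch.nn as nn

delta = nn.Parameter(torch.randn(4) * 10)
thresh = 1.0

loss = (delta ** 2).sum()
loss.backward()                 # gradients flow as usual

with torch.no_grad():
    # net effect is just the in-place clamp; under no_grad a bare
    # delta.clamp_(...) would do the same thing
    delta += delta.clamp_(min=-thresh, max=thresh) - delta

print(delta.is_leaf, delta.requires_grad)   # True True
print(bool((delta.abs() <= thresh).all()))  # True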
Example #5

import time
import pickle

import torch
import torch.nn as nn

# speech is a project package; array_from_wave, wave_from_array, compute_cer,
# pesq_score, and the globals use_cuda / start_time are assumed to come from
# this repo's helper modules
import speech
def stego_audio(config_full,
                audio_pth,
                noise_snr,
                inp_target,
                is_text,
                dump_suffix=''):
    # config['audio'] contains hop, window and fs params
    if noise_snr is not None:
        dump_suffix += '_noise_' + str(noise_snr)
    config = config_full['audio']
    hypo_path = audio_pth[:-4] + '_hypo.wav'
    pkl_path = audio_pth[:-4] + '_{}.pkl'.format(dump_suffix)
    # load model
    model, _, preproc = speech.load("ctc_best", tag="best")
    # Freeze model params. Even if we don't, it doesn't matter, since the
    # optimizer has only been given delta as a parameter
    for parameter in model.parameters():
        parameter.requires_grad = False
    # transfer to cuda
    model = model.cuda() if use_cuda else model.cpu()
    # load audio file as a spectrogram
    orig, fs = array_from_wave(audio_pth)
    assert config['fs'] == fs
    orig_spec, _ = preproc.preprocess(audio=orig)
    # pass through model to get original text
    out = model.infer_recording(orig_spec.unsqueeze(0))[0]
    print("Decoded text in audio: {}\nDecoded Sequence: {}".format(
        preproc.decode(out), out))
    # define delta
    delta = nn.Parameter(torch.zeros_like(orig), requires_grad=True).float()
    if is_text:
        target = torch.tensor(preproc.encode(inp_target)).unsqueeze(0)
        target_list = target[0].tolist()
    else:
        target = torch.tensor(inp_target).unsqueeze(0)
        target_list = list(inp_target)
    print("Target to encode:", target[0])

    edit_dists = [(0, compute_cer([(target_list, out)]))]
    # parameters
    thresh = orig.max() / 3
    thresh_decay = 0.75
    check_every = 50
    num_iter = 10000
    # derive the noise std from the requested SNR (in dB) relative to signal power
    if noise_snr is not None:
        noise_std = (torch.mean(orig**2).item() / (10**(noise_snr / 10)))**0.5
        print("Noise std:", noise_std)
    # optimizer
    optimizer = torch.optim.Adam([delta], lr=0.01)
    optimizer.zero_grad()
    step_every = 5
    # if we are not adding noise, step every iteration: without fresh noise each
    # forward pass is identical until delta changes, so accumulating is pointless
    if noise_snr is None:
        step_every = 1

    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                          gamma=0.995,
                                                          verbose=True)
    lr_step = 50
    logs, losses = [], []

    for e in range(1, num_iter):
        # build the perturbed (and optionally noised) input
        audio_to_feed = orig + delta
        if noise_snr is not None:
            audio_to_feed += torch.normal(mean=0,
                                          std=noise_std,
                                          size=audio_to_feed.size())
        to_feed, _ = preproc.preprocess(audio=audio_to_feed)
        inp = to_feed.unsqueeze(0)
        batch = (inp, target)

        loss = model.loss(batch)

        if e % check_every == 0:
            cur_out = list(model.infer_batch(batch)[0][0])
            global start_time  # module-level timestamp recorded at startup
            print("Time taken for", e, "iterations:", time.time() - start_time)
            print("Delta: {}".format(torch.abs(orig - delta).sum()))
            d_max = delta.max().item()
            print(
                "Iteration: {}, Loss: {}, cur_out: {}, d_max: {}, thresh: {}".
                format(e, loss, cur_out, d_max, thresh))
            to_write = orig + delta.clone().detach()
            wave_from_array(to_write, fs, hypo_path)
            # store edit distance
            edit_dists.append((e, compute_cer([(target_list, cur_out)])))

            if cur_out == target_list:
                print("Got target output")
                best_path = audio_pth[:-4] + '_' + '_'.join(
                    [str(e), dump_suffix]) + '.wav'
                wave_from_array(to_write, fs, best_path)
                thresh = thresh_decay * min(thresh, d_max)
                pesq_nb = pesq_score(orig.numpy(), to_write.numpy(), fs, 'nb')
                pesq_wb = pesq_score(orig.numpy(), to_write.numpy(), fs, 'wb')
                print('PESQ score: {} {}'.format(pesq_nb, pesq_wb))
                logs.append((e, pesq_nb, pesq_wb))

                # dump data
                with open(pkl_path, 'wb') as f:
                    pickle.dump((losses, logs, edit_dists), f)

        loss.backward()
        losses.append(loss.item())
        if e % step_every == 0:
            optimizer.step()
            optimizer.zero_grad()

        # clamp_ edits delta in place and returns it, so "+= ... - delta" adds
        # zero; doing it under no_grad keeps the in-place update off the graph
        with torch.no_grad():
            delta += delta.clamp_(min=-thresh, max=thresh) - delta

        if e % lr_step == 0:
            lr_scheduler.step()

        if e % 200 == 0:
            with open(pkl_path, 'wb') as f:
                pickle.dump((losses, logs, edit_dists), f)

    # dump data
    with open(pkl_path, 'wb') as f:
        pickle.dump((losses, logs, edit_dists), f)

    return losses, logs, edit_dists
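The noise line above follows the standard definition SNR_dB = 10 * log10(P_signal / P_noise), solved for the noise standard deviation. A quick numerical sanity check of that formula; the helper and tensor names here are illustrative.

import torch

def noise_std_for_snr(signal: torch.Tensor, snr_db: float) -> float:
    # P_noise = P_signal / 10**(SNR_dB / 10); std = sqrt(P_noise)
    return (torch.mean(signal ** 2).item() / (10 ** (snr_db / 10))) ** 0.5

orig = torch.randn(16000)            # stand-in for one second of audio
std = noise_std_for_snr(orig, snr_db=20.0)
noise = torch.normal(mean=0.0, std=std, size=orig.size())
measured = 10 * torch.log10(torch.mean(orig ** 2) / torch.mean(noise ** 2))
print("target SNR: 20.0 dB, measured: {:.1f} dB".format(measured.item()))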