Example #1
def decode_results(model, decoded_output, decoded_offsets):
    """Package the decoded transcriptions (and optional word offsets) with model and decoder metadata."""
    results = {
        "output": [],
        "_meta": {
            "acoustic_model": {
                "name": os.path.basename(args.model_path)
            },
            "language_model": {
                "name":
                os.path.basename(args.lm_path) if args.lm_path else None,
            },
            "decoder": {
                "lm": args.lm_path is not None,
                "alpha": args.alpha if args.lm_path is not None else None,
                "beta": args.beta if args.lm_path is not None else None,
                "type": args.decoder,
            }
        }
    }
    results['_meta']['acoustic_model'].update(WaveToLetter.get_meta(model))

    for b in range(len(decoded_output)):
        for pi in range(min(args.top_paths, len(decoded_output[b]))):
            result = {'transcription': decoded_output[b][pi]}
            if args.offsets:
                result['offsets'] = decoded_offsets[b][pi]
            results['output'].append(result)
    return results
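The dictionary built by decode_results is typically serialized straight to JSON. A minimal usage sketch, assuming probs and sizes come from a forward pass and that decoder.decode returns (transcripts, offsets) as in the later examples; the variable names here are illustrative:

import json

# Decode a batch of network outputs and dump the packaged results.
decoded_output, decoded_offsets = decoder.decode(probs, sizes)
results = decode_results(model, decoded_output, decoded_offsets)
print(json.dumps(results, indent=2))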
Example #2
        tensorboard_writer = SummaryWriter(args.log_dir)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Model Save directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    avg_loss, start_epoch, start_iter = 0, 0, 0
    if args.continue_from:  # Starting from previous model
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
        model = WaveToLetter.load_model_package(package)
        audio_conf = WaveToLetter.get_audio_conf(model)
        labels = WaveToLetter.get_labels(model)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                    momentum=args.momentum, nesterov=True)

        if args.noise_dir is not None:
            model = WaveToLetter.setAudioConfKey(model, 'noise_dir', args.noise_dir)
            model = WaveToLetter.setAudioConfKey(model, 'noise_prob', args.noise_prob)
            model = WaveToLetter.setAudioConfKey(model, 'noise_max', args.noise_max)
            model = WaveToLetter.setAudioConfKey(model, 'noise_min', args.noise_min)

        if not args.finetune:  # Don't want to restart training
            # if args.cuda:
            #     model.cuda()
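The snippet cuts off inside the `if not args.finetune:` branch. In resume-from-checkpoint code of this kind, that branch usually restores the optimizer state and the training counters from the loaded package; the sketch below shows that common pattern, with the key names ('optim_dict', 'epoch', 'iteration', 'avg_loss') being assumptions rather than values confirmed by this snippet:

if not args.finetune:  # not fine-tuning, so resume training where the checkpoint left off
    # Assumed checkpoint keys; adjust to the actual package layout.
    if package.get('optim_dict') is not None:
        optimizer.load_state_dict(package['optim_dict'])
    start_epoch = int(package.get('epoch', 1)) - 1  # checkpoint epochs are typically 1-indexed
    start_iter = package.get('iteration') or 0
    avg_loss = package.get('avg_loss', 0)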
Example #3
    def attack(self, iterations, target_path, lr, bandwidth):
        flag = 0
        model = WaveToLetter.load_model(args.model_path)
        model = model.to(device)
        model.eval()  # evaluation mode; this differs from stage 1
        signal = self.get_signal(self.audio_path)
        orig = self.get_signal(self.orig_path)

        index_max, index_min = self.attention(signal)
        start_attack_time = time.time()
        for i in range(iterations):
            print('Iteration:', str(i))
            # print(signal[index_max:index_max+20])
            # print(signal[index_max])

            # if args.printSilence:
            #     print()
            #     print(sigindex_maxnal.shape)
            #     print("20:", signal[index_max:index_max+20])

            mfcc = pytorch_mfcc.MFCC(samplerate=self.sample_rate,
                                     winlen=self.window_size,
                                     winstep=self.window_stride,
                                     numcep=13,
                                     nfilt=26,
                                     nfft=512,
                                     lowfreq=0,
                                     highfreq=None,
                                     preemph=0,
                                     ceplifter=22,
                                     appendEnergy=False).cuda()
            mfccs = mfcc(signal)
            mfccs = self.normalize(mfccs)

            if args.printSilence:
                print("mfccs", mfccs)
            inputsMags = self.mfccs_to_inputs(mfccs)
            out = model(inputsMags)

            # Derive the reference transcript path by swapping 'wav' for 'txt' in the audio path.
            path = self.orig_path.split('wav')[0] + 'txt' + self.orig_path.split('wav')[1]
            with open(path) as fp:
                transcriptReal = fp.readlines()[0]
            print("Ref:", transcriptReal.lower())

            seq_length = out.size(1)
            # Output sequence length for the single utterance in the batch.
            sizes = torch.IntTensor([seq_length])

            if args.printSilence:
                print("out", out)
                print("softmax", F.softmax(out, dim=-1).data)

            decoded_output, _ = decoder.decode(F.softmax(out, dim=-1).data, sizes)
            transcript = decoded_output[0][0]
            print("Hyp:", transcript.lower())

            out = out.transpose(0, 1)
            if args.target:
                transcriptTarget = args.target
            else:
                with open(target_path) as fp:
                    transcriptTarget = fp.readlines()[0]
            print("Tar:", transcriptTarget.lower())

            if transcript.lower() == transcriptTarget.lower() and i > 0:
                if args.target:
                    target_path = args.target
                save_path = self.save(signal, target_path, self.orig_path, lr,
                                      iterations, i, bandwidth)
                generate_time = time.time() - start_attack_time
                print('Time taken (s): {generate_time:.4f}\t'.format(
                    generate_time=generate_time))
                self.save_figure(signal, save_path)
                break

            target = list(
                filter(None, [
                    self.labels_map.get(x)
                    for x in list(transcriptTarget.upper())
                ]))
            targets = torch.IntTensor(target)
            target_sizes = torch.IntTensor([len(target)])
            ctcloss = self.criterion(out, targets, sizes, target_sizes)
            # print("ctcloss:", ctcloss)
            # print("delta_2:", 100*torch.sum((signal - orig)**2))
            # loss = ctcloss + 100*torch.sum((signal - orig)**2)
            loss = ctcloss
            print("loss:", loss)

            loss.backward()

            grad = np.array(signal.grad)
            # Replace NaN gradient entries with a constant (10) so the update stays finite.
            is_nan = np.isnan(grad)
            grad[is_nan] = 10

            wer = decoder.wer(transcript.lower(),
                              transcriptTarget.lower()) / float(
                                  len(transcriptTarget.lower().split()))

            # Iterative proportional clipping: take a gradient step, then clamp the ratio of the
            # perturbed signal to the original waveform into [1 - bandwidth, 1 + bandwidth].

            # print('grad:{}'.format(grad[index_max:index_max+20]))
            perturbation = lr * torch.from_numpy(grad)
            # print('perturbation', perturbation[index_max])
            signal_next_relative = torch.clamp(
                (signal.data - perturbation) / orig,
                min=1 - bandwidth,
                max=1 + bandwidth)
            # print("signal_next_relative1:", signal_next_relative[index_max])
            signal.data = signal_next_relative.mul(orig)
            # print(signal_next_relative[index_max]*orig[index_max])
            # print("signal.data:", signal.data[index_max])

            # if (i + 1) % 15000 == 0 and flag < 1:
            #     # anneal lr
            #     # lr *= 0.5
            #     lr = lr / args.learning_anneal
            #     flag += 1
            # print("wer", wer)
            # print("lr", lr)
            print("\n")
            signal.grad.data.zero_()
        print("Come to the end")
Example #4
beam_args.add_argument('--cutoff-top-n', default=40, type=int,
                       help='Cutoff number for pruning: only the top cutoff_top_n characters with the highest '
                            'probabilities in the vocabulary are used in beam search (default 40).')
beam_args.add_argument('--cutoff-prob', default=1.0, type=float,
                       help='Cutoff probability for pruning (default 1.0, i.e. no pruning).')
beam_args.add_argument('--lm-workers', default=4, type=int, help='Number of LM processes to use')
parser.add_argument('--mixPrec', default=False, dest='mixPrec', action='store_true',
                    help='Use mixed precision for inference even if it was not available during training.')

parser.add_argument('--usePCEN', default=True, dest='usePcen', action='store_true',
                    help='Use PCEN (per-channel energy normalization) features')
args = parser.parse_args()

if __name__ == '__main__':
	torch.set_grad_enabled(False)
	device = torch.device("cuda" if args.cuda else "cpu")
	# device = torch.device("cpu")
	model = WaveToLetter.load_model(args.model_path, cuda=args.cuda)
	if args.fuse_layers:
		model.module.convertTensorType()
	model = model.to(device)
	model.eval()
	avgTime = []
	labels = WaveToLetter.get_labels(model)
	audio_conf = WaveToLetter.get_audio_conf(model)
	# model.module.fuse_model()
	# model.qconfig = torch.quantization.default_qconfig
	# torch.quantization.prepare(model, inplace=True)
	if args.decoder == "beam":
		from decoder import BeamCTCDecoder

		decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
								 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,