def decode_results(model, decoded_output, decoded_offsets):
    results = {
        "output": [],
        "_meta": {
            "acoustic_model": {
                "name": os.path.basename(args.model_path)
            },
            "language_model": {
                "name": os.path.basename(args.lm_path) if args.lm_path else None,
            },
            "decoder": {
                "lm": args.lm_path is not None,
                "alpha": args.alpha if args.lm_path is not None else None,
                "beta": args.beta if args.lm_path is not None else None,
                "type": args.decoder,
            }
        }
    }
    results['_meta']['acoustic_model'].update(WaveToLetter.get_meta(model))

    for b in range(len(decoded_output)):
        for pi in range(min(args.top_paths, len(decoded_output[b]))):
            result = {'transcription': decoded_output[b][pi]}
            if args.offsets:
                result['offsets'] = decoded_offsets[b][pi]
            results['output'].append(result)
    return results
tensorboard_writer = SummaryWriter(args.log_dir)
try:
    os.makedirs(save_folder)
except OSError as e:
    if e.errno == errno.EEXIST:
        print('Model save directory already exists.')
    else:
        raise

criterion = CTCLoss()
avg_loss, start_epoch, start_iter = 0, 0, 0

if args.continue_from:  # Starting from previous model
    print("Loading checkpoint model %s" % args.continue_from)
    package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
    model = WaveToLetter.load_model_package(package)
    audio_conf = WaveToLetter.get_audio_conf(model)
    labels = WaveToLetter.get_labels(model)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                momentum=args.momentum, nesterov=True)
    if args.noise_dir is not None:
        model = WaveToLetter.setAudioConfKey(model, 'noise_dir', args.noise_dir)
        model = WaveToLetter.setAudioConfKey(model, 'noise_prob', args.noise_prob)
        model = WaveToLetter.setAudioConfKey(model, 'noise_max', args.noise_max)
        model = WaveToLetter.setAudioConfKey(model, 'noise_min', args.noise_min)
    if not args.finetune:  # Don't want to restart training
        # if args.cuda:
        #     model.cuda()
def attack(self, iterations, target_path, lr, bandwidth):
    flag = 0
    model = WaveToLetter.load_model(args.model_path)
    model = model.to(device)
    model.eval()  # eval mode; this is different from stage 1
    signal = self.get_signal(self.audio_path)
    orig = self.get_signal(self.orig_path)
    index_max, index_min = self.attention(signal)
    start_attack_time = time.time()

    for i in range(iterations):
        print('Iteration:', str(i))
        mfcc = pytorch_mfcc.MFCC(samplerate=self.sample_rate, winlen=self.window_size,
                                 winstep=self.window_stride, numcep=13, nfilt=26, nfft=512,
                                 lowfreq=0, highfreq=None, preemph=0, ceplifter=22,
                                 appendEnergy=False).cuda()
        mfccs = mfcc(signal)
        mfccs = self.normalize(mfccs)
        if args.printSilence:
            print("mfccs", mfccs)
        inputsMags = self.mfccs_to_inputs(mfccs)
        out = model(inputsMags)

        # The reference transcript sits next to the wav file, with 'txt' substituted for 'wav'.
        path = self.orig_path.split('wav')[0] + 'txt' + self.orig_path.split('wav')[1]
        fp = open(path)
        transcriptReal = fp.readlines()[0]
        print("Ref:", transcriptReal.lower())

        seq_length = out.size(1)
        sizes = Variable(torch.Tensor([1.0]).mul_(int(seq_length)).int(), requires_grad=False)
        if args.printSilence:
            print("out", out)
            print("softmax", F.softmax(out, dim=-1).data)
        decoded_output, _ = decoder.decode(F.softmax(out, dim=-1).data, sizes)
        transcript = decoded_output[0][0]
        print("Hyp:", transcript.lower())
        out = out.transpose(0, 1)

        if args.target:
            transcriptTarget = args.target
        else:
            fp = open(target_path)
            transcriptTarget = fp.readlines()[0]
        print("Tar:", transcriptTarget.lower())

        # Stop once the decoded hypothesis matches the target transcription.
        if transcript.lower() == transcriptTarget.lower() and i > 0:
            if args.target:
                target_path = args.target
            save_path = self.save(signal, target_path, self.orig_path, lr, iterations, i, bandwidth)
            generate_time = time.time() - start_attack_time
            print('Time taken (s): {generate_time:.4f}\t'.format(generate_time=generate_time))
            self.save_figure(signal, save_path)
            break

        # CTC loss against the target transcription drives the perturbation.
        target = list(filter(None, [self.labels_map.get(x) for x in list(transcriptTarget.upper())]))
        targets = torch.IntTensor(target)
        target_sizes = torch.IntTensor([len(target)])
        ctcloss = self.criterion(out, targets, sizes, target_sizes)
        # loss = ctcloss + 100 * torch.sum((signal - orig) ** 2)
        loss = ctcloss
        print("loss:", loss)
        loss.backward()

        # Replace NaN gradient entries with a large constant so the update stays finite.
        grad = np.array(signal.grad)
        is_nan = np.isnan(grad)
        for j in range(len(grad)):
            if is_nan[j]:
                grad[j] = 10

        wer = decoder.wer(transcript.lower(), transcriptTarget.lower()) / float(
            len(transcriptTarget.lower().split()))

        # The iterative proportional clipping method: keep each perturbed sample within a
        # relative band of [1 - bandwidth, 1 + bandwidth] around the original sample.
        perturbation = lr * torch.from_numpy(grad)
        signal_next_relative = torch.clamp((signal.data - perturbation) / orig,
                                           min=1 - bandwidth, max=1 + bandwidth)
        signal.data = signal_next_relative.mul(orig)

        # if (i + 1) % 15000 == 0 and flag < 1:
        #     # anneal lr
        #     lr = lr / args.learning_anneal
        #     flag += 1
        print("\n")
        signal.grad.data.zero_()

    print("Come to the end")
beam_args.add_argument('--cutoff-top-n', default=40, type=int,
                       help='Cutoff number in pruning: only the top cutoff_top_n characters with the highest '
                            'probabilities in the vocabulary are used in beam search. Default 40.')
beam_args.add_argument('--cutoff-prob', default=1.0, type=float,
                       help='Cutoff probability in pruning. Default 1.0, i.e. no pruning.')
beam_args.add_argument('--lm-workers', default=4, type=int,
                       help='Number of LM processes to use')
parser.add_argument('--mixPrec', default=False, dest='mixPrec', action='store_true',
                    help='Use mixed precision for inference even if it was not available during training.')
parser.add_argument('--usePCEN', default=True, dest='usePcen', action='store_true',
                    help='Use PCEN features for inference.')
args = parser.parse_args()

if __name__ == '__main__':
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    # device = torch.device("cpu")
    model = WaveToLetter.load_model(args.model_path, cuda=args.cuda)
    if args.fuse_layers:
        model.module.convertTensorType()
    model = model.to(device)
    model.eval()
    avgTime = []
    labels = WaveToLetter.get_labels(model)
    audio_conf = WaveToLetter.get_audio_conf(model)
    # model.module.fuse_model()
    # model.qconfig = torch.quantization.default_qconfig
    # torch.quantization.prepare(model, inplace=True)
    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,