def __init__(self, trainroot, testroot, alpha=0.85):
    self.trainroot = trainroot
    self.testroot = testroot
    # ensure the random state is the same on every run
    random.seed(cfg.manualSeed)
    np.random.seed(cfg.manualSeed)
    torch.manual_seed(cfg.manualSeed)
    self.logger = self.setLogger()
    self.alpha = alpha
    self.train_loss = []
    self.test_loss = []
    self.test_loss_t = []
    self.test_loss_c = []
    self.acc = {'acc': [], 'acc_t': [], 'acc_c': []}
    self.device = torch.device(
        "cuda" if cfg.use_cuda and torch.cuda.is_available() else "cpu")
    self.model = self.net_init().to(self.device)
    if cfg.label_type == "both":
        self.codec = dataset.Codec(cfg.alphabet[0])
        self.codec_color = dataset.Codec(cfg.alphabet[1])
    else:
        self.codec = dataset.Codec(cfg.alphabet)
    self.loss_fn = nn.CTCLoss() if not cfg.dealwith_lossnan else nn.CTCLoss(zero_infinity=True)
    self.loss_fn = self.loss_fn.to(self.device)
    if cfg.adam:
        self.optim = optim.Adam(self.model.parameters(), lr=cfg.lr,
                                betas=(cfg.beta1, 0.999))
    elif cfg.adadelta:
        self.optim = optim.Adadelta(self.model.parameters())
    else:
        self.optim = optim.RMSprop(self.model.parameters(), lr=cfg.lr)
def __init__(self, eos, blank, enc_n_units, vocab,
             dropout=0., lsm_prob=0., fc_list=None,
             param_init=0.1, backward=False):
    super(CTC, self).__init__()
    self.eos = eos
    self.blank = blank
    self.vocab = vocab
    self.lsm_prob = lsm_prob
    self.bwd = backward
    self.space = -1  # TODO(hirofumi): fix later

    # for cache
    self.prev_spk = ''
    self.lmstate_final = None

    # for posterior plot
    self.prob_dict = {}
    self.data_dict = {}

    # Fully-connected layers before the softmax
    if fc_list is not None and len(fc_list) > 0:
        _fc_list = [int(fc) for fc in fc_list.split('_')]
        fc_layers = OrderedDict()
        for i in range(len(_fc_list)):
            input_dim = enc_n_units if i == 0 else _fc_list[i - 1]
            fc_layers['fc' + str(i)] = nn.Linear(input_dim, _fc_list[i])
            fc_layers['dropout' + str(i)] = nn.Dropout(p=dropout)
        fc_layers['fc' + str(len(_fc_list))] = nn.Linear(_fc_list[-1], vocab)
        self.output = nn.Sequential(fc_layers)
    else:
        self.output = nn.Linear(enc_n_units, vocab)

    self.use_warpctc = LooseVersion(torch.__version__) < LooseVersion("1.4.0")
    if self.use_warpctc:
        import warpctc_pytorch
        self.ctc_loss = warpctc_pytorch.CTCLoss(size_average=True)
    else:
        if LooseVersion(torch.__version__) < LooseVersion("1.7.0"):
            self.ctc_loss = nn.CTCLoss(reduction="sum")
        else:
            self.ctc_loss = nn.CTCLoss(reduction="sum", zero_infinity=True)

    self.forced_aligner = CTCForcedAligner()
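# Minimal standalone sketch (not part of the module above) of the shape convention
# nn.CTCLoss expects: log_probs of shape (T, N, C) after log_softmax, targets of shape
# (N, S), and per-sample input/target lengths. With reduction="sum", as in the module
# above, the caller typically normalizes the returned loss (e.g. by batch size).
import torch
import torch.nn as nn

ctc = nn.CTCLoss(reduction="sum", zero_infinity=True)
log_probs = torch.randn(50, 4, 30).log_softmax(-1)          # (T, N, C)
targets = torch.randint(1, 30, (4, 12), dtype=torch.long)   # labels exclude the blank index 0
input_lengths = torch.full((4,), 50, dtype=torch.long)
target_lengths = torch.full((4,), 12, dtype=torch.long)
loss = ctc(log_probs, targets, input_lengths, target_lengths) / 4  # normalize the summed loss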
def training_loop(model, kwargs, train_dataset, train_batch_loader, eval_dataset):
    device = 'cuda:0' if torch.cuda.is_available() and kwargs['cuda'] else 'cpu'
    model.to(device)
    greedy_decoder = GreedyDecoder(model.labels)
    criterion = nn.CTCLoss(blank=0, reduction='none')
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=kwargs['lr'], momentum=kwargs['momentum'],
                                nesterov=True, weight_decay=1e-5)
    scaling_factor = model.get_scaling_factor()
    epochs = kwargs['epochs']
    print('Train dataset size: %d' % len(train_dataset))
    batch_count = math.ceil(len(train_dataset) / kwargs['batch_size'])
    for epoch in range(epochs):
        with timing.EpochTimer(epoch, _log_to_tensorboard) as et:
            model.train()
            total_loss = 0
            for idx, data in et.across_epoch('Data Loading time',
                                             tqdm.tqdm(enumerate(train_batch_loader), total=batch_count)):
                inputs, input_lengths, targets, target_lengths, file_paths, texts = data
                with et.timed_action('Model execution time'):
                    out = model(torch.FloatTensor(inputs).to(device))
                    out = out.transpose(1, 0)
                output_lengths = [l // scaling_factor for l in input_lengths]
                with et.timed_action('Loss and BP time'):
                    loss = criterion(out, targets.to(device),
                                     torch.IntTensor(output_lengths),
                                     torch.IntTensor(target_lengths))
                    optimizer.zero_grad()
                    loss.mean().backward()
                    optimizer.step()
                total_loss += loss.mean().item()
            log_loss_to_tensorboard(epoch, total_loss / batch_count)
        evaluate(model, eval_dataset, greedy_decoder, epoch, kwargs)
        if epoch != 0 and epoch % kwargs['epochs_per_save'] == 0:
            save_epoch_model(model, epoch, kwargs['model_dir'])
    if kwargs['model_dir']:
        save_model(model, kwargs['model_dir'] + '/final.pth')
    print('Finished at %s' % time.asctime())
def build_output(self,
                 input: Tuple[int, int, int, int],
                 block: str) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str, Callable]]:
    """
    Builds an output layer.
    """
    pattern = re.compile(r'(O)(?P<name>{\w+})?(?P<dim>2|1|0)(?P<type>l|s|c)(?P<aug>a)?(?P<out>\d+)')
    m = pattern.match(block)
    if not m:
        return None, None, None
    dim = int(m.group('dim'))
    nl = m.group('type')
    outdim = int(m.group('out'))
    if dim == 0:
        raise ValueError('categorical output not supported, yet.')
    if nl == 'c' and dim == 2:
        raise ValueError('CTC not supported for heatmap output')
    if nl in ['l', 's'] and int(m.group('out')) >= 1:
        self.criterion = nn.BCELoss()
    elif nl == 'c':
        self.criterion = nn.CTCLoss(reduction='sum', zero_infinity=True)
    else:
        raise ValueError('unsupported output specification')
    # heatmap output
    if dim == 2:
        act = 's' if nl == 'l' else 'm'
        fn = layers.ActConv2D(input[1], outdim, (1, 1), (1, 1), act)
        logger.debug('{}\t\tconv\tkernel 1 x 1 filters {} stride 1 activation {}'.format(
            self.idx + 1, outdim, nl))
        return fn.get_shape(input), self.get_layer_name(m.group('type'), m.group('name')), fn
    else:
        aug = True if m.group('aug') else False
        lin = layers.LinSoftmax(input[1], int(m.group('out')), aug)
        logger.debug('{}\t\tlinear\taugmented {} out {}'.format(
            self.idx + 1, aug, m.group('out')))
        return lin.get_shape(input), self.get_layer_name(m.group(1), m.group('name')), lin
def __init__(
    self,
    pad_id=0,
    smoothing_coef=0.0,
    sample_wise=False,
    aux_ctc=False,
    ctc_initial_coef=0.1,
    ctc_blank_id=None,
    eps=1e-5,
):
    assert (not aux_ctc) or (ctc_blank_id is not None), \
        "Should be a blank id if using CTC loss"
    super().__init__()
    self.pad_id = pad_id
    self.smoothing_coef = smoothing_coef
    self.sample_wise = sample_wise
    self.aux_ctc = aux_ctc
    self.ctc_coef = ctc_initial_coef
    self.eps = eps
    if aux_ctc:
        self.ctc = nn.CTCLoss(blank=ctc_blank_id, reduction='none', zero_infinity=True)
        self.ctc = self.ctc.to(self._device)
def main(learning_rate=5e-4, batch_size=20, epochs=10,
         train_url="train-clean-100", test_url="test-clean",
         experiment=Experiment(api_key='dummy_key', disabled=True)):
    hparams = {
        "n_cnn_layers": 3,
        "n_rnn_layers": 5,
        "rnn_dim": 512,
        "n_class": 29,
        "n_feats": 128,
        "stride": 2,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    experiment.log_parameters(hparams)

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")

    if not os.path.isdir("./data"):
        os.makedirs("./data")

    train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=train_url, download=True)
    test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=test_url, download=True)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=hparams['batch_size'],
                                   shuffle=True,
                                   collate_fn=lambda x: data_processing(x, 'train'),
                                   **kwargs)
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=hparams['batch_size'],
                                  shuffle=False,
                                  collate_fn=lambda x: data_processing(x, 'valid'),
                                  **kwargs)

    model = SpeechRecognitionModel(
        hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
        hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
    ).to(device)

    print(model)
    print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = nn.CTCLoss(blank=28).to(device)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'],
                                              steps_per_epoch=int(len(train_loader)),
                                              epochs=hparams['epochs'],
                                              anneal_strategy='linear')

    iter_meter = IterMeter()
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, criterion, optimizer, scheduler, epoch, iter_meter, experiment)
        test(model, device, test_loader, criterion, epoch, iter_meter, experiment)
def __init__(self, upstream_dim, upstream_rate, downstream_expert, expdir, **kwargs):
    super(DownstreamExpert, self).__init__()
    self.expdir = expdir
    self.upstream_dim = upstream_dim
    self.corpus = downstream_expert["corpus"]

    # Text tokenizer
    self.tokenizer = load_text_encoder(**downstream_expert["text"])

    modelrc = downstream_expert["model"]
    self.projector = nn.Linear(upstream_dim, modelrc["project_dim"])

    model_select = downstream_expert["model"]["select"]
    self.model = eval(model_select)(
        modelrc["project_dim"],
        self.tokenizer.vocab_size,
        upstream_rate=upstream_rate,
        **modelrc.get(model_select, {}),
    )
    self.objective = nn.CTCLoss(
        blank=self.tokenizer.pad_idx,
        zero_infinity=modelrc["zero_infinity"],
    )
    self.save_best_on = downstream_expert.get("save_best_on", "dev")
    self.metrics = downstream_expert["metric"]
    self.metric_higher_better = downstream_expert["metric_higher_better"]
    self.register_buffer(
        "best_score",
        torch.ones(1) * (0 if self.metric_higher_better else 1 << 31))
def get_objective(objective):
    if isinstance(objective, str):
        objective = objective.lower()
        if objective in ['l1', 'l1loss']:
            return nn.L1Loss()
        elif objective in ['nll', 'nllloss']:
            return nn.NLLLoss()
        elif objective in ['nll2d', 'nllloss2d']:
            return nn.NLLLoss2d()
        elif objective in ['poissonnll', 'poissonnllloss']:
            return nn.PoissonNLLLoss()
        elif objective in ['kldiv', 'kldivloss']:
            return nn.KLDivLoss()
        elif objective in ['mse', 'mseloss']:
            return nn.MSELoss()
        elif objective in ['bce', 'bceloss']:
            return nn.BCELoss()
        elif objective in ['smoothl1', 'smoothl1loss']:
            return nn.SmoothL1Loss()
        elif objective in ['crossentropy', 'cross_entropy']:
            return nn.CrossEntropyLoss()
        elif objective in ['ctc', 'ctcloss']:
            return nn.CTCLoss()
        else:
            raise ValueError('unknown argument!')
    elif isinstance(objective, _Loss):
        return objective
    else:
        raise ValueError('unknown argument {}'.format(objective))
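# Example call into the factory above, hitting the 'ctc' branch. The returned criterion is a
# plain nn.CTCLoss with PyTorch defaults (blank index 0, mean reduction, zero_infinity=False).
criterion = get_objective('ctc')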
def __init__(self, file_path, epochs, batch_size=4):
    self.temppath = file_path.split(".")
    self.logpath = self.temppath[0] + "log.txt"
    print("Cuda : " + str(torch.cuda.is_available()))
    self.device = torch.device(
        'cuda:0' if torch.cuda.is_available() else 'cpu')  # set cpu or gpu
    self.file_path = file_path
    self.net = model.SpeechRecognition()
    self.criterion = nn.CTCLoss(blank=28).to(self.device)
    if torch.cuda.is_available():
        self.net.cuda()
    else:
        self.net.cpu()
    if file_path is not None and path.exists(file_path):
        self.load()
    self.net.to(self.device)

    # set training waveform data transforms
    self.train_audio_transforms = nn.Sequential(
        torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
        torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
        torchaudio.transforms.TimeMasking(time_mask_param=100))

    # set testing waveform data transform
    self.valid_audio_transforms = torchaudio.transforms.MelSpectrogram()

    # train
    self.train(epochs=epochs, batch_size=batch_size)
def __init__(self, blank=0, reduction='mean', use_baidu=False):
    super(CTCLoss, self).__init__()
    self.use_baidu_implement = use_baidu
    if self.use_baidu_implement:
        self.internal_loss = internalCTC(reduction=reduction, blank=blank)
    else:
        self.internal_loss = nn.CTCLoss(blank=blank, reduction=reduction)
def forward(self, images, targets=None):
    bs, _, _, _ = images.size()
    x = F.relu(self.conv1(images))
    x = self.maxpool1(x)
    x = F.relu(self.conv2(x))
    x = self.maxpool2(x)
    x = x.permute(0, 3, 1, 2)
    x = x.view(bs, x.size(1), -1)
    x = F.relu(self.linear1(x))
    x = self.dropout1(x)
    x, _ = self.gru1(x)     # 1, 75, 64
    x = self.output(x)      # 1, 75, 20
    x = x.permute(1, 0, 2)  # 75, 1, 20

    if targets is not None:
        log_softmax_values = F.log_softmax(x, 2)
        input_lengths = torch.full(
            size=(bs,),
            fill_value=log_softmax_values.size(0),
            dtype=torch.int32
        )
        target_lengths = torch.full(
            size=(bs,),
            fill_value=targets.size(1),
            dtype=torch.int32
        )
        loss = nn.CTCLoss(blank=0)(
            log_softmax_values, targets, input_lengths, target_lengths)
        return x, loss

    return x, None
def __init__(self, num_features, num_classes, base_config, blank_index=0):
    super(Wav2LetterPlus, self).__init__()

    ##### hyper parameters #####
    # read these first so the head counts are available when building the output layers
    ctc_loss_weight = base_config.get('ctc_loss_weight', 1)
    ct_loss_left_weight = base_config.get('ct_loss_left_weight', 0)
    ct_loss_right_weight = base_config.get('ct_loss_right_weight', 0)
    n_left_context_heads = len(ct_loss_left_weight)
    n_right_context_heads = len(ct_loss_right_weight)
    eval_ct_steps = base_config.get('eval_steps') * base_config.get('eval_ct_steps', 0)

    self.layers = nn.ModuleList([
        Wav2LetterPlusSubBlock(num_features, 256, 11, 2, 1, 0.2, 1),
        Wav2LetterPlusSubBlock(256, 256, 11, 1, 1, 0.2, 3),
        Wav2LetterPlusSubBlock(256, 384, 13, 1, 1, 0.2, 3),
        Wav2LetterPlusSubBlock(384, 512, 17, 1, 1, 0.2, 3),
        Wav2LetterPlusSubBlock(512, 640, 21, 1, 1, 0.3, 3),
        Wav2LetterPlusSubBlock(640, 768, 25, 1, 1, 0.3, 3),
        Wav2LetterPlusSubBlock(768, 896, 29, 1, 2, 0.4, 1),
        Wav2LetterPlusSubBlock(896, 1024, 1, 1, 1, 0.4, 1),
    ])
    # one main output head plus one head per left/right context position
    self.output_layers = nn.ModuleList([
        Wav2LetterPlusSubBlock(1024, num_classes, 1, 1, 1, activation=False, batch_norm=False)
        for _ in range(1 + n_left_context_heads + n_right_context_heads)
    ])
    self.n_left_context_heads = n_left_context_heads
    self.n_right_context_heads = n_right_context_heads
    self.ctc_criterion = nn.CTCLoss(blank=blank_index, reduction='mean', zero_infinity=True)
    self.ct_criterion = ct_loss.CTLoss(blank_index=blank_index, version='numpy')
def _get_ctc_loss(output, trg, device):
    ctc_batch_size = output.shape[1]
    ctc_input_length = output.shape[0]
    ctc_inputs = output.log_softmax(2)
    ctc_trgs = trg.permute((1, 0))
    ctc_input_lengths = torch.full(size=(ctc_batch_size,),
                                   fill_value=ctc_input_length,
                                   dtype=torch.long)
    # TODO figure this part out better..
    # ctc_trg_lengths = torch.full(size=(ctc_batch_size,), fill_value=ctc_input_length, dtype=torch.long)
    ctc_trg_list = []
    for i in range(ctc_batch_size):
        nz = (ctc_trgs[i, :] == 0).nonzero()
        if len(nz) == 0 or nz[0] > ctc_input_length / 2:
            ctc_trg_list.append(torch.tensor([int(ctc_input_length / 2)]).to(device))
            # ctc_trg_list.append(ctc_input_lengths[0:1].to(device))
        else:
            ctc_trg_list.append(nz[0].detach())
    try:
        ctc_trg_lengths = torch.cat(ctc_trg_list).to(device)
    except:
        import pdb
        pdb.set_trace()
    loss_ctc = nn.CTCLoss()(ctc_inputs, ctc_trgs, ctc_input_lengths, ctc_trg_lengths)
    loss = loss_ctc
    if torch.isinf(loss):
        import pdb
        pdb.set_trace()
    return loss
def __init__(
    self,
    num_classes: int,
    ignore_index: int,
    dim: int = -1,
    reduction='mean',
    ctc_weight: float = 0.3,
    cross_entropy_weight: float = 0.7,
    blank_id: int = None,
    smoothing: float = 0.1,
) -> None:
    super(JointCTCCrossEntropyLoss, self).__init__()
    self.num_classes = num_classes
    self.dim = dim
    self.ignore_index = ignore_index
    self.reduction = reduction.lower()
    self.ctc_weight = ctc_weight
    self.cross_entropy_weight = cross_entropy_weight
    self.ctc_loss = nn.CTCLoss(blank=blank_id, reduction=self.reduction, zero_infinity=True)
    if smoothing > 0.0:
        self.cross_entropy_loss = LabelSmoothedCrossEntropyLoss(
            num_classes=num_classes,
            ignore_index=ignore_index,
            smoothing=smoothing,
            reduction=reduction,
            dim=-1,
        )
    else:
        self.cross_entropy_loss = nn.CrossEntropyLoss(
            reduction=self.reduction, ignore_index=self.ignore_index)
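# A minimal sketch (an assumption, not the snippet's actual forward) of how a joint
# CTC / cross-entropy criterion like the one above is typically applied: compute both
# terms and return their weighted sum using self.ctc_weight and self.cross_entropy_weight.
# Argument names here are hypothetical.
def forward(self, encoder_log_probs, output_lengths, decoder_logits, targets, target_lengths):
    # encoder_log_probs: (N, T, C) log-softmax outputs; nn.CTCLoss expects (T, N, C)
    ctc = self.ctc_loss(encoder_log_probs.transpose(0, 1), targets,
                        output_lengths, target_lengths)
    # decoder_logits: (N, S, C) flattened against the (N, S) target sequence
    ce = self.cross_entropy_loss(decoder_logits.contiguous().view(-1, self.num_classes),
                                 targets.contiguous().view(-1))
    return self.ctc_weight * ctc + self.cross_entropy_weight * ce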
def __init__(self, net_class, net_cfg, train_cfg):
    super().__init__()
    self.net_cfg = net_cfg
    self.train_cfg = train_cfg
    self.loss_fn = nn.CTCLoss(blank=0, zero_infinity=False)
    self.model = net_class(**net_cfg)
def train(model, device):
    ctc_loss = nn.CTCLoss()
    model.train()
    optimizer = opt.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    global batch_size
    for e in range(num_of_epochs):
        loss_sum = 0
        for item, label, real_size, indices in train_loader:
            optimizer.zero_grad()
            item = item.to(device)
            probes = model(item, device)
            length = probes.size()[0]
            targets = indices.to(device)
            input_lengths = torch.full(size=(batch_size,), fill_value=length,
                                       dtype=torch.long).to(device)
            target_lengths = real_size.to(device)
            loss = ctc_loss(probes, targets, input_lengths, target_lengths)
            loss_sum += loss.item()
            loss.backward()
            optimizer.step()
        print("finish epoch #{} avg loss {} last loss {}".format(
            e, (loss_sum / len(train_loader)), loss))
def build_output(
        self, input: Tuple[int, int, int, int], block: str
) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str, Callable]]:
    """
    Builds an output layer.
    """
    pattern = re.compile(
        r'(O)(?P<name>{\w+})?(?P<dim>2|1|0)(?P<type>l|s|c)(?P<aug>a)?(?P<out>\d+)'
    )
    m = pattern.match(block)
    if not m:
        return None, None, None
    if int(m.group('dim')) != 1:
        raise ValueError('non-2d output not supported, yet')
    nl = m.group('type')
    if nl not in ['s', 'c']:
        raise ValueError('only softmax and ctc supported in output')
    if nl == 'c':
        self.criterion = nn.CTCLoss(reduction='none')
    aug = True if m.group('aug') else False
    lin = layers.LinSoftmax(input[1], int(m.group('out')), aug)
    logger.debug('{}\t\tlinear\taugmented {} out {}'.format(
        self.idx + 1, aug, m.group('out')))
    return lin.get_shape(input), self.get_layer_name(
        m.group(1), m.group('name')), lin
def __init__(self, config):
    super().__init__()
    self.save_hyperparameters('config')
    self.hparams = config
    self.transform = ImageTransform(
        augmentation=config.get('augmentation', True),
        scale_height=config['dataset']['scale_height'],
        min_width=config['dataset']['min_width'])
    self.config = config
    self.beam_width = config['beam_width']

    # define model
    self.cnn = initialize(config['cnn'])
    self.vocab = initialize(config['vocab'], add_blank=True)
    self.loss_fn = nn.CTCLoss(blank=self.vocab.BLANK_IDX)

    output_H = config['dataset']['scale_height'] // (
        32 // 2**config['cnn']['args']['droplast'])
    self.blstm = nn.LSTM(input_size=output_H * self.cnn.n_features,
                         hidden_size=config['hidden_size'],
                         num_layers=config['num_layers'],
                         batch_first=True,
                         dropout=config['dropout'],
                         bidirectional=True)
    self.character_distribution = nn.Linear(2 * config['hidden_size'],
                                            self.vocab.size)
    self.ctc_string_tf = CTCStringTransform(self.vocab)
    self.string_tf = StringTransform(self.vocab)
def forward(self, images, targets=None):
    bs, _, _, _ = images.size()
    x = F.relu(self.conv_1(images))
    x = self.pool_1(x)
    x = F.relu(self.conv_2(x))
    x = self.pool_2(x)
    x = x.permute(0, 3, 1, 2)
    x = x.view(bs, x.size(1), -1)
    x = F.relu(self.linear_1(x))
    x = self.drop_1(x)
    x, _ = self.lstm(x)
    x = self.output(x)
    x = x.permute(1, 0, 2)

    if targets is not None:
        log_probs = F.log_softmax(x, 2)
        input_lengths = torch.full(size=(bs,),
                                   fill_value=log_probs.size(0),
                                   dtype=torch.int32)
        target_lengths = torch.full(size=(bs,),
                                    fill_value=targets.size(1),
                                    dtype=torch.int32)
        loss = nn.CTCLoss(blank=0)(log_probs, targets, input_lengths, target_lengths)
        return x, loss

    return x, None
def __init__(self, input_len, target_len, blank=0):
    super(CTC, self).__init__()
    self.ctc = nn.CTCLoss(blank=blank, reduction='mean', zero_infinity=True)
    self.input_len = input_len
    self.target_len = target_len
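# Hypothetical forward for the wrapper above (an assumption; the original forward is not
# shown here): expand the fixed lengths to per-sample tensors and delegate to nn.CTCLoss.
def forward(self, log_probs, targets):
    # log_probs: (T, N, C) after log_softmax; targets: (N, S)
    batch_size = log_probs.size(1)
    input_lengths = torch.full((batch_size,), self.input_len, dtype=torch.long)
    target_lengths = torch.full((batch_size,), self.target_len, dtype=torch.long)
    return self.ctc(log_probs, targets, input_lengths, target_lengths)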
def __init__(
        self,
        num_classes: int,                    # number of classification classes
        ignore_index: int,                   # indexes ignored when calculating the loss
        dim: int = -1,                       # dimension along which the loss is calculated
        reduction='mean',                    # reduction method [sum, mean]
        ctc_weight: float = 0.3,             # weight of the ctc loss
        cross_entropy_weight: float = 0.7,   # weight of the cross entropy loss
        blank_id: int = None,
) -> None:
    super(JointCTCCrossEntropyLoss, self).__init__()
    self.num_classes = num_classes
    self.dim = dim
    self.ignore_index = ignore_index
    self.reduction = reduction.lower()
    self.ctc_weight = ctc_weight
    self.cross_entropy_weight = cross_entropy_weight
    if self.reduction == 'sum':
        self.reduction_method = torch.sum
    elif self.reduction == 'mean':
        self.reduction_method = torch.mean
    else:
        raise ValueError("Unsupported reduction method {0}".format(reduction))
    self.ctc_loss = nn.CTCLoss(blank=blank_id, reduction=self.reduction, zero_infinity=True)
    self.cross_entropy_loss = nn.CrossEntropyLoss(
        reduction=self.reduction, ignore_index=self.ignore_index)
def train_epoch_packed(model, optimizer, train_loader):
    model.train()
    criterion = nn.CTCLoss()
    criterion = criterion.to(DEVICE)
    batch_id = 0
    avg_loss = 0
    before = time.time()
    print("Training", len(train_loader), "number of batches")
    print("Current running lr is: ", optimizer.param_groups[0]['lr'])
    for inputs, input_lens, targets, target_lens in train_loader:  # lists, presorted, preloaded on GPU
        batch_id += 1
        inputs, input_lens, targets, target_lens = (
            inputs.to(DEVICE), input_lens.to(DEVICE), targets.to(DEVICE), target_lens.to(DEVICE))
        out, out_lens = model(inputs, input_lens)
        # print(target_lens)
        loss = criterion(out, targets, out_lens, target_lens)  # criterion of the concatenated output
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item()

        if batch_id % 100 == 0:
            after = time.time()
            # nwords = np.sum(np.array([len(l) for l in inputs]))
            # lpw = loss.item() / nwords
            print("Time elapsed: ", after - before)
            print("At batch", batch_id, 'Loss', avg_loss / 100)
            avg_loss = 0
            before = after

        del inputs
        del input_lens
        del targets
        del target_lens
        del out
        del out_lens

    return model
def main(args):
    alphabet = alphabet_factory()
    device = torch.device('cpu')
    checkpoint = torch.load('model_best.pth', map_location=device)
    in_features = args.n_mfcc * (2 * args.n_context + 1)
    model = build_deepspeech(in_features=in_features, num_classes=len(alphabet))
    model.load_state_dict(checkpoint['state_dict'])
    print_size_of_model(model)
    decoder = GreedyDecoder()
    if args.quantize:
        model = torch.quantization.quantize_dynamic(model, {nn.RNN, nn.Linear},
                                                    dtype=torch.qint8)
        logging.info('quantized model')
        print_size_of_model(model)
    transform = prepare_transformations(args)
    dataset = ProcessedDataset(get_dataset(args.datadir, "dev-clean"), transform, alphabet)
    collate_fn = collate_factory(model_length_function)
    criterion = nn.CTCLoss(blank=alphabet.mapping[alphabet.char_blank])
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=0,
                                             collate_fn=collate_fn,
                                             drop_last=False)
    test_loop_fn(dataloader, model, criterion, device, 1, decoder, alphabet)
def get_criterion(opt, vocab):
    if opt.architecture == 'deepspeech2':
        criterion = nn.CTCLoss(blank=vocab.blank_id, reduction=opt.reduction, zero_infinity=True)
    elif opt.architecture == 'las' and opt.joint_ctc_attention:
        criterion = JointCTCCrossEntropyLoss(
            num_classes=len(vocab),
            ignore_index=vocab.pad_id,
            reduction=opt.reduction,
            ctc_weight=opt.ctc_weight,
            cross_entropy_weight=opt.cross_entropy_weight,
            blank_id=vocab.blank_id,
            dim=-1,
        )
    elif opt.architecture == 'transformer':
        criterion = nn.CrossEntropyLoss(
            ignore_index=vocab.pad_id,
            reduction=opt.reduction
        )
    else:
        criterion = LabelSmoothedCrossEntropyLoss(
            num_classes=len(vocab),
            ignore_index=vocab.pad_id,
            smoothing=opt.label_smoothing,
            reduction=opt.reduction,
            architecture=opt.architecture,
            dim=-1
        )
    return criterion
def forward(self, images, labels, lengths):
    r"""
    Overridden.
    """
    images = images.to(self.device)
    labels = labels.to(self.device)

    T_length = 18
    input_lengths, target_lengths = BIM.sparse_tuple_for_ctc(T_length, lengths)
    ctc_loss = nn.CTCLoss(blank=len(CHARS) - 1, reduction='mean')  # reduction: 'none' | 'mean' | 'sum'

    for i in range(self.iters):
        images.requires_grad = True
        outputs = self.model(images)
        log_probs = outputs.permute(2, 0, 1)  # for ctc loss: T x N x C, log_probs.shape = (18, 100, 68)
        log_probs = log_probs.log_softmax(2).requires_grad_()
        cost = ctc_loss(log_probs, labels, input_lengths=input_lengths, target_lengths=target_lengths)

        grad = torch.autograd.grad(cost, images, retain_graph=False, create_graph=False)[0]

        adv_images = images + self.alpha * grad.sign()
        a = torch.clamp(images - self.eps, min=-1, max=1)
        b = (adv_images >= a).float() * adv_images + (a > adv_images).float() * a
        c = (b > images + self.eps).float() * (images + self.eps) + (images + self.eps >= b).float() * b
        images = torch.clamp(c, max=1).detach()

    adv_images = images
    return adv_images
def __init__(self, hparams: Namespace):
    super().__init__()
    self.hparams = hparams
    self.criterion = nn.CTCLoss()

    if Path(self.hparams.tokenizer_path).exists():
        self.tokenizer = WordLevelTokenizer(self.hparams.tokenizer_path)
    else:
        train_turns = preprocess(self.hparams.train_path, self.hparams.ontology_path)
        self.tokenizer = get_tokenizer(train_turns, self.hparams.tokenizer_path)

    # embedding
    self.embedding = nn.Embedding(self.tokenizer.get_vocab_size(),
                                  self.hparams.embedding_dim)
    self.pos_embedding = PositionalEncoding(
        d_model=self.hparams.embedding_dim, dropout=self.hparams.dropout)

    # value decoder
    self.value_decoder = nn.ModuleList([
        nn.MultiheadAttention(embed_dim=self.hparams.hidden_dim,
                              num_heads=self.hparams.num_heads,
                              dropout=self.hparams.dropout)
        for _ in range(3)
    ])
    self.vocab_proj = nn.Linear(self.hparams.hidden_dim,
                                self.tokenizer.get_vocab_size())
def __init__(
        self,
        num_classes: int,                    # number of classification classes
        ignore_index: int,                   # indexes ignored when calculating the loss
        dim: int = -1,                       # dimension along which the loss is calculated
        reduction='mean',                    # reduction method [sum, mean]
        ctc_weight: float = 0.3,             # weight of the ctc loss
        cross_entropy_weight: float = 0.7,   # weight of the cross entropy loss
        blank_id: int = 0,                   # identification of the blank token
        smoothing: float = 0.1,              # ratio of smoothing (confidence = 1.0 - smoothing)
) -> None:
    super(JointCTCCrossEntropyLoss, self).__init__()
    self.num_classes = num_classes
    self.dim = dim
    self.ignore_index = ignore_index
    self.reduction = reduction.lower()
    self.ctc_weight = ctc_weight
    self.cross_entropy_weight = cross_entropy_weight
    self.ctc_loss = nn.CTCLoss(blank=blank_id, reduction=self.reduction, zero_infinity=True)
    if smoothing > 0.0:
        self.cross_entropy_loss = LabelSmoothedCrossEntropyLoss(
            num_classes=num_classes,
            ignore_index=ignore_index,
            smoothing=smoothing,
            reduction=reduction,
            dim=-1,
        )
    else:
        self.cross_entropy_loss = nn.CrossEntropyLoss(
            reduction=self.reduction, ignore_index=self.ignore_index)
def get_criterion(config: DictConfig, vocab: Vocabulary) -> nn.Module:
    if config.model.architecture in ('deepspeech2', 'jasper'):
        criterion = nn.CTCLoss(blank=vocab.blank_id,
                               reduction=config.train.reduction,
                               zero_infinity=True)
    elif config.model.architecture in ('las', 'transformer') and config.model.joint_ctc_attention:
        criterion = JointCTCCrossEntropyLoss(
            num_classes=len(vocab),
            ignore_index=vocab.pad_id,
            reduction=config.train.reduction,
            ctc_weight=config.model.ctc_weight,
            cross_entropy_weight=config.model.cross_entropy_weight,
            blank_id=vocab.blank_id,
            dim=-1,
            smoothing=config.train.label_smoothing,
        )
    elif config.model.architecture in ('rnnt', 'conformer'):
        criterion = TransducerLoss(blank_id=vocab.blank_id)
    elif config.model.architecture == 'transformer' and config.train.label_smoothing <= 0.0:
        criterion = nn.CrossEntropyLoss(
            ignore_index=vocab.pad_id,
            reduction=config.train.reduction,
        )
    else:
        criterion = LabelSmoothedCrossEntropyLoss(
            num_classes=len(vocab),
            ignore_index=vocab.pad_id,
            smoothing=config.train.label_smoothing,
            reduction=config.train.reduction,
            dim=-1,
        )
    return criterion
def __init__(self, vocab_size, decoder_dim, hidden_size, dropout=0.5):
    super(MIEsitmator, self).__init__()
    self.proj = nn.Sequential(
        LinearNorm(decoder_dim, hidden_size, bias=True, w_init_gain='relu'),
        nn.ReLU(),
        nn.Dropout(p=dropout))
    self.ctc_proj = LinearNorm(hidden_size, vocab_size + 1, bias=True)
    self.ctc = nn.CTCLoss(blank=vocab_size, reduction='none')
def __init__(self, upstream_dim, upstream_rate, runner, downstream_expert, expdir, **kwargs):
    super(DownstreamExpert, self).__init__()
    self.expdir = expdir
    self.upstream_dim = upstream_dim
    self.corpus = downstream_expert['corpus']

    # Text tokenizer
    self.tokenizer = load_text_encoder(**downstream_expert['text'])

    modelrc = downstream_expert['model']
    self.projector = nn.Linear(upstream_dim, modelrc['project_dim'])

    model_select = downstream_expert['model']['select']
    self.model = eval(model_select)(
        modelrc['project_dim'],
        self.tokenizer.vocab_size,
        upstream_rate=upstream_rate,
        **modelrc.get(model_select, {}),
    )
    self.objective = nn.CTCLoss(
        blank=self.tokenizer.pad_idx,
        zero_infinity=modelrc['zero_infinity'],
    )
    self.eval_dataloaders = runner['eval_dataloaders']
    self.metrics = downstream_expert['metric']
    self.metric_higher_better = downstream_expert['metric_higher_better']
    self.register_buffer(
        'best_score',
        torch.ones(1) * (0 if self.metric_higher_better else 1 << 31))