def _log_prob_gradient(self, grad_output):
    """
    Parameters
    ----------
    grad_output

    Returns
    -------
    input_grad
    target_grad
    transition_grad
    duration_grad
    """
    losses = self.auxiliary_data

    # Compute the gradients for each example:
    def seq_grad(b):
        gtn.backward(losses[b])

    # Compute gradients in parallel over the batch:
    gtn.parallel_for(seq_grad, range(len(losses)))

    transition_grad = self._transition_fst.grad().weights_to_numpy()
    duration_grad = self._duration_fst.grad().weights_to_numpy()

    input_grad = None
    target_grad = None
    transition_grad = torch.from_numpy(transition_grad) * grad_output.cpu()
    duration_grad = torch.from_numpy(duration_grad) * grad_output.cpu()

    return input_grad, target_grad, transition_grad, duration_grad
def viterbi(self, outputs):
    B, T, C = outputs.shape
    assert C == self.N, "Wrong number of classes in output."
    predictions = [None] * B

    def process(b):
        # create emission graph
        g_emissions = gtn.linear_graph(T, C, False)
        cpu_data = outputs[b].cpu().contiguous()
        g_emissions.set_weights(cpu_data.data_ptr())

        # create transition graph
        g_transitions = utils.ASGLossFunction.create_transitions_graph(
            self.transitions)
        g_path = gtn.viterbi_path(gtn.intersect(g_emissions, g_transitions))
        prediction = g_path.labels_to_list()

        collapsed_prediction = [p for p, _ in groupby(prediction)]
        if self.garbage_idx is not None:
            # remove garbage tokens
            collapsed_prediction = [
                p for p in collapsed_prediction if p != self.garbage_idx
            ]
        predictions[b] = utils.unpack_replabels(collapsed_prediction,
                                                self.num_replabels)

    gtn.parallel_for(process, range(B))
    return [torch.IntTensor(p) for p in predictions]
def viterbi(self, outputs):
    B, T, C = outputs.shape

    if self.transitions is not None:
        cpu_data = self.transition_params.cpu().contiguous()
        self.transitions.set_weights(cpu_data.data_ptr())
        self.transitions.calc_grad = False
    self.tokens.arc_sort()

    paths = [None] * B

    def process(b):
        emissions = gtn.linear_graph(T, C, False)
        cpu_data = outputs[b].cpu().contiguous()
        emissions.set_weights(cpu_data.data_ptr())
        if self.transitions is not None:
            full_graph = gtn.intersect(emissions, self.transitions)
        else:
            full_graph = emissions

        # Find the best path and remove back-off arcs:
        path = gtn.remove(gtn.viterbi_path(full_graph))
        # Left compose the viterbi path with the "alignment to token"
        # transducer to get the outputs:
        path = gtn.compose(path, self.tokens)

        # When there are ambiguous paths (allow_repeats is true), we take
        # the shortest:
        path = gtn.viterbi_path(path)
        path = gtn.remove(gtn.project_output(path))
        paths[b] = path.labels_to_list()

    gtn.parallel_for(process, range(B))
    predictions = [torch.IntTensor(path) for path in paths]
    return predictions
def backward(ctx, grad_output):
    output_graphs, input_graphs, kernels = CTX_GRAPHS
    B, T, C = ctx.input_shape
    kernel_size = ctx.kernel_size
    stride = ctx.stride
    input_grad = torch.zeros((B, T, C))
    deltas = grad_output.cpu().numpy()

    def process(b):
        for t, window in enumerate(output_graphs[b]):
            for c, out in enumerate(window):
                delta = make_scalar_graph(deltas[b, t, c])
                gtn.backward(out, delta)
            grad = (input_graphs[b][t].grad().weights_to_numpy().reshape(
                kernel_size, -1))
            input_grad[b, t * stride:t * stride + kernel_size] += grad

    gtn.parallel_for(process, range(B))

    if ctx.needs_input_grad[4]:
        kernel_grads = [k.grad().weights_to_numpy() for k in kernels]
        kernel_grads = np.concatenate(kernel_grads)
        kernel_grads = torch.from_numpy(kernel_grads).to(grad_output.device)
    else:
        kernel_grads = None
    return (
        input_grad.to(grad_output.device),
        None,  # kernels
        None,  # kernel_size
        None,  # stride
        kernel_grads,
        None,  # viterbi
    )
def argmax(self, inputs):
    seq_fsts = self.seq_fst()
    arc_scores = self.scores_to_arc(inputs)

    device = arc_scores.device
    arc_scores = arc_scores.cpu()

    batch_size, num_samples, num_classes = arc_scores.shape

    best_paths = [None] * batch_size

    def pred_seq(batch_index):
        obs_fst = linearFstFromArray(arc_scores[batch_index].reshape(
            num_samples, -1))

        # Compose each sequence fst individually: it seems like composition
        # only works for lattices
        denom_fst = obs_fst
        for seq_fst in seq_fsts:
            denom_fst = gtn.compose(denom_fst, seq_fst)

        viterbi_path = gtn.viterbi_path(denom_fst)
        best_paths[batch_index] = gtn.remove(gtn.project_output(viterbi_path))

    gtn.parallel_for(pred_seq, range(batch_size))

    best_paths = torch.tensor(
        [self._getOutputString(p) for p in best_paths]).to(device)
    return best_paths
def backward(ctx, grad_output): """Backward computation. :param torch.tensor grad_output: backward passed gradient value :return: cumulative gradient output :rtype: (torch.Tensor, None, None, None) """ losses, scales, emissions_graphs, in_shape, ilens = ctx.auxiliary_data B, T, C = in_shape input_grad = torch.zeros((B, T, C)) def process(b): T = ilens[b] gtn.backward(losses[b], False) emissions = emissions_graphs[b] grad = emissions.grad().weights_to_numpy() input_grad[b][:T] = torch.from_numpy(grad).view(1, T, C) * scales[b] gtn.parallel_for(process, range(B)) if grad_output.is_cuda: input_grad = input_grad.cuda() input_grad *= grad_output / B return ( input_grad, None, # targets None, # ilens None, # blank_idx None, # reduction )
def forward(ctx, inputs, kernels, kernel_size, stride, kernel_params=None,
            viterbi=False):
    B, T, C = inputs.shape
    if T < kernel_size:
        # Padding should be done outside of this function:
        raise ValueError(f"Input ({T}) too short for kernel ({kernel_size})")
    cpu_inputs = inputs.cpu()
    output_graphs = [[] for _ in range(B)]
    input_graphs = [[] for _ in range(B)]
    if kernel_params is not None:
        cpu_data = kernel_params.cpu().contiguous()
        s = 0
        for kernel in kernels:
            na = kernel.num_arcs()
            data_ptr = cpu_data[s:s + na].data_ptr()
            s += na
            kernel.set_weights(data_ptr)
            kernel.calc_grad = kernel_params.requires_grad
            kernel.zero_grad()

    def process(b):
        for t in range(0, T - kernel_size + 1, stride):
            input_graph = gtn.linear_graph(kernel_size, C, inputs.requires_grad)
            window = cpu_inputs[b, t:t + kernel_size, :].contiguous()
            input_graph.set_weights(window.data_ptr())
            if viterbi:
                window_outputs = [
                    gtn.viterbi_score(gtn.intersect(input_graph, kernel))
                    for kernel in kernels
                ]
            else:
                window_outputs = [
                    gtn.forward_score(gtn.intersect(input_graph, kernel))
                    for kernel in kernels
                ]
            output_graphs[b].append(window_outputs)

            # Save for backward:
            if input_graph.calc_grad:
                input_graphs[b].append(input_graph)

    gtn.parallel_for(process, range(B))

    global CTX_GRAPHS
    CTX_GRAPHS = (output_graphs, input_graphs, kernels)
    ctx.input_shape = inputs.shape
    ctx.kernel_size = kernel_size
    ctx.stride = stride
    outputs = [[[o.item() for o in window] for window in example]
               for example in output_graphs]
    return torch.tensor(outputs).to(inputs.device)
def forward(ctx, inputs, transitions, targets, reduction="none"):
    B, T, C = inputs.shape
    losses = [None] * B
    scales = [None] * B
    emissions_graphs = [None] * B
    transitions_graphs = [None] * B

    calc_trans_grad = transitions.requires_grad
    transitions = transitions.cpu()  # avoid multiple cuda -> cpu copies

    def process(b):
        # create emission graph
        g_emissions = gtn.linear_graph(T, C, inputs.requires_grad)
        cpu_data = inputs[b].cpu().contiguous()
        g_emissions.set_weights(cpu_data.data_ptr())

        # create transition graph
        g_transitions = ASGLossFunction.create_transitions_graph(
            transitions, calc_trans_grad)

        # create force align criterion graph
        g_fal = ASGLossFunction.create_force_align_graph(targets[b])

        # compose the graphs
        g_fal_fwd = gtn.forward_score(
            gtn.intersect(gtn.intersect(g_fal, g_transitions), g_emissions))
        g_fcc_fwd = gtn.forward_score(gtn.intersect(g_emissions, g_transitions))
        g_loss = gtn.subtract(g_fcc_fwd, g_fal_fwd)

        scale = 1.0
        if reduction == "mean":
            L = len(targets[b])
            scale = 1.0 / L if L > 0 else scale
        elif reduction != "none":
            raise ValueError("invalid value for reduction '" +
                             str(reduction) + "'")

        # Save for backward:
        losses[b] = g_loss
        scales[b] = scale
        emissions_graphs[b] = g_emissions
        transitions_graphs[b] = g_transitions

    gtn.parallel_for(process, range(B))

    ctx.auxiliary_data = (
        losses,
        scales,
        emissions_graphs,
        transitions_graphs,
        inputs.shape,
    )
    loss = torch.tensor([losses[b].item() * scales[b] for b in range(B)])
    return torch.mean(loss.cuda() if inputs.is_cuda else loss)
def backward(ctx, grad_output):
    losses, emissions_graphs, in_shape = ctx.auxiliary_data
    B, T, C = in_shape
    input_grad = torch.empty((B, T, C))

    # Compute the gradients for each example:
    def backward_single(b):
        gtn.backward(losses[b])
        emissions = emissions_graphs[b]
        grad = emissions.grad().weights_to_numpy()
        input_grad[b] = torch.from_numpy(grad).view(1, T, C)

    # Compute gradients in parallel over the batch:
    gtn.parallel_for(backward_single, range(B))

    return input_grad.to(grad_output.device), None
def forward(ctx, log_probs, targets, ilens, blank_idx=0, reduction="none"):
    """Forward computation.

    :param torch.tensor log_probs: batched log softmax probabilities (B, Tmax, oDim)
    :param list targets: batched target sequences, list of lists
    :param list ilens: batched input lengths
    :param int blank_idx: index of blank token
    :return: ctc loss value
    :rtype: torch.Tensor
    """
    B, _, C = log_probs.shape
    losses = [None] * B
    scales = [None] * B
    emissions_graphs = [None] * B

    def process(b):
        # create emission graph
        T = ilens[b]
        g_emissions = gtn.linear_graph(T, C, log_probs.requires_grad)
        cpu_data = log_probs[b][:T].cpu().contiguous()
        g_emissions.set_weights(cpu_data.data_ptr())

        # create criterion graph
        g_criterion = GTNCTCLossFunction.create_ctc_graph(targets[b], blank_idx)
        # compose the graphs
        g_loss = gtn.negate(
            gtn.forward_score(gtn.intersect(g_emissions, g_criterion)))

        scale = 1.0
        if reduction == "mean":
            L = len(targets[b])
            scale = 1.0 / L if L > 0 else scale
        elif reduction != "none":
            raise ValueError("invalid value for reduction '" +
                             str(reduction) + "'")

        # Save for backward:
        losses[b] = g_loss
        scales[b] = scale
        emissions_graphs[b] = g_emissions

    gtn.parallel_for(process, range(B))

    ctx.auxiliary_data = (losses, scales, emissions_graphs, log_probs.shape,
                          ilens)
    loss = torch.tensor([losses[b].item() * scales[b] for b in range(B)])
    return torch.mean(loss.cuda() if log_probs.is_cuda else loss)
def backward(ctx, grad_output):
    (
        losses,
        scales,
        emissions_graphs,
        transitions_graphs,
        in_shape,
    ) = ctx.auxiliary_data
    B, T, C = in_shape
    input_grad = transitions_grad = None
    if ctx.needs_input_grad[0]:
        input_grad = torch.empty((B, T, C))
    if ctx.needs_input_grad[1]:
        transitions_grad = torch.empty((B, C + 1, C))

    def process(b):
        gtn.backward(losses[b], False)
        emissions = emissions_graphs[b]
        transitions = transitions_graphs[b]
        if input_grad is not None:
            grad = emissions.grad().weights_to_numpy()
            input_grad[b] = torch.from_numpy(grad).view(1, T, C) * scales[b]
        if transitions_grad is not None:
            grad = transitions.grad().weights_to_numpy()
            transitions_grad[b] = (
                torch.from_numpy(grad).view(1, C + 1, C) * scales[b])

    gtn.parallel_for(process, range(B))

    if input_grad is not None:
        if grad_output.is_cuda:
            input_grad = input_grad.cuda()
        input_grad *= grad_output / B
    if transitions_grad is not None:
        if grad_output.is_cuda:
            transitions_grad = transitions_grad.cuda()
        transitions_grad = torch.mean(transitions_grad, 0) * grad_output

    return (
        input_grad,
        transitions_grad,
        None,  # target
        None,  # reduction
    )
def _log_prob(self, inputs, targets, transition_params, duration_params):
    seq_fsts = self.seq_fst(transition_params=transition_params,
                            duration_params=duration_params)

    device = inputs.device
    arc_scores = self.scores_to_arc(inputs)
    arc_labels = self.labels_to_arc(targets)

    arc_scores = arc_scores.cpu()
    arc_labels = arc_labels.cpu()

    batch_size, num_samples, num_classes = arc_scores.shape

    losses = [None] * batch_size
    obs_fsts = [None] * batch_size

    def seq_loss(batch_index):
        obs_fst = linearFstFromArray(arc_scores[batch_index].reshape(
            num_samples, -1))
        gt_fst = fromSequence(arc_labels[batch_index])

        # Compose each sequence fst individually: it seems like composition
        # only works for lattices
        denom_fst = obs_fst
        for seq_fst in seq_fsts:
            denom_fst = gtn.compose(denom_fst, seq_fst)
        denom_fst = gtn.project_output(denom_fst)

        num_fst = gtn.compose(denom_fst, gt_fst)

        loss = gtn.subtract(gtn.forward_score(num_fst),
                            gtn.forward_score(denom_fst))

        losses[batch_index] = loss
        obs_fsts[batch_index] = obs_fst

    gtn.parallel_for(seq_loss, range(batch_size))

    self.auxiliary_data = losses

    losses = torch.tensor([lp.item() for lp in losses]).to(device)
    return losses
def test_parallel_func(self):
    B = 3
    inputs1 = [gtn.scalar_graph(k) for k in [1.0, 2.0, 3.0]]
    inputs2 = [gtn.scalar_graph(k) for k in [1.0, 2.0, 3.0]]
    out = [None] * B

    def process(b):
        out[b] = gtn.add(gtn.add(inputs1[b], inputs1[b]),
                         gtn.negate(inputs2[b]))

    gtn.parallel_for(process, range(B))

    expected = []
    for b in range(B):
        expected.append(
            gtn.add(gtn.add(inputs1[b], inputs1[b]), gtn.negate(inputs2[b])))

    self.assertEqual(len(out), len(expected))
    for i in range(len(expected)):
        self.assertTrue(gtn.equal(out[i], expected[i]))
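# For comparison with the test above: the same computation written as a plain
# sequential list comprehension. This is only an illustrative sketch; it reuses
# the gtn calls already shown in the test (scalar_graph, add, negate), with
# gtn.parallel_for replaced by ordinary iteration over the batch.
import gtn

inputs1 = [gtn.scalar_graph(k) for k in [1.0, 2.0, 3.0]]
inputs2 = [gtn.scalar_graph(k) for k in [1.0, 2.0, 3.0]]
sequential_out = [
    gtn.add(gtn.add(g1, g1), gtn.negate(g2))
    for g1, g2 in zip(inputs1, inputs2)
]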
def forward(ctx, inputs, targets):
    B, T, C = inputs.shape
    losses = [None] * B
    emissions_graphs = [None] * B

    # Move data to the host:
    device = inputs.device
    inputs = inputs.cpu()
    targets = targets.cpu()

    # Compute the loss for the b-th example:
    def forward_single(b):
        emissions = gtn.linear_graph(T, C, inputs.requires_grad)
        data = inputs[b].contiguous()
        emissions.set_weights(data.data_ptr())

        target = GTNLossFunction.make_target_graph(targets[b])

        # Score the target:
        target_score = gtn.forward_score(gtn.intersect(target, emissions))

        # Normalization term:
        norm = gtn.forward_score(emissions)

        # Compute the loss:
        loss = gtn.subtract(norm, target_score)

        # Save state for backward:
        losses[b] = loss
        emissions_graphs[b] = emissions

    # Compute the loss in parallel over the batch:
    gtn.parallel_for(forward_single, range(B))

    ctx.auxiliary_data = (losses, emissions_graphs, inputs.shape)

    # Put losses back in a torch tensor and move them back to the device:
    return torch.tensor([l.item() for l in losses]).to(device)
def backward(ctx, grad_output) -> Tuple:
    losses, emissions_graphs, transitions = ctx.graphs
    scales = ctx.scales
    B, T, C = ctx.input_shape
    calc_emissions = ctx.needs_input_grad[0]
    input_grad = torch.empty((B, T, C)) if calc_emissions else None

    def process(b: int) -> None:
        scale = make_scalar_graph(scales[b])
        gtn.backward(losses[b], scale)
        emissions = emissions_graphs[b]
        if calc_emissions:
            grad = emissions.grad().weights_to_numpy()
            input_grad[b] = torch.tensor(grad).view(1, T, C)

    gtn.parallel_for(process, range(B))

    if calc_emissions:
        input_grad = input_grad.to(grad_output.device)
        input_grad *= grad_output / B

    if ctx.needs_input_grad[4]:
        grad = transitions.grad().weights_to_numpy()
        transition_grad = torch.tensor(grad).to(grad_output.device)
        transition_grad *= grad_output / B
    else:
        transition_grad = None

    return (
        input_grad,
        None,  # target
        None,  # tokens
        None,  # lexicon
        transition_grad,  # transition params
        None,  # transitions graph
        None,
    )
def forward(ctx, log_probs, targets, blank_idx=0, reduction="none"):
    B, T, C = log_probs.shape
    losses = [None] * B
    scales = [None] * B
    emissions_graphs = [None] * B

    def process(b):
        # create emission graph
        g_emissions = gtn.linear_graph(T, C, log_probs.requires_grad)
        cpu_data = log_probs[b].cpu().contiguous()
        g_emissions.set_weights(cpu_data.data_ptr())

        # create criterion graph
        g_criterion = CTCLossFunction.create_ctc_graph(targets[b], blank_idx)
        # compose the graphs
        g_loss = gtn.negate(
            gtn.forward_score(gtn.intersect(g_emissions, g_criterion)))

        scale = 1.0
        if reduction == "mean":
            L = len(targets[b])
            scale = 1.0 / L if L > 0 else scale
        elif reduction != "none":
            raise ValueError("invalid value for reduction '" +
                             str(reduction) + "'")

        # Save for backward:
        losses[b] = g_loss
        scales[b] = scale
        emissions_graphs[b] = g_emissions

    gtn.parallel_for(process, range(B))

    ctx.auxiliary_data = (losses, scales, emissions_graphs, log_probs.shape)
    loss = torch.tensor([losses[b].item() * scales[b] for b in range(B)])
    return torch.mean(loss.cuda() if log_probs.is_cuda else loss)
def backward(ctx, grad_output):
    losses, scales, emissions_graphs, in_shape = ctx.auxiliary_data
    B, T, C = in_shape
    input_grad = torch.empty((B, T, C))

    def process(b):
        gtn.backward(losses[b], False)
        emissions = emissions_graphs[b]
        grad = emissions.grad().weights_to_numpy()
        input_grad[b] = torch.from_numpy(grad).view(1, T, C) * scales[b]

    gtn.parallel_for(process, range(B))

    if grad_output.is_cuda:
        input_grad = input_grad.cuda()
    input_grad *= grad_output / B

    return (
        input_grad,
        None,  # targets
        None,  # blank_idx
        None,  # reduction
    )
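# Illustrative usage sketch (an assumption, not taken from the source): if the
# CTC forward/backward pair above are staticmethods of a torch.autograd.Function
# subclass named CTCLossFunction (as the call to CTCLossFunction.create_ctc_graph
# suggests), the loss plugs into autograd like any other op. The shapes and
# target sequences below are made up for illustration only.
import torch

log_probs = torch.randn(2, 10, 5).log_softmax(dim=-1).requires_grad_()
targets = [[1, 2, 3], [1, 1]]
loss = CTCLossFunction.apply(log_probs, targets, 0, "mean")
loss.backward()  # dispatches to the backward() defined above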
def forward(
    ctx,
    inputs,
    targets,
    tokens,
    lexicon,
    transition_params=None,
    transitions=None,
    reduction="none",
):
    B, T, C = inputs.shape
    losses = [None] * B
    emissions_graphs = [None] * B
    if transitions is not None:
        if transition_params is None:
            raise ValueError("Specified transitions, but not transition params.")
        cpu_data = transition_params.cpu().contiguous()
        transitions.set_weights(cpu_data.data_ptr())
        transitions.calc_grad = transition_params.requires_grad
        transitions.zero_grad()

    def process(b):
        # Create emissions graph:
        emissions = gtn.linear_graph(T, C, inputs.requires_grad)
        cpu_data = inputs[b].cpu().contiguous()
        emissions.set_weights(cpu_data.data_ptr())
        target = make_chain_graph(targets[b])
        target.arc_sort(True)

        # Create token to grapheme decomposition graph:
        tokens_target = gtn.remove(gtn.project_output(gtn.compose(target, lexicon)))
        tokens_target.arc_sort()

        # Create alignment graph:
        alignments = gtn.project_input(
            gtn.remove(gtn.compose(tokens, tokens_target))
        )
        alignments.arc_sort()

        # Add transition scores:
        if transitions is not None:
            alignments = gtn.intersect(transitions, alignments)
            alignments.arc_sort()

        loss = gtn.forward_score(gtn.intersect(emissions, alignments))

        # Normalize if needed:
        if transitions is not None:
            norm = gtn.forward_score(gtn.intersect(emissions, transitions))
            loss = gtn.subtract(loss, norm)
        losses[b] = gtn.negate(loss)

        # Save for backward:
        if emissions.calc_grad:
            emissions_graphs[b] = emissions

    gtn.parallel_for(process, range(B))

    ctx.graphs = (losses, emissions_graphs, transitions)
    ctx.input_shape = inputs.shape

    # Optionally reduce by target length:
    if reduction == "mean":
        scales = [(1 / len(t) if len(t) > 0 else 1.0) for t in targets]
    else:
        scales = [1.0] * B
    ctx.scales = scales

    loss = torch.tensor([l.item() * s for l, s in zip(losses, scales)])
    return torch.mean(loss.to(inputs.device))
def indexed_func():
    # Runs `process` over the batch indices; assumes `process` and `B` are
    # defined in the enclosing scope.
    gtn.parallel_for(process, range(B))