def __call__(self, xs, labels):  # xs: (B, T, F, 2048), labels: (B, T, F, 12)
    xs = F.transpose(xs, (0, 2, 1, 3))  # B, F, T, 2048
    orig_labels = labels
    labels = F.transpose(labels, (0, 2, 1, 3))  # B, F, T, 12
    mini_batch, frame_node, T, _ = xs.shape
    xs = xs.reshape(xs.shape[0] * xs.shape[1], xs.shape[2], xs.shape[3])
    labels = labels.reshape(labels.shape[0] * labels.shape[1], labels.shape[2], labels.shape[3])
    xs = list(F.separate(xs, axis=0))  # list of (T, 2048)
    labels = list(F.separate(labels, axis=0))  # list of (T, 12)
    output = F.stack(self.label_dep_rnn(xs, labels))  # B * F, T, 12
    output = output.reshape(mini_batch, frame_node, T, -1)
    output = F.transpose(output, (0, 2, 1, 3))  # B, T, F, D
    output = output.reshape(-1, self.class_num)  # B * T * F, 12
    orig_labels = orig_labels.reshape(-1, self.class_num)
    assert output.shape == orig_labels.shape
    pick_index, accuracy_pick_index = self.get_loss_index(output, orig_labels)
    loss = F.sigmoid_cross_entropy(
        output[list(pick_index[0]), list(pick_index[1])],
        orig_labels[list(pick_index[0]), list(pick_index[1])])
    accuracy = F.binary_accuracy(
        output[list(accuracy_pick_index[0]), list(accuracy_pick_index[1])],
        orig_labels[list(accuracy_pick_index[0]), list(accuracy_pick_index[1])])
    return loss, accuracy
def calc_accuracy(self, predictions, labels):
    batch_predictions = predictions
    # concat all individual predictions and slice for each time step
    batch_predictions = F.concat(
        [F.expand_dims(p, axis=2) for p in batch_predictions], axis=2)
    t = F.reshape(labels, (1, self.args.timesteps, -1))

    accuracies = []
    with cuda.get_device_from_array(batch_predictions.data):
        for prediction, label in zip(F.separate(batch_predictions, axis=0),
                                     F.separate(t, axis=2)):
            classification = F.softmax(prediction, axis=2)
            classification = classification.data
            classification = self.xp.argmax(classification, axis=2)
            # classification = self.xp.transpose(classification, (1, 0))

            words = self.strip_prediction(classification)
            labels = self.strip_prediction(label.data)

            for word, label in zip(words, labels):
                word = "".join(map(self.label_to_char, word))
                label = "".join(map(self.label_to_char, label))
                if word == label:
                    self.num_correct_words += 1
                self.num_words += 1

    return word, label
def seq_rnn_embed(vxs, exs, rnn_layer, initial_state=None, reverse=False):
    """Embed given sequences using rnn."""
    # vxs.shape == (..., S)
    # exs.shape == (..., S, E)
    # initial_state == (..., E)
    assert vxs.shape == exs.shape[:-1], "Sequence embedding dimensions do not match."
    lengths = np.sum(vxs != 0, -1).flatten()  # (X,)
    seqs = F.reshape(exs, (-1,) + exs.shape[-2:])  # (X, S, E)
    if reverse:
        toembed = [F.flip(s[..., :l, :], -2)
                   for s, l in zip(F.separate(seqs, 0), lengths)
                   if l != 0]  # Y x [(S1, E), (S2, E), ...]
    else:
        toembed = [s[..., :l, :]
                   for s, l in zip(F.separate(seqs, 0), lengths)
                   if l != 0]  # Y x [(S1, E), (S2, E), ...]
    if initial_state is not None:
        initial_state = F.reshape(initial_state, (-1, EMBED))  # (X, E)
        initial_state = initial_state[None, np.flatnonzero(lengths)]  # (1, Y, E)
    hs, ys = rnn_layer(initial_state, toembed)  # (1, Y, E), Y x [(S1, 2*E), (S2, 2*E), ...]
    hs = hs[0]  # (Y, E)
    if hs.shape[0] == lengths.size:
        hs = F.reshape(hs, vxs.shape[:-1] + (EMBED,))  # (..., E)
        return hs
    # Add zero values back to match original shape
    embeds = np.zeros((lengths.size, EMBED), dtype=np.float32)  # (X, E)
    idxs = np.nonzero(lengths)  # (Y,)
    embeds = F.scatter_add(embeds, idxs, hs)  # (X, E)
    embeds = F.reshape(embeds, vxs.shape[:-1] + (EMBED,))  # (..., E)
    return embeds  # (..., E)
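# Usage sketch for seq_rnn_embed above -- not from the original repo, just a
# minimal toy setup: EMBED is redefined locally and chainer.links.NStepGRU
# stands in for rnn_layer. It shows the core F.separate pattern: drop
# zero-length rows, then feed one variable-length sequence per row to the RNN.
import numpy as np
import chainer.functions as F
import chainer.links as L

EMBED = 4
vxs = np.array([[1, 2, 0], [0, 0, 0], [3, 0, 0]], dtype=np.int32)  # (X, S)
exs = np.random.rand(3, 3, EMBED).astype(np.float32)               # (X, S, E)
lengths = np.sum(vxs != 0, -1)                                     # [2, 0, 1]
toembed = [s[:l] for s, l in zip(F.separate(exs, 0), lengths) if l != 0]
rnn = L.NStepGRU(1, EMBED, EMBED, 0.0)
hs, _ = rnn(None, toembed)  # hs: (1, Y, E) with Y == 2 non-empty rows
assert hs.shape == (1, 2, EMBED)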
def draw_bboxes(self, bboxes, image):
    draw = ImageDraw.Draw(image)
    for boxes, colour in zip(F.separate(bboxes, axis=0), self.colours):
        num_boxes = boxes.shape[0]
        for i, bbox in enumerate(F.separate(boxes, axis=0)):
            # render all intermediate results with lower alpha than the final one
            fill_colour = colour
            if i < num_boxes - 1:
                if not self.render_intermediate_bboxes:
                    continue
                fill_colour += '88'

            bbox.data[...] = (bbox.data[...] + 1) / 2
            bbox.data[0, :] *= self.image_size.width
            bbox.data[1, :] *= self.image_size.height

            x = self.xp.clip(bbox.data[0, :].reshape(self.out_size), 0, self.image_size.width)
            y = self.xp.clip(bbox.data[1, :].reshape(self.out_size), 0, self.image_size.height)

            top_left = (x[0, 0], y[0, 0])
            top_right = (x[0, -1], y[0, -1])
            bottom_left = (x[-1, 0], y[-1, 0])
            bottom_right = (x[-1, -1], y[-1, -1])

            corners = [top_left, top_right, bottom_right, bottom_left]
            next_corners = corners[1:] + [corners[0]]

            for first_corner, next_corner in zip(corners, next_corners):
                draw.line([first_corner, next_corner], fill=fill_colour, width=3)
def __call__(self, *inputs):
    images, labels = inputs[:2]
    with cuda.Device(self.device):
        _, bboxes = self.link(images)

    bboxes = cuda.to_cpu(bboxes.data)
    labels = cuda.to_cpu(labels)
    xp = cuda.get_array_module(bboxes)

    bboxes = self.extract_corners(bboxes)
    bboxes = self.scale_bboxes(bboxes, Size._make(images.shape[-2:]))

    ious = bbox_iou(bboxes.data.copy(), xp.squeeze(labels))[xp.eye(len(bboxes)).astype(bool)]
    mean_iou = ious.mean()
    reporter.report({'mean_iou': mean_iou})

    pred_bboxes = [bbox.data[xp.newaxis, ...].astype(xp.int32)
                   for bbox in F.separate(bboxes, axis=0)]
    pred_scores = xp.ones((len(bboxes), 1))
    pred_labels = xp.zeros_like(pred_scores)
    gt_bboxes = [bbox.data[...] for bbox in F.separate(labels, axis=0)]
    gt_labels = xp.zeros_like(pred_scores)

    result = chainercv.evaluations.eval_detection_voc(
        pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels)
    reporter.report({'map': result['map']})
    reporter.report({'ap/sheep': result['ap'][0]})
def draw_bboxes(self, bboxes, image):
    draw = ImageDraw.Draw(image)
    for i, sub_box in enumerate(F.separate(bboxes, axis=1)):
        for bbox, colour in zip(F.separate(sub_box, axis=0), self.colours):
            bbox.data[...] = (bbox.data[...] + 1) / 2
            bbox.data[0, :] *= self.image_size.width
            bbox.data[1, :] *= self.image_size.height

            x = self.xp.clip(bbox.data[0, :].reshape(self.out_size), 0,
                             self.image_size.width) + i * self.image_size.width
            y = self.xp.clip(bbox.data[1, :].reshape(self.out_size), 0,
                             self.image_size.height)

            top_left = (x[0, 0], y[0, 0])
            top_right = (x[0, -1], y[0, -1])
            bottom_left = (x[-1, 0], y[-1, 0])
            bottom_right = (x[-1, -1], y[-1, -1])

            corners = [top_left, top_right, bottom_right, bottom_left]
            next_corners = corners[1:] + [corners[0]]

            for first_corner, next_corner in zip(corners, next_corners):
                draw.line([first_corner, next_corner], fill=colour, width=3)
def calc_loss(self, x, t):
    batch_predictions, _, grids = x
    self.xp = cuda.get_array_module(batch_predictions[0], t)

    # reshape labels
    batch_size = t.shape[0]
    t = F.reshape(t, (batch_size, self.num_timesteps, -1))

    # reshape grids
    grid_shape = grids.shape
    if self.uses_original_data:
        grids = F.reshape(grids, (self.num_timesteps, batch_size, 4,) + grid_shape[1:])
    else:
        grids = F.reshape(grids, (self.num_timesteps, batch_size, 1,) + grid_shape[1:])
    losses = []

    # with cuda.get_device_from_array(grids.data):
    #     grid_list = F.separate(F.reshape(grids, (self.num_timesteps, -1,) + grids.shape[3:]), axis=0)
    #     overlap_losses = []
    #     for grid_1, grid_2 in itertools.combinations(grid_list, 2):
    #         overlap_losses.append(self.calc_iou_loss(grid_1, grid_2))
    #     losses.append(sum(overlap_losses) / max(len(overlap_losses), 1))

    loss_weights = [1, 1.25, 2, 1.25]
    for i, (predictions, grid, labels) in enumerate(
            zip(batch_predictions, F.separate(grids, axis=0), F.separate(t, axis=1)),
            start=1):
        with cuda.get_device_from_array(getattr(predictions, 'data', predictions[0].data)):
            # adapt ctc weight depending on current prediction position and labels
            # if all labels are blank, we want this weight to be full weight!
            overall_loss_weight = loss_weights[i - 1]
            loss = self.calc_actual_loss(predictions, grid, labels)
            # label_lengths = self.get_label_lengths(labels)

            for sub_grid in F.separate(grid, axis=1):
                width, height = self.get_bbox_side_lengths(sub_grid)
                loss += self.area_loss_factor * self.calc_area_loss(width, height)
                loss += self.aspect_ratio_loss_factor * self.calc_aspect_ratio_loss(width, height)
                loss += self.calc_direction_loss(sub_grid)
                loss += self.calc_height_loss(height)
            loss *= overall_loss_weight
            losses.append(loss)

    return sum(losses) / len(losses)
def forward(self, xs, labels):  # xs shape = (batch, T, F, D)
    '''
    :param xs: appearance features of all boxes across all frames, shape (batch, T, F, D)
    :param labels: ground-truth labels of each box in each frame
    :return: fused output of the spatial and temporal branches
    '''
    xp = chainer.cuda.get_array_module(xs.data)
    batch_size = xs.shape[0]
    T = xs.shape[1]
    frame_node = xs.shape[2]
    assert frame_node == self.frame_node_num
    dim = xs.shape[-1]
    orig_labels = labels

    # first frame node_id ==> other frame node_id in same corresponding box
    if self.spatial_edge_mode != SpatialEdgeMode.no_edge:
        if self.spatial_sequence_type == SpatialSequenceType.in_frame:
            input_space = F.separate(
                F.reshape(xs, shape=(batch_size * T, self.frame_node_num, dim)),
                axis=0)  # batch x T, F, D
            labels = F.separate(
                F.reshape(labels, shape=(batch_size * T, self.frame_node_num, labels.shape[-1])),
                axis=0)  # batch x T, F, D
        elif self.spatial_sequence_type == SpatialSequenceType.cross_time:
            input_space = F.separate(
                F.reshape(xs, shape=(batch_size, T * self.frame_node_num, dim)),
                axis=0)  # batch, T x F, D
            labels = F.separate(
                F.reshape(labels, shape=(batch_size, T * self.frame_node_num, labels.shape[-1])),
                axis=0)  # batch, T x F, D

        if self.spatial_edge_mode == SpatialEdgeMode.rnn:
            _, _, space_out = self.space_bi_lstm(hx=None, cx=None, xs=list(input_space))
            space_out = F.stack(space_out)  # B, T, D
            space_out = F.reshape(space_out, (-1, self.space_mid_size))
            space_out = self.space_output(space_out)
        elif self.spatial_edge_mode in (SpatialEdgeMode.ld_rnn, SpatialEdgeMode.bi_ld_rnn):
            space_out = self.space_module(list(input_space), list(labels))
        elif self.spatial_edge_mode == SpatialEdgeMode.no_edge:  # never reached: excluded by the enclosing if
            space_out = self.space_module(F.stack(input_space))
        space_out = F.stack(space_out)  # batch * T, F, D
        space_out = F.reshape(space_out, (batch_size, T, frame_node, self.out_size))
    else:
        input_space = F.reshape(xs, shape=(-1, self.in_size))
        space_out = self.space_module(input_space)
        space_out = F.reshape(space_out, (batch_size, T, frame_node, self.out_size))

    temporal_out_dict = self.temporal_node_recurrent_forward(xs, orig_labels)
    # shape = F, B, T, mid_size
    temporal_out = F.stack([node_out_ for _, node_out_ in
                            sorted(temporal_out_dict.items(), key=lambda e: int(e[0]))])
    temporal_out = F.transpose(temporal_out, (1, 2, 0, 3))  # shape = (B, T, F, D)

    if self.spatial_edge_mode == SpatialEdgeMode.no_edge and self.temporal_edge_mode != TemporalEdgeMode.no_temporal:
        return temporal_out
    elif self.temporal_edge_mode == TemporalEdgeMode.no_temporal and self.spatial_edge_mode != SpatialEdgeMode.no_edge:
        return space_out
    elif self.temporal_edge_mode == TemporalEdgeMode.no_temporal and self.spatial_edge_mode == SpatialEdgeMode.no_edge:
        return temporal_out
    elif self.temporal_edge_mode != TemporalEdgeMode.no_temporal and self.spatial_edge_mode != SpatialEdgeMode.no_edge:
        final_out = space_out * temporal_out
        return final_out
def decode_prediction(self, prediction):
    words = []
    for box in F.separate(prediction, axis=1):
        word = [F.argmax(F.softmax(character), axis=1)
                for character in F.separate(box, axis=1)]
        words.append(F.stack(word, axis=1))
    return F.stack(words, axis=1)
def forward(self, xs):
    space_output = None
    temporal_output = None
    if self.temporal_edge_mode != TemporalEdgeMode.no_temporal:
        temporal_input = F.transpose(xs, axes=(0, 2, 1, 3))  # B, F, T, D
        assert temporal_input.shape[1] == config.BOX_NUM[self.database]
        all_temporal_output = []
        for idx, temporal_input_each_box in enumerate(
                F.separate(temporal_input, axis=1)):  # B, F, T, D => F list of (B, T, D)
            temporal_input_each_box = list(
                F.separate(temporal_input_each_box, axis=0))  # list of (T, D)
            _, _, temporal_output = getattr(
                self, "temporal_lstm_{}".format(idx))(None, None, temporal_input_each_box)
            temporal_output = F.stack(temporal_output)  # B, T, D
            all_temporal_output.append(temporal_output)
        all_temporal_output = F.stack(all_temporal_output, axis=1)  # B, F, T, D
        temporal_output = F.transpose(all_temporal_output, axes=(0, 2, 1, 3))  # B, T, F, D

    if self.spatial_edge_mode != SpatialEdgeMode.no_edge:
        minibatch, T, frame_box, _ = xs.shape  # B, T, F, D
        space_input = F.reshape(
            xs, shape=(xs.shape[0] * xs.shape[1], xs.shape[2], xs.shape[3]))  # B*T, F, D
        space_input = list(F.separate(space_input, axis=0))  # list of (F, D)
        _, _, space_output = self.space_fc_lstm(None, None, space_input)  # B*T, F, 1024
        space_output = F.stack(space_output)  # B*T, F, 1024
        space_output = F.reshape(space_output, shape=(minibatch, T, frame_box, -1))  # B, T, F, D

    if self.temporal_edge_mode != TemporalEdgeMode.no_temporal and self.spatial_edge_mode != SpatialEdgeMode.no_edge:
        assert space_output.shape == temporal_output.shape
        fusion_output = F.concat([space_output, temporal_output], axis=3)
    elif self.spatial_edge_mode != SpatialEdgeMode.no_edge:
        fusion_output = space_output
    elif self.temporal_edge_mode != TemporalEdgeMode.no_temporal:
        fusion_output = temporal_output

    fc_input = F.reshape(
        fusion_output,
        shape=(fusion_output.shape[0] * fusion_output.shape[1] * fusion_output.shape[2], -1))
    score = self.score_fc(fc_input)
    return F.reshape(
        score,
        shape=(fusion_output.shape[0], fusion_output.shape[1], fusion_output.shape[2], -1))
def loss_information(enc, x):
    p_logit = enc(x)
    p = F.sigmoid(p_logit)
    p_ave = F.sum(p, axis=0) / x.data.shape[0]

    cond_ent = F.sum(-p * F.log(p + 1e-8) -
                     (1 - p) * F.log(1 - p + 1e-8)) / p.data.shape[0]
    marg_ent = F.sum(-p_ave * F.log(p_ave + 1e-8) -
                     (1 - p_ave) * F.log(1 - p_ave + 1e-8))

    p_ave = F.reshape(p_ave, (1, len(p_ave.data)))
    p_ave_separated = F.separate(p_ave, axis=1)
    p_separated = F.separate(F.expand_dims(p, axis=2), axis=1)

    p_ave_list_i = []
    p_ave_list_j = []
    p_list_i = []
    p_list_j = []
    for i in range(n_bit - 1):
        p_ave_list_i.extend(list(p_ave_separated[i + 1:]))
        p_list_i.extend(list(p_separated[i + 1:]))
        p_ave_list_j.extend([p_ave_separated[i] for n in range(n_bit - i - 1)])
        p_list_j.extend([p_separated[i] for n in range(n_bit - i - 1)])

    p_ave_pair_i = F.expand_dims(F.concat(tuple(p_ave_list_i), axis=0), axis=1)
    p_ave_pair_j = F.expand_dims(F.concat(tuple(p_ave_list_j), axis=0), axis=1)
    p_pair_i = F.expand_dims(F.concat(tuple(p_list_i), axis=1), axis=2)
    p_pair_j = F.expand_dims(F.concat(tuple(p_list_j), axis=1), axis=2)

    p_pair_stacked_i = F.concat(
        (p_pair_i, 1 - p_pair_i, p_pair_i, 1 - p_pair_i), axis=2)
    p_pair_stacked_j = F.concat(
        (p_pair_j, p_pair_j, 1 - p_pair_j, 1 - p_pair_j), axis=2)
    p_ave_pair_stacked_i = F.concat(
        (p_ave_pair_i, 1 - p_ave_pair_i, p_ave_pair_i, 1 - p_ave_pair_i), axis=1)
    p_ave_pair_stacked_j = F.concat(
        (p_ave_pair_j, p_ave_pair_j, 1 - p_ave_pair_j, 1 - p_ave_pair_j), axis=1)

    p_product = F.sum(p_pair_stacked_i * p_pair_stacked_j, axis=0) / len(p.data)
    p_ave_product = p_ave_pair_stacked_i * p_ave_pair_stacked_j
    pairwise_mi = 2 * F.sum(
        p_product * F.log((p_product + 1e-8) / (p_ave_product + 1e-8)))
    return cond_ent, marg_ent, pairwise_mi
def calc_loss(self, predictions, labels):
    recognition_losses = []
    assert predictions.shape[1] == labels.shape[1], \
        "Number of boxes is not equal in predictions and labels"
    for box, box_labels in zip(F.separate(predictions, axis=1), F.separate(labels, axis=1)):
        assert box.shape[1] == box_labels.shape[1], \
            "Number of predicted chars is not equal to number of chars in label"
        box_losses = [
            F.softmax_cross_entropy(char, char_label, reduce="no")
            for char, char_label in zip(F.separate(box, axis=1),
                                        F.separate(box_labels, axis=1))
        ]
        recognition_losses.append(F.stack(box_losses))
    return F.mean(F.stack(recognition_losses))
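# Toy sketch of the nested F.separate pattern used in calc_loss above (made-up
# shapes, not the repo's data): axis=1 first peels off each box, then each
# character slot, so softmax_cross_entropy sees plain (batch, classes) arrays.
import numpy as np
import chainer.functions as F

preds = np.random.rand(2, 3, 5, 10).astype(np.float32)         # B, boxes, chars, classes
labels = np.random.randint(0, 10, (2, 3, 5)).astype(np.int32)  # B, boxes, chars
losses = []
for box, box_labels in zip(F.separate(preds, axis=1), F.separate(labels, axis=1)):
    for char, char_label in zip(F.separate(box, axis=1), F.separate(box_labels, axis=1)):
        losses.append(F.softmax_cross_entropy(char, char_label))
loss = F.mean(F.stack(losses))  # scalar over all 3 * 5 character slots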
def test_ctc_loss():
    pytest.importorskip("torch")
    pytest.importorskip("warpctc_pytorch")
    import torch
    import warpctc_pytorch

    from espnet.nets.e2e_asr_th import pad_list

    n_out = 7
    input_length = numpy.array([11, 17, 15], dtype=numpy.int32)
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    np_pred = [numpy.random.rand(il, n_out).astype(numpy.float32)
               for il in input_length]
    np_target = [numpy.random.randint(0, n_out, size=ol, dtype=numpy.int32)
                 for ol in label_length]

    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(
        ch_pred, ch_target, 0, input_length, label_length).data

    th_pred = pad_list([torch.from_numpy(x) for x in np_pred], 0.0).transpose(0, 1)
    th_target = torch.from_numpy(numpy.concatenate(np_target))
    th_ilen = torch.from_numpy(input_length)
    th_olen = torch.from_numpy(label_length)
    th_loss = warpctc_pytorch.CTCLoss(size_average=True)(
        th_pred, th_target, th_ilen, th_olen).data.numpy()[0]

    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
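# Shape check for the F.separate(..., axis=-2) call in the test above (toy
# sizes only): pad_sequence gives (B, Tmax, V), and separating on the time
# axis yields the Tmax-long list of (B, V) frames that
# F.connectionist_temporal_classification expects.
import numpy
import chainer.functions as F

seqs = [numpy.random.rand(t, 7).astype(numpy.float32) for t in (11, 17, 15)]
frames = F.separate(F.pad_sequence(seqs), axis=-2)
assert len(frames) == 17 and frames[0].shape == (3, 7)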
def arr2list(arr, length=None):
    xs = F.separate(F.swapaxes(arr, 1, 2))
    if length is not None:
        assert len(xs) == len(length)
        xs = [x[:l] for x, l in zip(xs, length)]
    return xs
def forward(self, equery, vmemory, ememory, mask, iteration=0):
    """Compute an attention over memory given the query."""
    # equery.shape == (..., E)
    # vmemory.shape == (..., Ms, M)
    # ememory.shape == (..., Ms, E)
    # mask.shape == (..., Ms)
    # Setup memory embedding
    eq = F.repeat(equery[..., None, :], vmemory.shape[-2], -2)  # (..., Ms, E)
    # Compute content based attention
    merged = F.concat(
        [eq, ememory, eq * ememory, F.squared_difference(eq, ememory)], -1)  # (..., Ms, 4*E)
    inter = self.att_linear(merged, n_batch_axes=len(vmemory.shape) - 1)  # (..., Ms, E)
    inter = F.tanh(inter)  # (..., Ms, E)
    inter = F.dropout(inter, DROPOUT)  # (..., Ms, E)
    # Split into sentences
    lengths = np.sum(np.any((vmemory != 0), -1), -1)  # (...,)
    mems = [s[..., :l, :] for s, l in zip(F.separate(inter, 0), lengths)]  # B x [(M1, E), (M2, E), ...]
    _, bimems = self.att_birnn(None, mems)  # B x [(M1, 2*E), (M2, 2*E), ...]
    bimems = F.pad_sequence(bimems)  # (..., Ms, 2*E)
    att = self.att_score(bimems, n_batch_axes=len(vmemory.shape) - 1)  # (..., Ms, 1)
    att = F.squeeze(att, -1)  # (..., Ms)
    if mask is not None:
        att += mask * MINUS_INF  # (..., Ms)
    return att
def seq_rnn_embed(vxs, exs, birnn, return_seqs=False):
    """Embed given sequences using rnn."""
    # vxs.shape == (..., S)
    # exs.shape == (..., S, E)
    assert vxs.shape == exs.shape[:-1], "Sequence embedding dimensions do not match."
    lengths = np.sum(vxs != 0, -1).flatten()  # (X,)
    seqs = F.reshape(exs, (-1,) + exs.shape[-2:])  # (X, S, E)
    toembed = [s[..., :l, :]
               for s, l in zip(F.separate(seqs, 0), lengths)
               if l != 0]  # Y x [(S1, E), (S2, E), ...]
    hs, ys = birnn(None, toembed)  # (2, Y, E), Y x [(S1, 2*E), (S2, 2*E), ...]
    if return_seqs:
        ys = F.pad_sequence(ys)  # (Y, S, 2*E)
        ys = F.reshape(ys, ys.shape[:-1] + (2, EMBED))  # (Y, S, 2, E)
        ys = F.mean(ys, -2)  # (Y, S, E)
        if ys.shape[0] == lengths.size:
            ys = F.reshape(ys, exs.shape)  # (..., S, E)
            return ys
        embeds = np.zeros((lengths.size, vxs.shape[-1], EMBED), dtype=np.float32)  # (X, S, E)
        idxs = np.nonzero(lengths)  # (Y,)
        embeds = F.scatter_add(embeds, idxs, ys)  # (X, S, E)
        embeds = F.reshape(embeds, exs.shape)  # (..., S, E)
        return embeds  # (..., S, E)
    hs = F.mean(hs, 0)  # (Y, E)
    if hs.shape[0] == lengths.size:
        hs = F.reshape(hs, vxs.shape[:-1] + (EMBED,))  # (..., E)
        return hs
    # Add zero values back to match original shape
    embeds = np.zeros((lengths.size, EMBED), dtype=np.float32)  # (X, E)
    idxs = np.nonzero(lengths)  # (Y,)
    embeds = F.scatter_add(embeds, idxs, hs)  # (X, E)
    embeds = F.reshape(embeds, vxs.shape[:-1] + (EMBED,))  # (..., E)
    return embeds  # (..., E)
def translate(self, hxs, max_length=100):
    batch_size, _, _ = hxs.shape
    compute_context = self.attention(hxs)
    c = Variable(self.xp.zeros((batch_size, self.n_units), 'f'))
    h = Variable(self.xp.zeros((batch_size, self.n_units), 'f'))
    ys = self.xp.full(batch_size, tokens['<SOS>'], np.int32)
    results = []
    for _ in range(max_length):
        eys = self.embed_y(ys)
        context = compute_context(h)
        concatenated = F.concat([eys, context])
        c, h = self.lstm(c, h, concatenated)
        concatenated = F.concat([concatenated, h])
        logit = self.w(self.maxout(concatenated))
        y = F.reshape(F.argmax(logit, axis=1), (batch_size,))
        results.append(y)
        ys = y  # feed the prediction back as the next input token
    results = F.separate(F.transpose(F.vstack(results)), axis=0)
    outs = []
    for y in results:
        inds = np.argwhere(y.data == tokens['<EOS>'])
        if len(inds) > 0:
            y = y[:inds[0, 0]]
        outs.append(y)
    return outs
def translate(self, hxs, max_length):
    """Generate target sentences given hidden states of source sentences.

    Args:
        hxs: Hidden states for source sequences.

    Returns:
        ys: Generated sequences.
    """
    batch_size, _, _ = hxs.shape
    compute_context = self.attention(hxs)
    c = Variable(self.xp.zeros((batch_size, self.n_units), 'f'))
    h = F.broadcast_to(self.bos_state, (batch_size, self.n_units))
    # first character's embedding
    previous_embedding = self.embed_y(
        Variable(self.xp.full((batch_size,), EOS, 'i')))
    results = []
    for _ in range(max_length):
        context = compute_context(h)
        concatenated = F.concat((previous_embedding, context))
        c, h = self.lstm(c, h, concatenated)
        concatenated = F.concat((concatenated, h))
        logit = self.w(self.maxout(concatenated))
        y = F.reshape(F.argmax(logit, axis=1), (batch_size,))
        results.append(y)
        previous_embedding = self.embed_y(y)
    results = F.separate(F.transpose(F.vstack(results)), axis=0)
    ys = [get_subsequence_before_eos(result.data) for result in results]
    return ys
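# Shape sketch (toy values, not from either repo) of the decode epilogue the
# translate implementations above share: per-step argmax results of shape (B,)
# are stacked to (T, B), transposed to (B, T), and separated into one length-T
# prediction per sentence.
import numpy as np
import chainer.functions as F

results = [np.array([1, 2], dtype=np.int32) for _ in range(5)]  # T=5 steps of (B=2,)
per_sentence = F.separate(F.transpose(F.vstack(results)), axis=0)
assert len(per_sentence) == 2 and per_sentence[0].shape == (5,)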
def __call__(self, input_tensor, cur_state):
    # input_tensor and cur_state are B, F, C, H, W
    h_cur, c_cur = cur_state  # B, F, C, H, W
    mini_batch, frame_box_num, channel, height, width = input_tensor.shape
    combined = F.concat([input_tensor, h_cur], axis=2)  # concatenate along channel axis: B, F, C+hidden_dim, H, W
    assert frame_box_num == self.group_num
    combined = F.reshape(
        combined,
        shape=(mini_batch, frame_box_num * combined.shape[2], height, width))  # B, F * (C+hidden_dim), H, W
    conv_output = self.conv(combined)  # B, F * 4 * hidden_dim, H, W
    conv_output = F.reshape(
        conv_output,
        shape=(mini_batch, self.group_num, 4, self.hidden_dim,
               self.height, self.width))  # B, F, 4, hidden_dim, H, W
    cc_i, cc_f, cc_o, cc_g = F.separate(conv_output, axis=2)  # each: B, F, hidden_dim, H, W
    i = F.sigmoid(cc_i)
    f = F.sigmoid(cc_f)
    o = F.sigmoid(cc_o)
    g = F.tanh(cc_g)

    c_next = f * c_cur + i * g
    h_next = o * F.tanh(c_next)
    return h_next, c_next
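# Minimal sketch of the gate split above with made-up sizes: the convolution
# output is reshaped so the four ConvLSTM gates sit on their own axis, and a
# single F.separate(..., axis=2) call peels them apart.
import numpy as np
import chainer.functions as F

hidden_dim, H, W = 3, 5, 5
conv_output = np.random.rand(2, 4, 4, hidden_dim, H, W).astype(np.float32)  # B, F, 4, hid, H, W
cc_i, cc_f, cc_o, cc_g = F.separate(conv_output, axis=2)  # each: B, F, hid, H, W
assert F.sigmoid(cc_i).shape == (2, 4, hidden_dim, H, W)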
def forward(self, xs, n_speakers, activation=None):
    ilens = [x.shape[0] for x in xs]
    # xs: (B, T, F)
    xs = F.pad_sequence(xs, padding=-1)
    pad_shape = xs.shape
    # emb: (B*T, E)
    emb = self.enc(xs)
    # emb: (B, T, E)
    emb = F.separate(emb.reshape(pad_shape[0], pad_shape[1], -1), axis=0)
    emb = [F.get_item(e, slice(0, ilen)) for e, ilen in zip(emb, ilens)]
    emb2 = [cp.random.permutation(e) for e in emb]

    # get name: main              - num_speakers=n_speakers, to_train=1
    #           validation/main   - num_speakers=n_speakers, to_train=0
    #           validation_1/main - num_speakers=None, to_train=0
    name = reporter.get_current_reporter()._observer_names[id(self)]
    num_speakers = None if name == "validation_1/main" else n_speakers
    to_train = 1 if name == 'main' else 0

    # h_0: (1, B, F)
    # c_0: (1, B, F)
    h_0, c_0 = self.encoder(emb2)
    # A: (B, n_spk, F)
    # P: (B, n_spk, 1)
    A, P = self.decoder(h_0, c_0, n_speakers=num_speakers, to_train=to_train)

    # ys: (B, T, n_spk)
    ys = [F.matmul(e, a.T) for a, e in zip(A, emb)]
    return ys, P
def __call__(self, rois):
    batch_size, num_bboxes, num_channels, height, width = rois.shape
    rois = F.reshape(rois, (-1, num_channels, height, width))

    # if not chainer.config.user_text_recognition_grayscale_input:
    #     # convert data to grayscale
    #     assert rois.shape[1] == 3, "rois are not in RGB, can not convert them to grayscale"
    #     r, g, b = F.separate(rois, axis=1)
    #     grey = 0.299 * r + 0.587 * g + 0.114 * b
    #     rois = F.stack([grey, grey, grey], axis=1)

    h = self.feature_extractor(rois)
    _, num_channels, feature_height, feature_width = h.shape
    h = F.average_pooling_2d(h, (feature_height, feature_width))
    h = F.reshape(h, (batch_size, num_bboxes, num_channels, -1))

    all_predictions = []
    for box in F.separate(h, axis=1):
        # box_predictions = [self.classifier(self.lstm(box)) for _ in range(self.num_chars)]
        box_predictions = [self.classifier(box) for _ in range(self.num_chars)]
        all_predictions.append(F.stack(box_predictions, axis=1))

    # return shape: batch_size, num_bboxes, num_chars, num_classes
    return F.stack(all_predictions, axis=2)
def prob(self, x):
    assert x.shape[1] == len(self.distributions)
    prob_all = 1
    for value, distribution in zip(list(F.separate(x, axis=1)), self.distributions):
        prob_all *= distribution.prob(value)
    return prob_all
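# Column-wise sketch of the loop in prob above (hypothetical stand-in
# distributions; the real self.distributions may differ): F.separate(x, axis=1)
# turns a (B, D) batch into D column variables, one per independent
# distribution, whose probabilities are multiplied elementwise.
import numpy as np
import chainer.functions as F
import chainer.distributions as D

x = np.random.rand(4, 2).astype(np.float32)
dists = [D.Normal(loc=np.zeros((), np.float32), scale=np.ones((), np.float32))
         for _ in range(2)]
prob_all = 1
for value, dist in zip(F.separate(x, axis=1), dists):
    prob_all *= dist.prob(value)
assert prob_all.shape == (4,)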
def attend(self, encoded_features):
    self.out_lstm.reset_state()
    transformed_encoded_features = F.concat(
        [F.expand_dims(self.transform_encoded_features(feature), axis=1)
         for feature in encoded_features], axis=1)
    concat_encoded_features = F.concat(
        [F.expand_dims(e, axis=1) for e in encoded_features], axis=1)

    lstm_output = self.xp.zeros_like(encoded_features[0])
    outputs = []
    for _ in range(self.num_labels):
        transformed_lstm_output = self.transform_out_lstm_feature(lstm_output)
        attended_feats = []
        for transformed_encoded_feature in F.separate(transformed_encoded_features, axis=1):
            attended_feat = transformed_encoded_feature + transformed_lstm_output
            attended_feat = F.tanh(attended_feat)
            attended_feats.append(self.generate_attended_feat(attended_feat))

        attended_feats = F.concat(attended_feats, axis=1)
        alphas = F.softmax(attended_feats, axis=1)

        lstm_input_feature = F.batch_matmul(alphas, concat_encoded_features, transa=True)
        lstm_input_feature = F.squeeze(lstm_input_feature, axis=1)
        lstm_output = self.out_lstm(lstm_input_feature)
        outputs.append(lstm_output)
    return outputs
def __call__(self, hs, ys):
    """CTC forward.

    Args:
        hs (list of chainer.Variable | N-dimension array): Input variable from encoder.
        ys (list of chainer.Variable | N-dimension array): Input variable of decoder.

    Returns:
        chainer.Variable: A variable holding a scalar value of the CTC loss.
    """
    self.loss = None
    ilens = [x.shape[0] for x in hs]
    olens = [x.shape[0] for x in ys]

    # zero padding for hs
    y_hat = linear_tensor(self.ctc_lo, F.dropout(
        F.pad_sequence(hs), ratio=self.dropout_rate))
    y_hat = F.separate(y_hat, axis=1)  # ilen list of batch x hdim

    # zero padding for ys
    y_true = F.pad_sequence(ys, padding=-1)  # batch x olen

    # get length info
    input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
    label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
    logging.info(self.__class__.__name__ + ' input lengths: ' + str(input_length.data))
    logging.info(self.__class__.__name__ + ' output lengths: ' + str(label_length.data))

    # get ctc loss
    self.loss = F.connectionist_temporal_classification(
        y_hat, y_true, 0, input_length, label_length)
    logging.info('ctc loss:' + str(self.loss.data))

    return self.loss
def __call__(self, hs, ys):
    '''CTC forward

    :param hs: list of hidden state sequences from the encoder
    :param ys: list of label sequences
    :return: scalar CTC loss value
    '''
    self.loss = None
    ilens = [x.shape[0] for x in hs]
    olens = [x.shape[0] for x in ys]

    # zero padding for hs
    y_hat = linear_tensor(self.ctc_lo, F.dropout(
        F.pad_sequence(hs), ratio=self.dropout_rate))
    y_hat = F.separate(y_hat, axis=1)  # ilen list of batch x hdim

    # zero padding for ys
    y_true = F.pad_sequence(ys, padding=-1)  # batch x olen

    # get length info
    input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
    label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
    logging.info(self.__class__.__name__ + ' input lengths: ' + str(input_length.data))
    logging.info(self.__class__.__name__ + ' output lengths: ' + str(label_length.data))

    # get ctc loss
    self.loss = F.connectionist_temporal_classification(
        y_hat, y_true, 0, input_length, label_length)
    logging.info('ctc loss:' + str(self.loss.data))

    return self.loss
def translate(
        self, encoded: Variable, max_length: int = 100
) -> List[ndarray]:
    sentence_count = encoded.shape[0]
    self.setup(encoded)
    cell, state, previous_words = self.get_initial_states(sentence_count)

    result = []
    for _ in range(max_length):
        cell, state, context, concatenated = \
            self.advance_one_step(cell, state, previous_words)
        logit, state = self.compute_logit(concatenated, state, context)
        output_id = F.reshape(F.argmax(logit, axis=1), (sentence_count,))
        result.append(output_id)
        previous_words = output_id

    # Remove words after <EOS>
    outputs = F.separate(F.transpose(F.vstack(result)), axis=0)
    assert len(outputs) == sentence_count
    output_sentences = []
    for output in outputs:
        assert output.shape == (max_length,)
        indexes = np.argwhere(output.data == EOS)
        if len(indexes) > 0:
            output = output[:indexes[0, 0] + 1]
        output_sentences.append(output.data)

    return output_sentences
def calc_accuracy(self, x, t):
    batch_predictions, _, _ = x
    self.xp = cuda.get_array_module(batch_predictions[0], t)
    batch_size = t.shape[0]
    t = F.reshape(t, (batch_size, self.num_timesteps, -1))
    accuracies = []

    for predictions, labels in zip(batch_predictions, F.separate(t, axis=1)):
        if isinstance(predictions, list):
            predictions = F.concat(
                [F.expand_dims(p, axis=0) for p in predictions], axis=0)
        with cuda.get_device_from_array(predictions.data):
            classification = F.softmax(predictions, axis=2)
            classification = classification.data
            classification = self.xp.argmax(classification, axis=2)
            classification = self.xp.transpose(classification, (1, 0))

            words = self.strip_prediction(classification)
            labels = self.strip_prediction(labels.data)

            num_correct_words = 0
            for word, label in zip(words, labels):
                word = "".join(map(self.label_to_char, word))
                label = "".join(map(self.label_to_char, label))
                if word == label:
                    num_correct_words += 1

            accuracy = num_correct_words / len(labels)
            accuracies.append(accuracy)

    overall_accuracy = sum(accuracies) / max(len(accuracies), 1)
    self.scale_area_loss_factor(overall_accuracy)
    return overall_accuracy
def _call_1layer(net: NStepRNNBase, hidden: Optional[ArrayLike], input: ArrayLike):
    if hidden is not None:
        hidden = hidden[np.newaxis]
    _, hidden = net(hx=hidden, xs=F.separate(input, axis=0))
    hidden = F.stack(hidden, axis=0)
    return hidden
def test_ctc_loss():
    pytest.importorskip("torch")
    pytest.importorskip("warpctc_pytorch")
    import torch
    from warpctc_pytorch import CTCLoss

    from e2e_asr_attctc_th import pad_list

    n_out = 7
    n_batch = 3
    input_length = numpy.array([11, 17, 15], dtype=numpy.int32)
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    np_pred = [numpy.random.rand(il, n_out).astype(numpy.float32)
               for il in input_length]
    np_target = [numpy.random.randint(0, n_out, size=ol, dtype=numpy.int32)
                 for ol in label_length]

    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr_attctc.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(
        ch_pred, ch_target, 0, input_length, label_length).data

    th_pred = pad_list([torch.autograd.Variable(torch.from_numpy(x))
                        for x in np_pred]).transpose(0, 1)
    th_target = torch.autograd.Variable(torch.from_numpy(numpy.concatenate(np_target)))
    th_ilen = torch.autograd.Variable(torch.from_numpy(input_length))
    th_olen = torch.autograd.Variable(torch.from_numpy(label_length))
    # NOTE: warpctc_pytorch.CTCLoss does not normalize itself by batch-size
    # while chainer's default setting does
    th_loss = (CTCLoss()(th_pred, th_target, th_ilen, th_olen) / n_batch).data.numpy()[0]

    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
def __call__(self, images, localizations):
    points = F.spatial_transformer_grid(localizations, self.target_shape)
    rois = F.spatial_transformer_sampler(images, points)

    # h = self.data_bn(rois)
    h = F.relu(self.bn0(self.conv0(rois)))
    h = F.average_pooling_2d(h, 2, stride=2)

    h = self.rs1(h)
    h = self.rs2(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs3(h)
    self.vis_anchor = h

    h = F.average_pooling_2d(h, 5, stride=1)
    h = F.relu(self.fc1(h))

    # for each timestep of the localization net do the 'classification'
    h = F.reshape(h, (self.num_timesteps * 2 + 1, -1, self.fc1.out_size))
    overall_predictions = []
    for timestep in F.separate(h, axis=0):
        # go 2x num_labels plus 1 timesteps because of ctc loss
        lstm_predictions = []
        self.lstm.reset_state()
        for _ in range(self.num_labels):
            lstm_prediction = self.lstm(timestep)
            classified = self.classifier(lstm_prediction)
            lstm_predictions.append(classified)
        overall_predictions.append(lstm_predictions)

    return overall_predictions, rois, points
def __call__(self, h):
    # type: (chainer.Variable) -> chainer.Variable
    xp = cuda.get_array_module(h)
    mb, node, ch = h.shape  # type: int, int, int
    if self.q_star is None:
        self.q_star = [
            xp.zeros((1, self.in_channels * 2)).astype('f') for _ in range(mb)
        ]
    self.hx, self.cx, q = self.lstm_layer(self.hx, self.cx, self.q_star)
    # self.hx: (mb, mb, ch)
    # self.cx: (mb, mb, ch)
    # q: List[(1, ch) * mb]
    q = functions.stack(q)  # q: (mb, 1, ch)
    q_ = functions.transpose(q, axes=(0, 2, 1))  # q_: (mb, ch, 1)
    e = functions.matmul(h, q_)  # e: (mb, node, 1)
    a = functions.softmax(e)  # a: (mb, node, 1)
    a = functions.broadcast_to(a, h.shape)  # a: (mb, node, ch)
    r = functions.sum((a * h), axis=1, keepdims=True)  # r: (mb, 1, ch)
    q_star_ = functions.concat((q, r), axis=2)  # q_star_: (mb, 1, ch*2)
    self.q_star = functions.separate(q_star_)
    return functions.reshape(q_star_, (mb, ch * 2))
def __call__(self, input_ids, input_mask, token_type_ids):
    final_hidden = self.bert.get_sequence_output(
        input_ids, input_mask, token_type_ids)
    batch_size = final_hidden.shape[0]
    seq_length = final_hidden.shape[1]
    hidden_size = final_hidden.shape[2]

    final_hidden_matrix = F.reshape(
        final_hidden, [batch_size * seq_length, hidden_size])

    logits = self.output(final_hidden_matrix)
    logits = F.reshape(logits, [batch_size, seq_length, 2])
    logits = logits - (1 - input_mask[:, :, None]) * 1000.  # ignore pads
    logits = F.transpose(logits, [2, 0, 1])

    unstacked_logits = F.separate(logits, axis=0)
    (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])
    return (start_logits, end_logits)
def forward(self, inputs, device):
    x, = inputs
    return functions.separate(x, self.axis)
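# The basic contract the forward above exercises, as a standalone toy example:
# F.separate(x, axis) returns a tuple of x.shape[axis] variables with that
# axis removed.
import numpy as np
import chainer.functions as F

x = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
ys = F.separate(x, axis=1)
assert len(ys) == 3 and ys[0].shape == (2, 4)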