def transduce(self, sent: ExpressionSequence) -> ExpressionSequence:
  if self.pos_encoding_type == "trigonometric":
    if self.position_encoding_block is None or self.position_encoding_block.shape[2] < len(sent):
      self.initialize_position_encoding(int(len(sent) * 1.2),
                                        self.input_dim if self.pos_encoding_combine == "add" else self.pos_encoding_size)
    encoding = dy.inputTensor(self.position_encoding_block[0, :, :len(sent)])
  elif self.pos_encoding_type == "embedding":
    encoding = self.positional_embedder.embed_sent(len(sent)).as_tensor()
  elif self.pos_encoding_type:
    # reachable only for unsupported settings; the original placement after the
    # combine-step 'if' could never trigger
    raise ValueError(f"unknown encoding type {self.pos_encoding_type}")
  if self.pos_encoding_type:
    if self.pos_encoding_combine == "add":
      sent = ExpressionSequence(expr_tensor=sent.as_tensor() + encoding, mask=sent.mask)
    else:  # concat
      sent = ExpressionSequence(expr_tensor=dy.concatenate([sent.as_tensor(), encoding]), mask=sent.mask)
  for module in self.modules:
    sent = module.transduce(sent)
  self._final_states = [transducers.FinalTransducerState(sent[-1])]
  return sent
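# A minimal sketch of the trigonometric block that initialize_position_encoding is
# expected to produce (an assumption following the standard sinusoidal formulation;
# not necessarily the toolkit's actual implementation). The (1, dim, length) layout
# matches the position_encoding_block[0, :, :len(sent)] indexing above; assumes an
# even dim.
import numpy as np

def make_position_encoding_block(length: int, dim: int) -> np.ndarray:
  positions = np.arange(length)[np.newaxis, :]                        # (1, length)
  inv_freq = np.exp(np.arange(0, dim, 2) * -(np.log(10000.0) / dim))  # (dim/2,)
  block = np.zeros((1, dim, length))
  block[0, 0::2, :] = np.sin(inv_freq[:, np.newaxis] * positions)     # even rows
  block[0, 1::2, :] = np.cos(inv_freq[:, np.newaxis] * positions)     # odd rows
  return block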
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
  sent_len = len(src)
  embeddings = dy.strided_select(dy.parameter(self.embedder), [1, 1], [0, 0],
                                 [self.input_dim, sent_len])
  if self.op == 'sum':
    output = embeddings + src.as_tensor()
  elif self.op == 'concat':
    output = dy.concatenate([embeddings, src.as_tensor()])
  else:
    raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')
  output_seq = ExpressionSequence(expr_tensor=output, mask=src.mask)
  self._final_states = [FinalTransducerState(output_seq[-1])]
  return output_seq
def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  src_tensor = src.as_tensor()
  out_mask = src.mask
  if self.downsample_by > 1:
    assert len(src_tensor.dim()[0]) == 2, \
      f"Downsampling only supported for tensors of order two. Found dims {src_tensor.dim()}"
    (hidden_dim, seq_len), batch_size = src_tensor.dim()
    if seq_len % self.downsample_by != 0:
      raise ValueError(
        "For downsampling, sequence lengths must be multiples of the total reduce factor. "
        "Configure batcher accordingly.")
    src_tensor = dy.reshape(src_tensor,
                            (hidden_dim * self.downsample_by, seq_len // self.downsample_by),
                            batch_size=batch_size)
    if out_mask:
      out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_by)
  output = self.transform.transform(src_tensor)
  if self.downsample_by == 1:
    if len(output.dim()[0]) != len(src_tensor.dim()[0]):  # can happen with seq length 1
      output = dy.reshape(output, src_tensor.dim()[0], batch_size=src_tensor.dim()[1])
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=out_mask)
  self._final_states = [FinalTransducerState(output_seq[-1])]
  return output_seq
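# Why the reshape above downsamples correctly: DyNet tensors are column-major, so
# reshaping (hidden, seq_len) to (hidden * k, seq_len // k) stacks each group of k
# consecutive timesteps into a single column. A numpy illustration using
# Fortran/column-major order to mirror that layout (illustration only):
import numpy as np
hidden, seq_len, k = 2, 4, 2
x = np.arange(hidden * seq_len).reshape((hidden, seq_len), order='F')  # x[:, t] is timestep t
y = x.reshape((hidden * k, seq_len // k), order='F')
assert np.array_equal(y[:, 0], np.concatenate([x[:, 0], x[:, 1]]))     # timesteps 0 and 1 stacked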
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
  src = src.as_tensor()
  src_height = src.dim()[0][0]
  src_width = src.dim()[0][1]
  # src_channels = 1
  batch_size = src.dim()[1]

  # convolution and pooling layers
  # src dim is ((40, 1000), 128)
  src = padding(src, self.filter_width[0] + 3)
  l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filters1),
                            stride=[self.stride[0], self.stride[0]],
                            is_valid=True))  # ((1, 1000, 64), 128)
  pool1 = dy.maxpooling2d(l1, (1, 4), (1, 2), is_valid=True)  # ((1, 499, 64), 128)

  pool1 = padding(pool1, self.filter_width[1] + 3)
  l2 = dy.rectify(dy.conv2d(pool1, dy.parameter(self.filters2),
                            stride=[self.stride[1], self.stride[1]],
                            is_valid=True))  # ((1, 499, 512), 128)
  pool2 = dy.maxpooling2d(l2, (1, 4), (1, 2), is_valid=True)  # ((1, 248, 512), 128)

  pool2 = padding(pool2, self.filter_width[2])
  l3 = dy.rectify(dy.conv2d(pool2, dy.parameter(self.filters3),
                            stride=[self.stride[2], self.stride[2]],
                            is_valid=True))  # ((1, 248, 1024), 128)
  pool3 = dy.max_dim(l3, d=1)

  my_norm = dy.l2_norm(pool3) + 1e-6
  output = dy.cdiv(pool3, my_norm)
  output = dy.reshape(output, (self.num_filters[2],), batch_size=batch_size)
  return ExpressionSequence(expr_tensor=output)
def transduce(self, embed_sent: ExpressionSequence) -> ExpressionSequence:
  src = embed_sent.as_tensor()

  sent_len = src.dim()[0][1]
  batch_size = src.dim()[1]
  pad_size = (self.window_receptor - 1) // 2  # TODO: adapt it also for even window size
  src = dy.concatenate([dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
                        src,
                        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)],
                       d=1)
  padded_sent_len = sent_len + 2 * pad_size

  conv1 = dy.parameter(self.pConv1)
  bias1 = dy.parameter(self.pBias1)
  src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1), batch_size=batch_size)
  cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])

  hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1), batch_size=batch_size)
  if self.non_linearity == 'tanh':
    hidden_layer = dy.tanh(hidden_layer)
  elif self.non_linearity == 'relu':
    hidden_layer = dy.rectify(hidden_layer)
  elif self.non_linearity == 'sigmoid':
    hidden_layer = dy.logistic(hidden_layer)
  # 'linear' leaves the layer unchanged

  for conv_hid, bias_hid in self.builder_layers:
    hidden_layer = dy.conv2d_bias(hidden_layer, dy.parameter(conv_hid), dy.parameter(bias_hid),
                                  stride=[1, 1])
    hidden_layer = dy.reshape(hidden_layer, (self.internal_dim, sent_len, 1), batch_size=batch_size)
    if self.non_linearity == 'tanh':
      hidden_layer = dy.tanh(hidden_layer)
    elif self.non_linearity == 'relu':
      hidden_layer = dy.rectify(hidden_layer)
    elif self.non_linearity == 'sigmoid':
      hidden_layer = dy.logistic(hidden_layer)

  last_conv = dy.parameter(self.last_conv)
  last_bias = dy.parameter(self.last_bias)
  output = dy.conv2d_bias(hidden_layer, last_conv, last_bias, stride=[1, 1])
  output = dy.reshape(output, (sent_len, self.output_dim), batch_size=batch_size)
  output_seq = ExpressionSequence(expr_tensor=output)
  self._final_states = [FinalTransducerState(output_seq[-1])]
  return output_seq
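# Sanity check of the padding arithmetic above (pure Python; assumes an odd
# window_receptor, as the TODO notes): padding (window - 1) // 2 zeros on each side
# makes a 'valid' convolution return exactly sent_len positions, so the reshape to
# (internal_dim, sent_len, 1) is safe.
def conv_output_len(sent_len: int, window_receptor: int) -> int:
  pad_size = (window_receptor - 1) // 2
  padded_len = sent_len + 2 * pad_size
  return padded_len - window_receptor + 1  # length after a 'valid' convolution

assert conv_output_len(sent_len=10, window_receptor=5) == 10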
def transduce(self, seq: ExpressionSequence) -> ExpressionSequence:
  seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
  if self.layer_norm:
    d = seq_tensor.dim()
    seq_tensor = dy.reshape(seq_tensor, (d[0][0],), batch_size=d[0][1] * d[1])
    seq_tensor = dy.layer_norm(seq_tensor, self.ln_g, self.ln_b)
    seq_tensor = dy.reshape(seq_tensor, d[0], batch_size=d[1])
  return ExpressionSequence(expr_tensor=seq_tensor)
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  transduce the sequence

  Args:
    expr_seq: expression sequence to attend over

  Returns:
    expression sequence
  """
  # Start with a [(length, model_size) x batch] tensor
  # B x T x H -> B x H x T
  x = expr_seq.as_tensor()
  x_len = x.size()[1]
  x_batch = x.size()[0]
  # Get the query, key and value vectors:
  # q = bq + x * Wq
  # k = bk + x * Wk
  # v = bv + x * Wv
  q = self.lin_q(x).transpose(1, 2).contiguous()
  k = self.lin_k(x).transpose(1, 2).contiguous()
  v = self.lin_v(x).transpose(1, 2).contiguous()
  # Split to batches [(length, head_dim) x batch * num_heads] tensor
  q, k, v = [temp.view((x_batch * self.num_heads, self.head_dim, x_len)) for temp in (q, k, v)]
  # Do scaled dot product [batch*num_heads, length, length]; rows are keys, columns are queries
  attn_score = torch.matmul(k.transpose(1, 2), q) / sqrt(self.head_dim)
  if expr_seq.mask is not None:
    mask = torch.Tensor(np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0) * -1e10).to(xnmt.device)
    attn_score = attn_score + mask.unsqueeze(2)
  attn_prob = torch.nn.Softmax(dim=1)(attn_score)
  if self.train and self.dropout > 0.0:
    attn_prob = tt.dropout(attn_prob, self.dropout)
  # Reduce using attention and resize to match [(length, model_size) x batch]
  o = torch.matmul(v, attn_prob).view(x_batch, self.input_dim, x_len).transpose(1, 2)
  # Final transformation: o = bo + o * Wo
  o = self.lin_o(o)
  expr_seq = expression_seqs.ExpressionSequence(expr_tensor=o, mask=expr_seq.mask)
  self._final_states = [transducers.FinalTransducerState(expr_seq[-1], None)]
  return expr_seq
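# A standalone shape walk-through of the attention math above, under assumed toy
# sizes (illustration only, decoupled from the transducer's parameters): scores are
# (batch * heads, length, length) with keys as rows and queries as columns, so the
# softmax over dim=1 normalizes each query's distribution over the keys.
import torch
batch, length, heads, head_dim = 2, 5, 2, 4
q = torch.randn(batch * heads, head_dim, length)
k = torch.randn(batch * heads, head_dim, length)
v = torch.randn(batch * heads, head_dim, length)
attn_score = torch.matmul(k.transpose(1, 2), q) / head_dim ** 0.5  # (batch*heads, length, length)
attn_prob = torch.nn.Softmax(dim=1)(attn_score)                    # normalize over keys
o = torch.matmul(v, attn_prob)                                     # (batch*heads, head_dim, length)
assert o.shape == (batch * heads, head_dim, length)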
def transduce(self, seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if self.train and self.dropout > 0.0:
    seq_tensor = dy.dropout(self.child.transduce(seq).as_tensor(), self.dropout) + seq.as_tensor()
  else:
    seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
  if self.layer_norm:
    d = seq_tensor.dim()
    seq_tensor = dy.reshape(seq_tensor, (d[0][0],), batch_size=d[0][1] * d[1])
    seq_tensor = dy.layer_norm(seq_tensor, self.ln_g, self.ln_b)
    seq_tensor = dy.reshape(seq_tensor, d[0], batch_size=d[1])
  return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
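# Why the reshape sandwich around dy.layer_norm above: layer_norm operates on a
# vector, so the ((hidden, time), batch) tensor is folded to ((hidden,), time*batch),
# normalizing every timestep independently, then folded back. A numpy equivalent of
# the per-timestep normalization (illustration only; the epsilon is an assumption):
import numpy as np

def layer_norm_per_timestep(x: np.ndarray, g: np.ndarray, b: np.ndarray) -> np.ndarray:
  # x: (hidden, time); each column (timestep) is normalized to zero mean, unit variance
  mean = x.mean(axis=0, keepdims=True)
  std = x.std(axis=0, keepdims=True)
  return g[:, None] * (x - mean) / (std + 1e-8) + b[:, None]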
def transduce(self, seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if self.train and self.dropout > 0.0:
    seq_tensor = tt.dropout(self.child.transduce(seq).as_tensor(), self.dropout) + seq.as_tensor()
  else:
    seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
  if self.layer_norm:
    batch_size = tt.batch_size(seq_tensor)
    merged_seq_tensor = tt.merge_time_batch_dims(seq_tensor)
    transformed_seq_tensor = self.layer_norm_component.transform(merged_seq_tensor)
    seq_tensor = tt.unmerge_time_batch_dims(transformed_seq_tensor, batch_size)
  return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
  src = src.as_tensor()

  # convolutional layer
  src = padding(src, src.dim()[0][0], src.dim()[0][1], self.filter_width, self.stride, src.dim()[1])
  l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filter_conv),
                            stride=[self.stride, self.stride], is_valid=True))
  timestep = l1.dim()[0][1]
  features = l1.dim()[0][2]
  batch_size = l1.dim()[1]
  # transpose l1 to be (timestep, dim), but keep the batch_size
  rhn_in = dy.reshape(l1, (timestep, features), batch_size=batch_size)
  rhn_in = [dy.pick(rhn_in, i) for i in range(timestep)]

  for l in range(self.rhn_num_hidden_layers):
    rhn_out = []
    # the learned initial-state parameter, broadcast across the batch
    prev_state = dy.parameter(self.init[l])
    # recurrent highway network
    for t in range(timestep):
      for m in range(self.rhn_microsteps):
        H = dy.affine_transform([dy.parameter(self.recur[l][m][1]),
                                 dy.parameter(self.recur[l][m][0]),
                                 prev_state])
        T = dy.affine_transform([dy.parameter(self.recur[l][m][3]),
                                 dy.parameter(self.recur[l][m][2]),
                                 prev_state])
        if m == 0:
          H += dy.parameter(self.linear[l][0]) * rhn_in[t]
          T += dy.parameter(self.linear[l][1]) * rhn_in[t]
        H = dy.tanh(H)
        T = dy.logistic(T)
        prev_state = dy.cmult(1 - T, prev_state) + dy.cmult(T, H)  # ((1024,), batch_size)
      rhn_out.append(prev_state)
    if self.residual and l > 0:
      rhn_out = [sum(x) for x in zip(rhn_out, rhn_in)]
    rhn_in = rhn_out

  # Compute the attention-weighted average of the activations
  rhn_in = dy.concatenate_cols(rhn_in)
  scores = dy.transpose(dy.parameter(self.attention[0][1])) * dy.tanh(dy.parameter(self.attention[0][0]) * rhn_in)  # ((1, 510), batch_size)
  scores = dy.reshape(scores, (scores.dim()[0][1],), batch_size=scores.dim()[1])
  # rhn_in is ((1024, 510), batch_size); softmax(scores) is ((510,), batch_size)
  attn_out = rhn_in * dy.softmax(scores)
  return ExpressionSequence(expr_tensor=attn_out)
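# The inner loop above implements the recurrent highway microstep
#   s_m = T * tanh(W_H s_{m-1} + b_H [+ U_H x, at m = 0]) + (1 - T) * s_{m-1}
# with transform gate T = sigmoid(W_T s_{m-1} + b_T [+ U_T x]) and carry gate 1 - T.
# A scalar sketch of one microstep given the pre-activations (illustration only):
import math

def rhn_microstep(s_prev: float, h_preact: float, t_preact: float) -> float:
  h = math.tanh(h_preact)                # candidate state H
  t = 1.0 / (1.0 + math.exp(-t_preact))  # transform gate T
  return t * h + (1.0 - t) * s_prev      # cmult(T, H) + cmult(1 - T, prev_state)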
def transduce(self, x: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  x_T = x.as_transposed_tensor()
  scores = x_T * dy.parameter(self.W)
  if x.mask is not None:
    scores = x.mask.add_to_tensor_expr(scores, multiplicator=-100.0, time_first=True)
  if self.pos_enc_max:
    seq_len = x_T.dim()[0][0]
    pos_enc = self.pos_enc[:seq_len, :]
    scores = dy.cmult(scores, dy.inputTensor(pos_enc))
  attention = dy.softmax(scores)
  output_expr = x.as_tensor() * attention
  return expression_seqs.ExpressionSequence(expr_tensor=output_expr, mask=None)
def exprseq_pooling(self, exprseq):
  # Reduce to vector; the mask may be absent for un-padded batches
  if exprseq.mask is not None:
    exprseq = ExpressionSequence(expr_tensor=exprseq.mask.add_to_tensor_expr(exprseq.as_tensor(), -1e10),
                                 mask=exprseq.mask)
  if exprseq.expr_tensor is not None:
    if len(exprseq.expr_tensor.dim()[0]) > 1:
      return dy.max_dim(exprseq.expr_tensor, d=1)
    else:
      return exprseq.expr_tensor
  else:
    return dy.emax(exprseq.expr_list)
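# A numpy sketch of the masked max-pooling idea above (illustration only; assumes
# xnmt's convention that mask entries are 1 at padded positions): adding -1e10 at
# masked positions makes them lose every max comparison.
import numpy as np
x = np.array([[1.0, 5.0, 3.0]])
mask = np.array([[0.0, 1.0, 0.0]])       # 1 = padded/masked position (assumed convention)
pooled = (x + mask * -1e10).max(axis=1)
assert pooled[0] == 3.0                  # the masked 5.0 is ignored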
def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  sent_len = src.sent_len()
  batch_size = tt.batch_size(src[0])
  # look up one learned embedding per absolute position, repeated across the batch
  embeddings = self.embeddings(torch.tensor([list(range(sent_len))] * batch_size).to(xnmt.device))
  if self.op == 'sum':
    output = embeddings + src.as_tensor()
  elif self.op == 'concat':
    output = tt.concatenate([embeddings, src.as_tensor()])
  else:
    raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')
  if self.train and self.dropout > 0.0:
    output = tt.dropout(output, self.dropout)
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=src.mask)
  self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
  return output_seq
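# A minimal standalone sketch of the position lookup above, with assumed toy sizes
# (illustration only): an nn.Embedding holds one learned vector per absolute
# position, and the same position indices are repeated for every batch element.
import torch
max_len, emb_dim, batch_size, sent_len = 100, 8, 2, 5
embeddings = torch.nn.Embedding(max_len, emb_dim)
positions = torch.tensor([list(range(sent_len))] * batch_size)  # (batch, sent_len)
pos_emb = embeddings(positions)                                 # (batch, sent_len, emb_dim)
assert pos_emb.shape == (batch_size, sent_len, emb_dim)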
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
  src = src.as_tensor()
  src_height = src.dim()[0][0]
  src_width = 1
  batch_size = src.dim()[1]

  W = dy.parameter(self.pW)
  b = dy.parameter(self.pb)

  src = dy.reshape(src, (src_height, src_width), batch_size=batch_size)
  # fully connected projection, followed by L2 normalization
  l1 = (W * src) + b
  output = dy.cdiv(l1, dy.sqrt(dy.squared_norm(l1)))
  return ExpressionSequence(expr_tensor=output)
def __call__(self, x: ExpressionSequence) -> tt.Tensor:
  """
  Move the time dimension of an input expression into the batch dimension via a reshape.

  Args:
    x: expression of dimensions ((hidden, timesteps), batch_size)

  Returns:
    expression of dimensions ((hidden,), timesteps*batch_size)
  """
  batch_size = x[0].dim()[1]
  model_dim = x[0].dim()[0][0]
  seq_len = x.sent_len()
  total_words = seq_len * batch_size
  input_tensor = x.as_tensor()
  return dy.reshape(input_tensor, (model_dim,), batch_size=total_words)
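# A numpy sketch of why this fold is safe (column-major order mirrors DyNet's
# layout; illustration only): column t + time * b of the merged tensor is exactly
# timestep t of batch element b, and the inverse reshape recovers the original.
import numpy as np
hidden, time, batch = 3, 4, 2
x = np.arange(hidden * time * batch).reshape((hidden, time, batch), order='F')
merged = x.reshape((hidden, time * batch), order='F')  # ((hidden,), time*batch)
assert np.array_equal(merged[:, 1 + time * 1], x[:, 1, 1])
assert np.array_equal(merged.reshape((hidden, time, batch), order='F'), x)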
def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  src_tensor = src.as_tensor()
  out_mask = src.mask
  if self.downsample_by > 1:
    assert src_tensor.dim() == 3, \
      f"Downsampling only supported for tensors of order two (+ batch). Found dims {src_tensor.size()}"
    batch_size, seq_len, hidden_dim = src_tensor.size()
    if seq_len % self.downsample_by != 0:
      raise ValueError(
        "For downsampling, sequence lengths must be multiples of the total reduce factor. "
        "Configure batcher accordingly.")
    src_tensor = src_tensor.view((batch_size, seq_len // self.downsample_by, hidden_dim * self.downsample_by))
    if out_mask:
      out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_by)
  output = self.transform.transform(src_tensor)
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=out_mask)
  self._final_states = [FinalTransducerState(output_seq[-1])]
  return output_seq
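# The torch counterpart of the DyNet reshape trick, now row-major: viewing
# (batch, seq_len, hidden) as (batch, seq_len // k, hidden * k) concatenates each
# group of k consecutive timesteps along the feature axis (illustration only):
import torch
batch, seq_len, hidden, k = 1, 4, 2, 2
x = torch.arange(batch * seq_len * hidden).view(batch, seq_len, hidden)
y = x.view(batch, seq_len // k, hidden * k)
assert torch.equal(y[0, 0], torch.cat([x[0, 0], x[0, 1]]))  # timesteps 0 and 1 side by side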
def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  output = self.transform(es.as_tensor(), es.mask)
  return expression_seqs.ExpressionSequence(expr_tensor=output, mask=es.mask)
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

  Args:
    expr_seq: expression sequence (will be accessed via tensor_expr)

  Returns:
    expression sequence
  """
  if isinstance(expr_seq, list):
    mask_out = expr_seq[0].mask
    seq_len = len(expr_seq[0])
    batch_size = expr_seq[0].dim()[1]
    tensors = [e.as_tensor() for e in expr_seq]
    input_tensor = dy.reshape(dy.concatenate(tensors), (seq_len, 1, self.input_dim), batch_size=batch_size)
  else:
    mask_out = expr_seq.mask
    seq_len = len(expr_seq)
    batch_size = expr_seq.dim()[1]
    input_tensor = dy.reshape(dy.transpose(expr_seq.as_tensor()), (seq_len, 1, self.input_dim), batch_size=batch_size)

  if self.dropout > 0.0 and self.train:
    input_tensor = dy.dropout(input_tensor, self.dropout)

  # compute the f/o/z gate pre-activations for all timesteps with one strided convolution
  proj_inp = dy.conv2d_bias(input_tensor, dy.parameter(self.p_f), dy.parameter(self.p_b),
                            stride=(self.stride, 1), is_valid=False)
  reduced_seq_len = proj_inp.dim()[0][0]
  proj_inp = dy.transpose(dy.reshape(proj_inp, (reduced_seq_len, self.hidden_dim * 3), batch_size=batch_size))
  # proj_inp dims: ((hidden*3, reduced_seq_len), batch_size)
  if self.stride > 1 and mask_out is not None:
    mask_out = mask_out.lin_subsampled(trg_len=reduced_seq_len)

  h = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
  c = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
  for t in range(reduced_seq_len):
    f_t = dy.logistic(dy.strided_select(proj_inp, [], [0, t], [self.hidden_dim, t + 1]))
    o_t = dy.logistic(dy.strided_select(proj_inp, [], [self.hidden_dim, t], [self.hidden_dim * 2, t + 1]))
    z_t = dy.tanh(dy.strided_select(proj_inp, [], [self.hidden_dim * 2, t], [self.hidden_dim * 3, t + 1]))

    if self.dropout > 0.0 and self.train:
      retention_rate = 1.0 - self.dropout
      dropout_mask = dy.random_bernoulli((self.hidden_dim, 1), retention_rate, batch_size=batch_size)
      f_t = 1.0 - dy.cmult(dropout_mask, 1.0 - f_t)  # TODO: would be easy to make a zoneout dynet operation to save memory

    i_t = 1.0 - f_t

    if t == 0:
      c_t = dy.cmult(i_t, z_t)
    else:
      c_t = dy.cmult(f_t, c[-1]) + dy.cmult(i_t, z_t)
    h_t = dy.cmult(o_t, c_t)  # note: LSTM would use dy.tanh(c_t) instead of c_t

    if mask_out is None or np.isclose(np.sum(mask_out.np_arr[:, t:t + 1]), 0.0):
      c.append(c_t)
      h.append(h_t)
    else:
      c.append(mask_out.cmult_by_timestep_expr(c_t, t, True) + mask_out.cmult_by_timestep_expr(c[-1], t, False))
      h.append(mask_out.cmult_by_timestep_expr(h_t, t, True) + mask_out.cmult_by_timestep_expr(h[-1], t, False))

  self._final_states = [transducers.FinalTransducerState(dy.reshape(h[-1], (self.hidden_dim,), batch_size=batch_size),
                                                         dy.reshape(c[-1], (self.hidden_dim,), batch_size=batch_size))]
  return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask_out)
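# The per-timestep update above is quasi-RNN-style f-pooling:
#   c_t = f_t * c_{t-1} + (1 - f_t) * z_t,   h_t = o_t * c_t
# where the f/o/z gates all come from the strided convolution, so only the pooling
# itself is sequential. A scalar sketch of the recurrence (illustration only):
def f_pool_step(c_prev: float, f_t: float, z_t: float, o_t: float):
  c_t = f_t * c_prev + (1.0 - f_t) * z_t  # forget gate interpolates old cell and candidate
  h_t = o_t * c_t                         # output gate (no tanh, unlike an LSTM)
  return c_t, h_t

c, h = 0.0, 0.0
for f_t, z_t, o_t in [(0.9, 1.0, 0.5), (0.5, -1.0, 1.0)]:
  c, h = f_pool_step(c, f_t, z_t, o_t)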