def set_dropout_masks(self, batch_size=1):
    """Sample fresh dropout masks for every layer.

    Does nothing unless ``self.dropout_rate`` is positive and ``self.train``
    is truthy; in that case previously stored masks are left untouched.

    Args:
        batch_size: forwarded to ``dy.random_bernoulli`` as ``batch_size``.

    Side effects:
        Sets ``self.dropout_mask_x`` and ``self.dropout_mask_h`` to lists with
        one mask per layer. The first input mask has ``self.input_dim``
        entries; all other masks have ``self.hidden_dim`` entries.
    """
    if not (self.dropout_rate > 0.0 and self.train):
        return

    keep_prob = 1.0 - self.dropout_rate
    # Kept units are scaled by 1 / keep_prob (passed as the `scale` argument).
    inv_keep = 1.0 / keep_prob

    def sample_mask(dim):
        return dy.random_bernoulli((dim,), keep_prob, inv_keep, batch_size=batch_size)

    # Draw order matters for RNG reproducibility: input masks first
    # (layer 0, then the upper layers), hidden masks afterwards.
    input_masks = [sample_mask(self.input_dim)]
    input_masks.extend(sample_mask(self.hidden_dim) for _ in range(1, self.num_layers))
    self.dropout_mask_x = input_masks

    self.dropout_mask_h = [sample_mask(self.hidden_dim) for _ in range(self.num_layers)]
def set_dropout_masks(self, batch_size: numbers.Integral = 1) -> None:
    """Sample fresh input and hidden dropout masks.

    Does nothing unless ``self.dropout_rate`` is positive and ``self.train``
    is truthy; in that case previously stored masks are left untouched.

    Args:
        batch_size: forwarded to ``dy.random_bernoulli`` as ``batch_size``.

    Side effects:
        Sets ``self.dropout_mask_x`` (``self.input_dim`` entries) and
        ``self.dropout_mask_h`` (``self.hidden_dim`` entries).
    """
    if not (self.dropout_rate > 0.0 and self.train):
        return

    keep_prob = 1.0 - self.dropout_rate
    # Kept units are scaled by 1 / keep_prob (passed as the `scale` argument).
    inv_keep = 1.0 / keep_prob

    def sample_mask(dim):
        return dy.random_bernoulli((dim,), keep_prob, inv_keep, batch_size=batch_size)

    # The input mask is drawn before the hidden mask (RNG order).
    self.dropout_mask_x = sample_mask(self.input_dim)
    self.dropout_mask_h = sample_mask(self.hidden_dim)
def _fast_sample(self, prob, temperature=1):
    """Pick an index from ``prob`` after multiplying it by random noise.

    A noise vector is drawn with ``dy.random_bernoulli`` (p = 0.5, scale =
    ``temperature / 2``) and shifted by ``1 - temperature / 2``. ``prob`` is
    multiplied element-wise by that noise and the argmax of the result is
    returned.

    Args:
        prob: DyNet expression to sample from.
        temperature: controls the noise magnitude; only ``temperature / 2``
            is used.

    Returns:
        The argmax index of the perturbed ``prob`` values.
    """
    half_temperature = temperature / 2
    # NOTE(review): the noise size is hard-coded to 256, so `prob` presumably
    # has 256 entries -- confirm against the caller.
    noise = dy.random_bernoulli(256, 0.5, scale=half_temperature) + (1.0 - half_temperature)
    perturbed = dy.cmult(prob, noise)
    return perturbed.npvalue().argmax()
def transduce(
    self, expr_seq: expression_seqs.ExpressionSequence
) -> expression_seqs.ExpressionSequence:
    """Run the recurrent layer over a sequence, honouring the sequence mask.

    The input is projected with one strided convolution into three gate
    pre-activations per position (forget, output, candidate); a cheap
    element-wise recurrence is then applied over the projected positions.
    According to the original documentation, masked timesteps simply copy
    the previous h / c.

    Args:
        expr_seq: expression sequence (read via ``as_tensor()``), or a list of
            such sequences whose tensors are concatenated; for a list, mask,
            length and batch size are taken from the first element.

    Returns:
        Expression sequence of the hidden states, carrying the (possibly
        subsampled) mask.

    Side effects:
        Stores the final h / c states in ``self._final_states``.
    """
    hidden = self.hidden_dim

    if isinstance(expr_seq, list):
        first = expr_seq[0]
        mask_out = first.mask
        seq_len = len(first)
        batch_size = first.dim()[1]
        # NOTE(review): unlike the single-sequence branch, no transpose is
        # applied before the reshape below -- confirm this layout is intended.
        stacked = dy.concatenate([seq.as_tensor() for seq in expr_seq])
    else:
        mask_out = expr_seq.mask
        seq_len = len(expr_seq)
        batch_size = expr_seq.dim()[1]
        stacked = dy.transpose(expr_seq.as_tensor())
    input_tensor = dy.reshape(stacked, (seq_len, 1, self.input_dim), batch_size=batch_size)

    apply_dropout = self.dropout > 0.0 and self.train
    if apply_dropout:
        input_tensor = dy.dropout(input_tensor, self.dropout)

    conv_out = dy.conv2d_bias(
        input_tensor,
        dy.parameter(self.p_f),
        dy.parameter(self.p_b),
        stride=(self.stride, 1),
        is_valid=False,
    )
    reduced_seq_len = conv_out.dim()[0][0]
    # After the transpose: (3 * hidden, reduced_seq_len), one column per step.
    proj = dy.transpose(
        dy.reshape(conv_out, (reduced_seq_len, hidden * 3), batch_size=batch_size))

    if self.stride > 1 and mask_out is not None:
        # Striding shortened the sequence, so the mask is shortened to match.
        mask_out = mask_out.lin_subsampled(trg_len=reduced_seq_len)

    def gate_rows(step, row_from, row_to):
        # Rows [row_from, row_to) of the column belonging to timestep `step`.
        return dy.strided_select(proj, [], [row_from, step], [row_to, step + 1])

    # Index 0 holds the all-zero initial state; real timesteps start at 1.
    h_states = [dy.zeroes(dim=(hidden, 1), batch_size=batch_size)]
    c_states = [dy.zeroes(dim=(hidden, 1), batch_size=batch_size)]

    for t in range(reduced_seq_len):
        f_t = dy.logistic(gate_rows(t, 0, hidden))
        o_t = dy.logistic(gate_rows(t, hidden, hidden * 2))
        z_t = dy.tanh(gate_rows(t, hidden * 2, hidden * 3))

        if apply_dropout:
            keep_prob = 1.0 - self.dropout
            zoneout_mask = dy.random_bernoulli((hidden, 1), keep_prob, batch_size=batch_size)
            # Where the mask is 0 the forget gate becomes 1.
            # TODO: would be easy to make a zoneout dynet operation to save memory
            f_t = 1.0 - dy.cmult(zoneout_mask, 1.0 - f_t)

        i_t = 1.0 - f_t
        if t == 0:
            # The previous cell state is zero, so the forget term is skipped.
            c_t = dy.cmult(i_t, z_t)
        else:
            c_t = dy.cmult(f_t, c_states[-1]) + dy.cmult(i_t, z_t)
        # note: LSTM would use dy.tanh(c_t) instead of c_t
        h_t = dy.cmult(o_t, c_t)

        needs_masking = mask_out is not None and not np.isclose(
            np.sum(mask_out.np_arr[:, t:t + 1]), 0.0)
        if needs_masking:
            # Blend the new state with the previous one; presumably the
            # True/False flag selects inverse/plain mask weights -- verify
            # against cmult_by_timestep_expr.
            c_t = (mask_out.cmult_by_timestep_expr(c_t, t, True)
                   + mask_out.cmult_by_timestep_expr(c_states[-1], t, False))
            h_t = (mask_out.cmult_by_timestep_expr(h_t, t, True)
                   + mask_out.cmult_by_timestep_expr(h_states[-1], t, False))
        c_states.append(c_t)
        h_states.append(h_t)

    final_h = dy.reshape(h_states[-1], (hidden,), batch_size=batch_size)
    final_c = dy.reshape(c_states[-1], (hidden,), batch_size=batch_size)
    self._final_states = [transducers.FinalTransducerState(final_h, final_c)]
    return expression_seqs.ExpressionSequence(expr_list=h_states[1:], mask=mask_out)
def transduce(self, es):
    """Run an LSTM with a highway connection over the inputs in ``es``.

    For every input ``x`` five gate pre-activations are computed from ``x``
    and the previous output (input, forget, candidate, output and highway
    gate). The LSTM hidden state is then mixed with the projection
    ``whx * x`` according to the highway gate.

    If ``self.dropout_x`` / ``self.dropout_h`` is positive, one mask per gate
    is sampled once before the loop and reused at every timestep. Note the
    highway projection ``whx * x`` always uses the un-dropped input.

    Args:
        es: iterable of input expressions.

    Returns:
        List with one output expression per input.
    """
    # Parameters are used exactly as stored on the instance.
    gate_params = {
        "i": (self.bi, self.wix, self.wih),
        "f": (self.bf, self.wfx, self.wfh),
        "c": (self.bc, self.wcx, self.wch),
        "o": (self.bo, self.wox, self.woh),
        "r": (self.br, self.wrx, self.wrh),
    }
    highway_proj = self.whx
    c_prev, h_prev = self.initc, self.inith

    drop_inputs = self.dropout_x > 0.
    drop_hidden = self.dropout_h > 0.
    input_masks, hidden_masks = {}, {}
    # Masks are drawn in gate order i, f, c, o, r -- inputs first, then hidden.
    if drop_inputs:
        keep_x = 1. - self.dropout_x
        inv_keep_x = 1. / keep_x
        for gate in gate_params:
            input_masks[gate] = random_bernoulli(self._input_dim, p=keep_x, scale=inv_keep_x)
    if drop_hidden:
        keep_h = 1. - self.dropout_h
        inv_keep_h = 1. / keep_h
        for gate in gate_params:
            hidden_masks[gate] = random_bernoulli(self._hidden_dim, p=keep_h, scale=inv_keep_h)

    def pre_activation(gate, x, h):
        # bias + W_x * dropout(x) + W_h * dropout(h) for the given gate.
        bias, w_x, w_h = gate_params[gate]
        x_in = cmult(input_masks[gate], x) if drop_inputs else x
        h_in = cmult(hidden_masks[gate], h) if drop_hidden else h
        return affine_transform([bias, w_x, x_in, w_h, h_in])

    outputs = []
    for x in es:
        in_gate = logistic(pre_activation("i", x, h_prev))
        forget_gate = logistic(pre_activation("f", x, h_prev))
        candidate = tanh(pre_activation("c", x, h_prev))
        c_now = cmult(forget_gate, c_prev) + cmult(in_gate, candidate)
        out_gate = logistic(pre_activation("o", x, h_prev))
        lstm_h = cmult(tanh(c_now), out_gate)
        highway_gate = logistic(pre_activation("r", x, h_prev))
        h_now = cmult(highway_gate, lstm_h) + cmult(1. - highway_gate, highway_proj * x)

        outputs.append(h_now)
        c_prev, h_prev = c_now, h_now
    return outputs
def transduce(self, es):
    """Run a coupled-gate LSTM over the inputs in ``es``.

    The forget gate is tied to the input gate (``f = 1 - i``), so the cell
    update is written directly as ``c = c_prev + i * (candidate - c_prev)``
    and no separate forget-gate expression is built.

    If ``self.dropout_x`` / ``self.dropout_h`` is positive, one mask per gate
    is sampled once before the loop and reused at every timestep.

    Args:
        es: iterable of input expressions.

    Returns:
        List with one hidden-state expression per input.
    """
    # Parameters are used exactly as stored on the instance.
    wix, wih, bi = self.wix, self.wih, self.bi
    wcx, wch, bc = self.wcx, self.wch, self.bc
    wox, woh, bo = self.wox, self.woh, self.bo
    prev_c, prev_h = self.initc, self.inith

    drop_x = self.dropout_x > 0.
    drop_h = self.dropout_h > 0.
    # Masks are drawn in gate order i, c, o -- inputs first, then hidden.
    if drop_x:
        retention_x = 1. - self.dropout_x
        scale_x = 1. / retention_x
        mask_x_i = random_bernoulli(self._input_dim, p=retention_x, scale=scale_x)
        mask_x_c = random_bernoulli(self._input_dim, p=retention_x, scale=scale_x)
        mask_x_o = random_bernoulli(self._input_dim, p=retention_x, scale=scale_x)
    if drop_h:
        retention_h = 1. - self.dropout_h
        scale_h = 1. / retention_h
        mask_h_i = random_bernoulli(self._hidden_dim, p=retention_h, scale=scale_h)
        mask_h_c = random_bernoulli(self._hidden_dim, p=retention_h, scale=scale_h)
        mask_h_o = random_bernoulli(self._hidden_dim, p=retention_h, scale=scale_h)

    ret = []
    for x in es:
        # Input gate.
        it = logistic(affine_transform([
            bi,
            wix, cmult(mask_x_i, x) if drop_x else x,
            wih, cmult(mask_h_i, prev_h) if drop_h else prev_h,
        ]))
        # Candidate cell value.
        tct = tanh(affine_transform([
            bc,
            wcx, cmult(mask_x_c, x) if drop_x else x,
            wch, cmult(mask_h_c, prev_h) if drop_h else prev_h,
        ]))
        # Coupled update: equals (1 - it) * prev_c + it * tct. The explicit
        # `1 - it` expression was never used and is no longer built.
        ct = prev_c + cmult(tct - prev_c, it)
        # Output gate.
        ot = logistic(affine_transform([
            bo,
            wox, cmult(mask_x_o, x) if drop_x else x,
            woh, cmult(mask_h_o, prev_h) if drop_h else prev_h,
        ]))
        h = cmult(tanh(ct), ot)

        ret.append(h)
        prev_c, prev_h = ct, h
    return ret