def pick_neg_log(self, pred, gold):
    # TODO make this a static function in both classes
    if not isinstance(gold, int) and not isinstance(gold, np.int64):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
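# Usage sketch (not from the original code): pick_neg_log is a method, so this
# just replays its two branches directly on a toy prediction.  An integer gold
# index gives the usual negative log-likelihood; a list-valued gold gives
# cross-entropy against a soft label distribution.
import dynet

dynet.renew_cg()
probs = dynet.softmax(dynet.inputVector([0.2, 1.5, -0.3]))
hard_loss = -dynet.log(dynet.pick(probs, 1))   # gold = class index
soft_gold = [0.0, 0.7, 0.3]                    # gold = distribution over classes
soft_loss = -dynet.sum_elems(dynet.cmult(dynet.inputVector(soft_gold),
                                         dynet.log(probs)))
print(hard_loss.value(), soft_loss.value())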
def select_action(tree, policy, choose_max=False, return_prob=False, mode='train'):
    prob, pairs = policy.selection_by_tree(tree, mode)
    if pairs is None:
        if return_prob:
            return None, None, None, None
        else:
            return None, None, None
    with np.errstate(all='raise'):
        try:
            prob_v = prob.npvalue()
            if choose_max:
                idx = np.argmax(prob_v)
            else:
                # if np.random.random() < policy.epsilon:
                #     idx = np.random.randint(len(prob_v))
                #     while prob_v[idx] == 0:
                #         idx = np.random.randint(len(prob_v))
                # else:
                idx = np.random.choice(range(len(prob_v)), p=prob_v / np.sum(prob_v))
        except Exception:
            for para in policy.model_parameters:
                check_error(para, dy.parameter(policy.model_parameters[para]))
            check_error('history', policy.history.output())
            check_error('pr', prob)
            raise  # re-raise after the diagnostics; idx is undefined at this point
    action = prob[idx]
    policy.saved_actions[-1].append(action)
    policy.update_history(pairs[idx])
    if return_prob:
        return pairs[idx], prob_v[idx], pairs, prob_v
    return pairs[idx], prob_v[idx], dy.mean_elems(dy.cmult(prob, dy.log(prob)))
def get_summer(s, size):  # list of values (bidirection) => one value
    if s == "avg":
        return dy.average
    else:
        mask = [0. for _ in range(size // 2)] + [1. for _ in range(size // 2)]
        mask2 = [1. for _ in range(size // 2)] + [0. for _ in range(size // 2)]
        if s == "fend":
            return lambda x: dy.cmult(dy.inputVector(mask2), x[-1])
        elif s == "bend":
            return lambda x: dy.cmult(dy.inputVector(mask), x[0])
        elif s == "ends":
            return lambda x: dy.cmult(dy.inputVector(mask2), x[-1]) + dy.cmult(dy.inputVector(mask), x[0])
        else:
            return None
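# Usage sketch (assumed inputs, not from the original code): collapse a short
# sequence of bidirectional states with the "ends" summer defined above.  With
# size=4 the forward half is the first two dimensions and the backward half the
# last two, so "ends" keeps the forward half of the last state plus the
# backward half of the first state.
import dynet as dy

dy.renew_cg()
states = [dy.inputVector([0.1, 0.2, 0.3, 0.4]),
          dy.inputVector([0.5, 0.6, 0.7, 0.8])]
summer = get_summer("ends", 4)
print(summer(states).npvalue())  # [0.5, 0.6, 0.3, 0.4]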
def __call__(self, input_exp, hidden_exp, mask=None):
    # two kinds of dropouts
    if self.idrop > 0.:
        input_exp = dy.dropout(input_exp, self.idrop)
    input_exp_g = input_exp_t = input_exp
    hidden_exp_g = hidden_exp_t = hidden_exp["H"]
    if self.gdrop > 0.:
        input_exp_g = dy.cmult(input_exp, self.masks[0])
        hidden_exp_g = dy.cmult(hidden_exp_g, self.masks[1])
        input_exp_t = dy.cmult(input_exp, self.masks[2])
        hidden_exp_t = dy.cmult(hidden_exp_t, self.masks[3])
    rzt = dy.affine_transform([self.iparams["brz"], self.iparams["x2rz"], input_exp_g,
                               self.iparams["h2rz"], hidden_exp_g])
    rzt = dy.logistic(rzt)
    rt, zt = dy.pick_range(rzt, 0, self.n_hidden), dy.pick_range(rzt, self.n_hidden, 2 * self.n_hidden)
    h_reset = dy.cmult(rt, hidden_exp_t)
    ht = dy.affine_transform([self.iparams["bh"], self.iparams["x2h"], input_exp_t,
                              self.iparams["h2h"], h_reset])
    ht = dy.tanh(ht)
    hidden = dy.cmult(zt, hidden_exp["H"]) + dy.cmult((1. - zt), ht)  # the first term uses the original hidden state
    # mask: if 0 then pass through
    if mask is not None:
        mask_array = np.asarray(mask).reshape((1, -1))
        m1 = dy.inputTensor(mask_array, True)        # 1.0 for real words
        m0 = dy.inputTensor(1.0 - mask_array, True)  # 1.0 for padding words (mask=0)
        hidden = hidden * m1 + hidden_exp["H"] * m0
    return {"H": hidden}
def attend(self, encoded_inputs, h_t, input_masks=None):
    # encoded_inputs dimension is: seq len x 2*h x batch size, h_t dimension is h x batch size (for bilstm encoder)
    if len(encoded_inputs) == 1:
        # no need to attend if only one input state, compute output directly
        h_output = dn.tanh(self.w_c * dn.concatenate([h_t, encoded_inputs[0]]))
        # return trivial alphas (all 1's since one input gets all attention)
        if input_masks:  # if batching
            alphas = dn.inputTensor([1] * len(input_masks[0]), batched=True)
        else:
            alphas = dn.inputTensor([1], batched=True)
        return h_output, alphas

    # iterate through input states to compute attention scores
    # scores = [v_a * dn.tanh(w_a * h_t + u_a * h_input) for h_input in blstm_outputs]
    w_a_h_t = self.w_a * h_t
    scores = [self.v_a * dn.tanh(dn.affine_transform([w_a_h_t, self.u_a, h_input]))
              for h_input in encoded_inputs]
    concatenated = dn.concatenate(scores)
    if input_masks:
        # if batching, multiply attention scores with input masks to zero-out scores for padded inputs
        dn_masks = dn.inputTensor(input_masks, batched=True)
        concatenated = dn.cmult(concatenated, dn_masks)

    # normalize scores
    alphas = dn.softmax(concatenated)

    # compute context vector with weighted sum for each seq in batch
    bo = dn.concatenate_cols(encoded_inputs)
    c = bo * alphas
    # c = dn.esum([h_input * dn.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])

    # compute output vector using current decoder state and context vector
    h_output = dn.tanh(self.w_c * dn.concatenate([h_t, c]))
    return h_output, alphas
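# Hedged sketch (hypothetical sizes and stand-in parameters, not the original
# model) of the additive attention score used in attend(), for a single decoder
# state over two encoder states, without batching or masking.
import dynet as dn

pc = dn.ParameterCollection()
H = 4
p_w_a = pc.add_parameters((H, H))
p_u_a = pc.add_parameters((H, H))
p_v_a = pc.add_parameters((1, H))

dn.renew_cg()
w_a, u_a, v_a = dn.parameter(p_w_a), dn.parameter(p_u_a), dn.parameter(p_v_a)
h_t = dn.inputVector([0.1] * H)                                   # decoder state
encoded = [dn.inputVector([0.2] * H), dn.inputVector([0.3] * H)]  # encoder states
w_a_h_t = w_a * h_t
scores = [v_a * dn.tanh(dn.affine_transform([w_a_h_t, u_a, h])) for h in encoded]
alphas = dn.softmax(dn.concatenate(scores))
context = dn.concatenate_cols(encoded) * alphas
print(alphas.npvalue(), context.npvalue())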
def __call__(self, x_embs):
    x_len = len(x_embs)

    # BiGRU
    hf = dy.concatenate_cols(self.fGRUBuilder.initial_state().transduce(x_embs))
    hb = dy.concatenate_cols(self.bGRUBuilder.initial_state().transduce(x_embs[::-1])[::-1])
    h = dy.concatenate([hf, hb])

    # Selective Gate
    hb_1 = dy.pick(hb, index=0, dim=1)
    hf_n = dy.pick(hf, index=x_len - 1, dim=1)
    s = dy.concatenate([hb_1, hf_n])

    # Selection
    sGate = dy.logistic(dy.colwise_add(self.Ws * h, self.Us * s + self.bs))
    hp = dy.cmult(h, sGate)

    return hp, hb_1
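# Hedged sketch (hypothetical sizes and stand-in parameters) of the selective
# gate applied above: h is a matrix of BiGRU states (2H x T), s is the sentence
# vector (2H,), and every column of h is gated by a sigmoid that depends on
# that column and on s.
import dynet as dy

pc = dy.ParameterCollection()
H2, T = 4, 3  # 2 * hidden size, sequence length
p_Ws = pc.add_parameters((H2, H2))
p_Us = pc.add_parameters((H2, H2))
p_bs = pc.add_parameters((H2,))

dy.renew_cg()
Ws, Us, bs = dy.parameter(p_Ws), dy.parameter(p_Us), dy.parameter(p_bs)
h = dy.inputTensor([[0.1] * T] * H2)  # stand-in for the BiGRU outputs
s = dy.inputVector([0.2] * H2)        # stand-in for [hb_1; hf_n]
sGate = dy.logistic(dy.colwise_add(Ws * h, Us * s + bs))
hp = dy.cmult(h, sGate)
print(hp.dim())  # ((4, 3), 1)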
def __call__(self, x, tm1s=None, test=False):
    if test:
        # Initial states
        s_tm1 = tm1s[0]
        c_tm1 = tm1s[1]
        w_tm1 = x

        # GRU
        s_t = self.GRUBuilder.initial_state().set_s([s_tm1]).add_input(
            dy.concatenate([w_tm1, c_tm1])).output()

        # Attention
        e_t = dy.pick(self.va * dy.tanh(dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
        a_t = dy.softmax(e_t)
        c_t = dy.esum([dy.cmult(a_t_i, h_i)
                       for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))])
        # c_t = self.hp*a_t # memory error?

        # Output
        r_t = dy.concatenate_cols([Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                                   for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)])

        # Maxout
        m_t = dy.max_dim(r_t, d=1)

        y_t = dy.softmax(self.Wo * m_t)
        return s_t, c_t, y_t

    else:
        w_embs = x
        # Initial states
        s_tm1 = self.s_0
        c_tm1 = self.c_0

        GRU = self.GRUBuilder.initial_state().set_s([s_tm1])
        y = []
        for w_tm1 in w_embs:
            # GRU
            GRU = GRU.add_input(dy.concatenate([w_tm1, c_tm1]))
            s_t = GRU.output()

            # Attention
            e_t = dy.pick(self.va * dy.tanh(dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
            a_t = dy.softmax(e_t)
            c_t = dy.esum([dy.cmult(a_t_i, h_i)
                           for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))])
            # c_t = self.hp*a_t # memory error?

            # Output
            r_t = dy.concatenate_cols([Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                                       for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)])

            # Maxout
            m_t = dy.max_dim(r_t, d=1)

            y_t = self.Wo * m_t
            y.append(y_t)

            # t -> tm1
            s_tm1 = s_t
            c_tm1 = c_t

        return y
def cosine(self, e1, e2):
    # cosine similarity: dot(e1, e2) / (||e1|| * ||e2||); the product of the
    # squared norms needs a square root in the denominator
    return dynet.cdiv(
        dynet.dot_product(e1, e2),
        dynet.sqrt(dynet.cmult(dynet.squared_norm(e1), dynet.squared_norm(e2))))
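# Quick check (assumed inputs, not from the original code) of the corrected
# cosine above, replayed directly on two vectors: [1, 0] vs. [1, 1] should give
# roughly 0.707.
import dynet

dynet.renew_cg()
e1 = dynet.inputVector([1.0, 0.0])
e2 = dynet.inputVector([1.0, 1.0])
sim = dynet.cdiv(dynet.dot_product(e1, e2),
                 dynet.sqrt(dynet.cmult(dynet.squared_norm(e1),
                                        dynet.squared_norm(e2))))
print(sim.value())  # ~0.7071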
def pick_neg_log(self, pred, gold):
    if hasattr(gold, "__len__"):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
def pick_neg_log(self, pred, gold):
    if not isinstance(gold, int):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))