def add_inputs(self, inputs, masks, predict=False):
    if not self.init:
        print("No Initial state provided")
        return
    recur_states = []
    cell_states = []
    for input_tensor in inputs:
        hidden = self.hidden_previous
        cell = self.cell_previous
        if not predict:
            input_tensor = dy.cmult(input_tensor, self.input_drop_mask)
            hidden = dy.cmult(hidden, self.recur_drop_mask)
        gates = dy.affine_transform([self.b.expr(), self.WXH.expr(),
                                     dy.concatenate([input_tensor, hidden])])
        iga = dy.pickrange(gates, 0, self.recur_size)
        fga = dy.pickrange(gates, self.recur_size, 2 * self.recur_size)
        oga = dy.pickrange(gates, 2 * self.recur_size, 3 * self.recur_size)
        cga = dy.pickrange(gates, 3 * self.recur_size, 4 * self.recur_size)
        ig = dy.logistic(iga)
        fg = dy.logistic(fga)  # +self.forget_bias
        og = dy.logistic(oga)
        c_tilda = dy.tanh(cga)
        new_cell = dy.cmult(cell, fg) + dy.cmult(c_tilda, ig)
        self.cell_previous = new_cell
        cell_states.append(new_cell)
        new_hidden = dy.cmult(dy.tanh(new_cell), og)
        self.hidden_previous = new_hidden
        recur_states.append(new_hidden)
    return recur_states, cell_states
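# A minimal standalone sketch of the gate-slicing idiom above: a single
# affine_transform computes all four gate pre-activations, and dy.pickrange
# slices them apart. The names (recur_size, W, b) are illustrative, not part
# of the original class.
import dynet as dy

recur_size = 4
model = dy.ParameterCollection()
W = model.add_parameters((4 * recur_size, recur_size))
b = model.add_parameters((4 * recur_size,))

dy.renew_cg()
x = dy.inputVector([0.5] * recur_size)
gates = dy.affine_transform([b.expr(), W.expr(), x])
ig = dy.logistic(dy.pickrange(gates, 0, recur_size))               # input gate
fg = dy.logistic(dy.pickrange(gates, recur_size, 2 * recur_size))  # forget gate
print(ig.npvalue().shape)  # (4,)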
def learn(self, wave, mgc, batch_size):
    last_proc = 0
    dy.renew_cg()
    total_loss = 0
    losses = []
    cnt = 0
    noise = np.random.normal(0, 1.0, (len(wave) + self.UPSAMPLE_COUNT))
    for mgc_index in range(len(mgc)):
        # progress indicator, printed every 5%
        curr_proc = int((mgc_index + 1) * 100 / len(mgc))
        if curr_proc % 5 == 0 and curr_proc != last_proc:
            while last_proc < curr_proc:
                last_proc += 5
                sys.stdout.write(' ' + str(last_proc))
                sys.stdout.flush()
        if mgc_index < len(mgc) - 1:
            output, excitation, filter, vuv = self._predict_one(
                mgc[mgc_index],
                noise[self.UPSAMPLE_COUNT * mgc_index:
                      self.UPSAMPLE_COUNT * mgc_index + 2 * self.UPSAMPLE_COUNT])
            # reconstruction error
            t_vect = wave[self.UPSAMPLE_COUNT * mgc_index:
                          self.UPSAMPLE_COUNT * mgc_index + self.UPSAMPLE_COUNT]
            loss = dy.squared_distance(output, dy.inputVector(t_vect))
            # dynamic error
            o1 = dy.pickrange(output, 0, self.UPSAMPLE_COUNT - 1)
            o2 = dy.pickrange(output, 1, self.UPSAMPLE_COUNT)
            delta = o2 - o1
            real_delta = t_vect[1:self.UPSAMPLE_COUNT] - t_vect[0:self.UPSAMPLE_COUNT - 1]
            loss += dy.squared_distance(delta, dy.inputVector(real_delta))
            # excitation error
            # loss += dy.sum_elems(excitation)
            # o1 = dy.pickrange(excitation, 0, self.UPSAMPLE_COUNT - 1)
            # o2 = dy.pickrange(excitation, 1, self.UPSAMPLE_COUNT)
            # loss += dy.sum_elems(dy.abs(o2 - o1))
            losses.append(loss)
            cnt += self.UPSAMPLE_COUNT
            if len(losses) >= batch_size:
                loss = dy.esum(losses)
                total_loss += loss.value()
                loss.backward()
                self.trainer.update()
                losses = []
                dy.renew_cg()
    if len(losses) > 0:
        loss = dy.esum(losses)
        total_loss += loss.value()
        loss.backward()
        self.trainer.update()
        dy.renew_cg()
    return total_loss / cnt
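# A minimal sketch of the mini-batch accumulation idiom used in learn():
# collect per-item losses, sum them with dy.esum, backprop once per batch,
# then renew the computation graph. The toy regressor and all names here are
# illustrative, not from the original model.
import dynet as dy
import numpy as np

model = dy.ParameterCollection()
w = model.add_parameters((1, 3))
trainer = dy.SimpleSGDTrainer(model)

data = [(np.random.randn(3), np.random.randn()) for _ in range(8)]
batch_size, losses = 4, []
dy.renew_cg()
for x, y in data:
    pred = w.expr() * dy.inputVector(x.tolist())
    losses.append(dy.squared_distance(pred, dy.inputVector([y])))
    if len(losses) >= batch_size:
        loss = dy.esum(losses)  # one scalar per batch
        loss.backward()
        trainer.update()
        losses = []
        dy.renew_cg()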
def transduce(self, inputs, masks, predict=False):
    if not self.init:
        print("No Initial state provided")
        return
    outputs = []
    batch_size = inputs[0].dim()[1]
    for idx, input_tensor in enumerate(inputs):
        recur_s = []
        cell_s = []
        out = []
        hidden = self.hidden_previous
        cell = self.cell_previous
        if not predict:
            input_tensor = dy.cmult(input_tensor, self.input_drop_mask)
            hidden = dy.cmult(hidden, self.recur_drop_mask)
        gates = dy.affine_transform([
            self.b.expr(),
            self.WXH.expr(),
            dy.concatenate([input_tensor, hidden])
        ])
        iga = dy.pickrange(gates, 0, self.recur_size)
        fga = dy.pickrange(gates, self.recur_size, 2 * self.recur_size)
        oga = dy.pickrange(gates, 2 * self.recur_size, 3 * self.recur_size)
        cga = dy.pickrange(gates, 3 * self.recur_size, 4 * self.recur_size)
        ig = dy.logistic(iga)
        fg = dy.logistic(fga)  # +self.forget_bias
        og = dy.logistic(oga)
        c_tilda = dy.tanh(cga)
        new_cell = dy.cmult(cell, fg) + dy.cmult(c_tilda, ig)
        new_hidden = dy.cmult(dy.tanh(new_cell), og)
        # apply the mask: keep the new state for live batch elements and
        # carry the old state through for padded ones
        for jdx in range(batch_size):
            if masks[idx][jdx] == 1:
                h_t = dy.pick_batch_elem(new_hidden, jdx)
                recur_s.append(h_t)
                cell_s.append(dy.pick_batch_elem(new_cell, jdx))
                out.append(h_t)
            else:
                recur_s.append(dy.pick_batch_elem(hidden, jdx))
                cell_s.append(dy.pick_batch_elem(cell, jdx))
                out.append(dy.zeros(self.recur_size))
        new_cell = dy.concatenate_to_batch(cell_s)
        new_hidden = dy.concatenate_to_batch(recur_s)
        self.cell_previous = new_cell
        self.hidden_previous = new_hidden
        outputs.append(dy.concatenate_to_batch(out))
    return outputs
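# Sketch of the batch-rebuilding idiom in transduce above, assuming standard
# DyNet batching semantics: pick_batch_elem pulls a single element out of a
# batched expression, and concatenate_to_batch stitches the (possibly mixed
# old/new) elements back together.
import dynet as dy
import numpy as np

dy.renew_cg()
batched = dy.inputTensor(np.arange(6, dtype=float).reshape(2, 3), batched=True)
elems = [dy.pick_batch_elem(batched, j) for j in range(3)]
rebuilt = dy.concatenate_to_batch(elems)
print(rebuilt.dim())  # ((2,), 3) -- a vector of size 2 with batch size 3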
def word_repr(self, char_seq):
    # obtain the word representation when given its character sequence
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(self.params['com_b'][wlen - 1])
        self.param_exprs['ugW%d' % wlen] = dy.parameter(self.params['update_gate_W'][wlen - 1])
        self.param_exprs['ugb%d' % wlen] = dy.parameter(self.params['update_gate_b'][wlen - 1])

    chars = dy.concatenate(char_seq)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars +
                             self.param_exprs['rgb%d' % wlen])
    comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d' % wlen] * dy.cmult(reset_gate, chars) +
                                   self.param_exprs['cb%d' % wlen]), chars])
    update_logits = self.param_exprs['ugW%d' % wlen] * comb + self.param_exprs['ugb%d' % wlen]
    update_gate = dy.transpose(dy.concatenate_cols(
        [dy.softmax(dy.pickrange(update_logits, i * (wlen + 1), (i + 1) * (wlen + 1)))
         for i in range(self.options['ndims'])]))
    # The following implementation of the softmax function is not safe, but faster:
    # exp_update_logits = dy.exp(dy.reshape(update_logits, (self.options['ndims'], wlen + 1)))
    # update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] * (wlen + 1)))
    # assert (not np.isnan(update_gate.npvalue()).any())
    word = dy.sum_cols(dy.cmult(update_gate, dy.reshape(comb, (self.options['ndims'], wlen + 1))))
    return word
def split_rows(self, X, h):
    (n_rows, _), batch = X.dim()
    steps = n_rows // h
    output = []
    for i in range(0, n_rows, steps):
        output.append(dy.pickrange(X, i, i + steps))
    return output
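# Hypothetical usage of split_rows above: splitting an (8 x 3) matrix into
# h = 4 stacked (2 x 3) blocks with dy.pickrange. Note the function assumes
# h divides n_rows evenly; otherwise the row ranges will not line up.
import dynet as dy
import numpy as np

dy.renew_cg()
X = dy.inputTensor(np.arange(24, dtype=float).reshape(8, 3))
steps = 8 // 4
blocks = [dy.pickrange(X, i, i + steps) for i in range(0, 8, steps)]
print([blk.dim() for blk in blocks])  # four blocks of dim ((2, 3), 1)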
def step(self, x, hx, cx):
    if not self.test:
        if self.dropout_x > 0:
            x = dy.cmult(self.dropout_mask_x, x)
        if self.dropout_h > 0:
            hx = dy.cmult(self.dropout_mask_h, hx)
    gates = dy.affine_transform(
        [self.bias, self.weight_ih, x, self.weight_hh, hx])
    i = dy.pickrange(gates, 0, self.n_hidden)
    f = dy.pickrange(gates, self.n_hidden, self.n_hidden * 2)
    g = dy.pickrange(gates, self.n_hidden * 2, self.n_hidden * 3)
    o = dy.pickrange(gates, self.n_hidden * 3, self.n_hidden * 4)
    i, f, g, o = dy.logistic(i), dy.logistic(f), dy.tanh(g), dy.logistic(o)
    cy = dy.cmult(f, cx) + dy.cmult(i, g)
    hy = dy.cmult(o, dy.tanh(cy))
    return hy, cy
def add_input(self, x_t, mask=None):
    x_t = dynet.to_device(x_t, self.device)
    if self.dropout is None:
        h_t = self.h_t
        bias = self.bias
    else:
        x_t = dynet.cmult(x_t, self.dropout_mask_x)
        h_t = dynet.cmult(self.h_t, self.dropout_mask_h)
        bias = self.bias
    # calculate all information for all gates in one big matrix multiplication
    gates = self.W * dynet.concatenate([x_t, h_t, bias])
    # input gate
    i = dynet.logistic(dynet.pickrange(gates, 0, self.dim))
    # forget gate (coupled to the input gate)
    f = 1.0 - i
    # output gate
    o = dynet.logistic(dynet.pickrange(gates, self.dim, self.dim * 2))
    # input modulation gate
    g = dynet.tanh(dynet.pickrange(gates, self.dim * 2, self.dim * 3))
    # cell state
    c_t = dynet.cmult(f, self.c_t) + dynet.cmult(i, g)
    # hidden state
    h_t = dynet.cmult(o, dynet.tanh(c_t))
    if mask is None:
        self.c_t = c_t
        self.h_t = h_t
    else:
        self.c_t = (c_t * mask) + (self.c_t * (1.0 - mask))
        self.h_t = (h_t * mask) + (self.h_t * (1.0 - mask))
    if self.next_layer is not None:
        self.next_layer.add_input(self.h_t, mask)
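# Hedged sketch of the masking arithmetic in add_input above: positions with
# mask = 0 keep the previous state, positions with mask = 1 take the new one.
# All values here are made up for illustration.
import dynet

dynet.renew_cg()
c_prev = dynet.inputVector([1.0, 1.0])
c_new = dynet.inputVector([5.0, 5.0])
mask = dynet.inputVector([0.0, 0.0])  # a fully masked-out (padded) step
c_t = dynet.cmult(c_new, mask) + dynet.cmult(c_prev, 1.0 - mask)
print(c_t.npvalue())  # [1. 1.] -- the previous state is carried through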
def _predict_one(self, mgc, noise):
    mgc = dy.inputVector(mgc)
    outputs = []
    noise_vec = dy.inputVector(noise[0:self.UPSAMPLE_COUNT])

    # excitation MLP
    [hidden_w, hidden_b] = self.mlp_excitation
    hidden_input = mgc  # dy.concatenate([mgc, noise_vec])
    for w, b in zip(hidden_w, hidden_b):
        hidden_input = dy.tanh(w.expr(update=True) * hidden_input + b.expr(update=True))
    excitation = dy.logistic(self.excitation_w.expr(update=True) * hidden_input +
                             self.excitation_b.expr(update=True))

    # filter MLP
    [hidden_w, hidden_b] = self.mlp_filter
    hidden_input = mgc  # dy.concatenate([mgc, noise_vec])
    for w, b in zip(hidden_w, hidden_b):
        hidden_input = dy.tanh(w.expr(update=True) * hidden_input + b.expr(update=True))
    filter = dy.tanh(self.filter_w.expr(update=True) * hidden_input +
                     self.filter_b.expr(update=True))

    # voiced/unvoiced MLP
    [hidden_w, hidden_b] = self.mlp_vuv
    hidden_input = mgc  # dy.concatenate([mgc, noise_vec])
    for w, b in zip(hidden_w, hidden_b):
        hidden_input = dy.tanh(w.expr(update=True) * hidden_input + b.expr(update=True))
    vuv = dy.logistic(self.vuv_w.expr(update=True) * hidden_input +
                      self.vuv_b.expr(update=True))

    # sample_vec = dy.inputVector(noise[self.UPSAMPLE_COUNT:self.UPSAMPLE_COUNT * 2])
    # noise_vec = dy.inputVector(noise[0:self.UPSAMPLE_COUNT + self.FILTER_SIZE - 1])
    mixed = excitation  # * vuv + noise_vec * (1.0 - vuv)
    # slide the filter over the mixed signal, one window per output sample
    for ii in range(self.UPSAMPLE_COUNT):
        tmp = dy.cmult(filter, dy.pickrange(mixed, ii, ii + self.FILTER_SIZE))
        outputs.append(dy.sum_elems(tmp))
    outputs = dy.concatenate(outputs)
    # equivalent conv2d formulation:
    # mixed = dy.reshape(mixed, (self.UPSAMPLE_COUNT + self.FILTER_SIZE - 1, 1, 1))
    # filter = dy.reshape(filter, (self.FILTER_SIZE, 1, 1, 1))
    # outputs = dy.conv2d(mixed, filter, stride=(1, 1), is_valid=True)
    # outputs = dy.reshape(outputs, (self.UPSAMPLE_COUNT,))
    # outputs = outputs + noise_vec * vuv
    return outputs, excitation, filter, vuv
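# Small sketch of the sliding-window loop in _predict_one above: each output
# sample is the dot product of the learned filter with a dy.pickrange window
# of the signal, i.e. a 1-D valid convolution. Sizes are illustrative.
import dynet as dy

FILTER_SIZE, OUT_LEN = 3, 4
dy.renew_cg()
signal = dy.inputVector([0.1 * i for i in range(OUT_LEN + FILTER_SIZE - 1)])
filt = dy.inputVector([0.5, 0.3, 0.2])
outputs = [dy.sum_elems(dy.cmult(filt, dy.pickrange(signal, i, i + FILTER_SIZE)))
           for i in range(OUT_LEN)]
print(dy.concatenate(outputs).npvalue().shape)  # (4,)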
e = dy.softsign(e1)  # x/(1+|x|)

# softmaxes
e = dy.softmax(e1)
e = dy.log_softmax(e1, restrict=[])  # restrict is a set of indices.
                                     # if not empty, only entries in restrict are
                                     # part of the softmax computation, others get 0.

e = dy.sum_cols(e1)

# Picking values from vector expressions
e = dy.pick(e1, k)  # k is an unsigned integer, e1 is a vector. Returns e1[k].
e = e1[k]           # same
e = dy.pickrange(e1, k, v)  # like python's e1[k:v] for lists. e1 is an Expression, k,v are integers.
e = e1[k:v]                 # same
e = dy.pickneglogsoftmax(e1, k)  # k is an unsigned integer.
                                 # equivalent to: dy.pick(-dy.log(dy.softmax(e1)), k)

# Neural net stuff
dy.noise(e1, stddev)  # add noise to each element, drawn from a Gaussian with standard deviation stddev
dy.dropout(e1, p)     # apply dropout with probability p

# functions over lists of expressions
e = dy.esum([e1, e2, ...])     # sum
e = dy.average([e1, e2, ...])  # average
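# A quick sanity check, assuming standard DyNet semantics: pickneglogsoftmax(e1, k)
# gives the same value as pick(-log(softmax(e1)), k), i.e. the usual cross-entropy
# loss for gold class k.
import dynet as dy

dy.renew_cg()
scores = dy.inputVector([2.0, 0.5, -1.0])
k = 0
loss_a = dy.pickneglogsoftmax(scores, k)
loss_b = dy.pick(-dy.log(dy.softmax(scores)), k)
print(abs(loss_a.value() - loss_b.value()) < 1e-6)  # True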
def add_input(self, x_t):
    x_t = dynet.to_device(x_t, self.device)
    h_t = self.calculate_h_t()

    if self.dropout:
        x_t = dynet.cmult(x_t, self.dropout_mask_x)
        h_t = dynet.cmult(h_t, self.dropout_mask_h)

    # bias
    bias = self.bias
    # calculate all information for all gates in one big matrix multiplication
    gates = self.W * dynet.concatenate([x_t, h_t, bias])

    # output gate
    o = dynet.logistic(dynet.pickrange(gates, 0, self.dim))
    # input modulation gate
    g = dynet.tanh(dynet.pickrange(gates, self.dim, self.dim * 2))

    # forget gate
    Wfx = self.Wf * dynet.concatenate([x_t, bias])
    if len(self.h_t_sources) == 1 or self.path_dropout:
        # a single predecessor state (or one sampled via path dropout)
        if len(self.h_t_sources) == 1:
            idx = 0
        else:
            idx = self.get_path()
        c_t = self.c_t_sources[idx]
        f_k = dynet.logistic(Wfx + self.Uf * h_t)
        # input gate (coupled to the forget gate)
        i = 1. - f_k
        # cell state
        c_t = dynet.cmult(f_k, c_t) + dynet.cmult(i, g)
    else:
        # weighted combination over all predecessor states
        weights = dynet.to_device(dynet.softmax(self.weights), self.device)
        if self.dropout:
            f_k = [dynet.logistic(Wfx + self.Uf * dynet.cmult(h, self.dropout_mask_h)) * w
                   for h, w in zip(self.h_t_sources, weights)]
        else:
            f_k = [dynet.logistic(Wfx + self.Uf * h) * w
                   for h, w in zip(self.h_t_sources, weights)]
        # input gate
        i = 1. - dynet.esum(f_k)
        # cell state
        c_t = dynet.esum([dynet.cmult(f, c)
                          for f, c in zip(f_k, self.c_t_sources)]) + dynet.cmult(i, g)

    # hidden state
    h_t = dynet.cmult(o, dynet.tanh(c_t))

    if self.next_layer is not None:
        c_stack, h_stack = self.next_layer.add_input(h_t)
        return [c_t] + c_stack, [h_t] + h_stack
    else:
        return [c_t], [h_t]