def get_context(self, prev_state_bf):
    """
    Blend the encoded features into a single context for a decoder state.

    For every time step an energy w^T * tanh(psi(h_u) + phi(s_i)) is
    computed, where phi(s_i) is ``self.states_mlp_bf(prev_state_bf)``,
    psi(h_u) is ``self.features_post_mlp_btf`` and w is
    ``self.mixing_vec_w``. The energies are softmaxed along axis 1 and the
    features are summed along that axis using the resulting weights.

    Name suffixes look like a b/t/f (batch/time/feature) layout convention
    -- TODO confirm against the rest of the class.
    """
    # phi(s_i), with a singleton middle axis so it broadcasts against the features.
    projected_state_b1f = cgt.dimshuffle(self.states_mlp_bf(prev_state_bf), [0, 'x', 1])

    # An earlier variant (left commented out by the author) used the inner
    # product <phi(s_i), psi(h_u)> as the energy instead of the form below.

    # tau = tanh(psi(h_u) + phi(s_i)); broadcasting handles every h_u at once.
    tau_btf = cgt.tanh(
        cgt.broadcast('+', self.features_post_mlp_btf, projected_state_b1f, 'xxx,x1x'))

    # w^T * tau: pointwise product with the mixing vector, then a sum over axis 2.
    energies_bt = cgt.sum(
        cgt.broadcast('*', self.mixing_vec_w, tau_btf, '11x,xxx'), axis=2)

    # Softmax along axis 1 gives one weight per time step.
    attention_bt = nn.softmax(energies_bt, axis=1)

    # Trailing singleton axis lets one weight multiply all features of a step.
    attention_bt1 = cgt.dimshuffle(attention_bt, [0, 1, 'x'])

    # Weight each feature vector by its time-dependent softmax weight.
    weighted_btf = cgt.broadcast(
        '*', attention_bt1, self.features_post_mlp_btf, 'xx1,xxx')

    # Integrate out time.
    return cgt.sum(weighted_btf, axis=1)
def get_context_backup(self, prev_state_bf):
    """
    Loop-based alternative to ``get_context``.

    The energy of a time step is the inner product between
    ``sigmoid(self.states_mlp_bf(prev_state_bf))`` and that step's slice of
    ``self.features_post_mlp_btf``. The energies are softmaxed and used to
    form a weighted sum of the per-step feature slices.

    NOTE(review): only time steps 0, 1 and 2 are visited (the count is
    hard-coded) -- confirm this matches the intended sequence length.
    """
    squashed_state_bf = cgt.sigmoid(self.states_mlp_bf(prev_state_bf))

    # One inner product per time step.
    energies = [
        cgt.sum(squashed_state_bf*self.features_post_mlp_btf[:, step, :], axis=1)
        for step in range(0, 3)
    ]

    # Stacking puts the step axis first; swap it behind the other axis.
    weights = softmax(cgt.dimshuffle(cgt.stack(energies), [1, 0]))

    blended = None
    for step in range(0, 3):
        # Column of weights for this step, kept 2-D so it broadcasts.
        weight_b1 = cgt.dimshuffle(weights[:, step], [0, 'x'])
        term = cgt.broadcast(
            '*', weight_b1, self.features_post_mlp_btf[:, step, :], 'x1,xx')
        if blended is None:
            blended = term
        else:
            blended += term
    return blended
def make_prediction(self, max_label_length, ground_labels_basis_btc):
    """
    Unroll the decoder for ``max_label_length`` steps and return the argmax
    of the character distribution at every step.

    The initial context and decoder state are fresh parameters drawn with
    ``IIDGaussian(0.1)``. At every step the slice of
    ``ground_labels_basis_btc`` at that step index is fed to
    ``get_decoder_state`` as the previous output.

    Returns the stacked per-step argmax with its two axes swapped, so the
    step axis comes second.
    """
    context_bf = parameter(
        init_array(IIDGaussian(0.1), (self.batch_size, self.feature_size)),
        name=None)
    state_bf = parameter(
        init_array(IIDGaussian(0.1), (self.batch_size, self.decoder_size)),
        name=None)

    predicted_steps = []
    for step in range(0, max_label_length):
        # NOTE(review): the original author asked "Is this right?" here --
        # the label at the *current* step index is used as the previous output.
        prev_out_bc = ground_labels_basis_btc[:, step, :]
        state_bf = self.get_decoder_state(context_bf, prev_out_bc, state_bf)
        context_bf = self.get_context(state_bf)
        char_dist = self.get_character_distribution(state_bf, context_bf)
        predicted_steps.append(cgt.argmax(char_dist, axis=1))

    return cgt.dimshuffle(cgt.stack(predicted_steps), [1, 0])
def pyramidLayer(nn_input, temporal_resolution_decrease=2):
    """
    Batch by time by features.

    Merges every ``temporal_resolution_decrease`` consecutive time steps into
    one by concatenating their feature vectors, so the time axis shrinks by
    that factor while the feature axis grows by it.

    Raises ValueError when the number of time steps reported by
    ``cgt.infer_shape`` is not a multiple of the factor.
    """
    n_steps = cgt.infer_shape(nn_input)[1]
    if n_steps % temporal_resolution_decrease:
        raise ValueError('number of timesteps is not divisable by resolution decrease!')

    merged_steps = []
    for start in range(0, n_steps, temporal_resolution_decrease):
        # Slices of the consecutive steps that collapse into one output step.
        group = [
            nn_input[:, start + offset, :]
            for offset in range(0, temporal_resolution_decrease)
        ]
        merged_steps.append(cgt.concatenate(group, axis=1))

    # Stacking puts the step axis first; move it back to the middle.
    return cgt.dimshuffle(cgt.stack(merged_steps), [1, 0, 2])
def __call__(self, x):
    """
    Run the recurrence over ``x`` and return the hidden state of every step.

    ``x`` is transposed so that its second axis comes first before being
    handed to ``unroll_recurrence``; the result is transposed back. When
    ``self.backwards`` is set the output is additionally flipped along
    axis 1.
    """
    input_tbf = cgt.dimshuffle(x, [1, 0, 2])
    _seq_len, num_batch = input_tbf.shape[0], input_tbf.shape[1]

    def step(input_bh, hid_previous_bh):
        # activation(hid_to_hid(h_prev) + in_to_hid(x_t))
        pre_activation_bh = self.hid_to_hid(hid_previous_bh)
        pre_activation_bh += self.in_to_hid(input_bh)
        return self.activation(pre_activation_bh)

    # Repeat the initial hidden state once per batch row: ones(b, 1) . hid_init.
    hid_init_bh = cgt.dot(cgt.ones((num_batch, 1)), self.hid_init)

    hid_out_tbf = unroll_recurrence(
        step_function=step,
        input_to_unroll_tbf=input_tbf,
        hid_init=[hid_init_bh],
        go_backwards=self.backwards,
        n_steps=self.timesteps)

    hid_out_btf = cgt.dimshuffle(hid_out_tbf, [1, 0, 2])
    if not self.backwards:
        return hid_out_btf
    return cgt.flip(hid_out_btf, [1])
def temporalDenseLayer(nn_input, num_units, activation=rectify, w_init=XavierNormal(), bias_init=Constant(0)):
    """
    Batch by time by features.

    Applies one shared ``Affine(feature_dims, num_units)`` layer followed by
    ``activation`` to the slice of every time step and stacks the results
    back into a 3-D output with the step axis in the middle.

    Inputs with more than three axes are first reshaped to three axes by
    collapsing every axis after the second into a single feature axis.

    nn_input   -- input node, indexed as [batch, time, features...].
    num_units  -- output size of the shared affine layer.
    activation -- callable applied to each affine output.
    w_init, bias_init -- initialisers forwarded to ``Affine``.
    """
    input_shape = nn_input.shape
    if len(input_shape) > 3:
        # The third extent must be the *product* of all trailing extents;
        # passing the slice input_shape[2:] itself would nest a list inside
        # the target shape.
        flat_features = input_shape[2]
        for extent in input_shape[3:]:
            flat_features = flat_features * extent
        nn_input = nn_input.reshape([input_shape[0], input_shape[1], flat_features])

    # NOTE(review): the loop below needs concrete sizes from infer_shape;
    # confirm they are still available after the symbolic reshape above.
    dims = cgt.infer_shape(nn_input)
    temporal_dims = dims[1]
    feature_dims = dims[2]

    # A single Affine instance so every time step shares the same weights.
    affine_underbelly = Affine(feature_dims, num_units, weight_init=w_init, bias_init=bias_init)

    out_list = [
        activation(affine_underbelly(nn_input[:, iter_step, :]))
        for iter_step in range(0, temporal_dims)
    ]
    # Stacking puts the step axis first; move it back to the middle.
    return cgt.dimshuffle(cgt.stack(out_list), [1, 0, 2])
def __init__(self, input, n_in, n_out, W=None, b=None, activation=cgt.tanh, prefix=""):
    """
    Fully connected layer: ``output = activation(dot(input, W) + b)``.

    input      -- node the layer is applied to.
    n_in       -- number of rows of W.
    n_out      -- number of columns of W and length of b.
    W, b       -- optional existing shared weights; created here when None.
    activation -- callable applied to the linear output, or None for a
                  purely linear layer.
    prefix     -- prepended to the names of the shared variables created here.
    """
    self.n_in = n_in
    self.n_out = n_out

    if W is None:
        # XXX replace with nn init
        # Uniform draw in +/- sqrt(6 / (n_in + n_out)) from the module-level rng.
        bound = np.sqrt(6. / (n_in + n_out))
        W_values = np.asarray(
            rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
            dtype=cgt.floatX)
        # Sigmoid layers start from weights four times larger.
        if activation == cgt.sigmoid:
            W_values *= 4
        W = cgt.shared(W_values, name=prefix+"_W")

    if b is None:
        b = cgt.shared(np.zeros((n_out,), dtype=cgt.floatX), name=prefix+"_b")

    self.W = W
    self.b = b

    # XXX broadcast api may change
    bias_row = cgt.dimshuffle(self.b, ["x", 0])
    lin_output = cgt.broadcast("+", cgt.dot(input, self.W), bias_row, "xx,1x")
    if activation is None:
        self.output = lin_output
    else:
        self.output = activation(lin_output)

    # parameters of the model
    self.params = [self.W, self.b]
def __call__(self, input_btf):
    """
    Run the gated recurrent unit over ``input_btf`` and return the hidden
    state of every step, laid out as (n_batch, n_time_steps, n_features).

    Per step, with h the previous hidden state and x the current input:
        r = nonlinearity_resetgate(W_hr h + W_xr x + b_r)
        u = nonlinearity_updategate(W_hu h + W_xu x + b_u)
        c = W_xc x + b_c + r * (W_hc h)
        h_new = (1 - u) * h + u * c

    Side effect: stores the inferred batch size in ``self.num_batches``.
    """
    # The unrolling iterates over the first axis, so go to
    # (n_time_steps, n_batch, n_features).
    input_tbf = cgt.dimshuffle(input_btf, [1, 0, 2])
    self.num_batches = cgt.infer_shape(input_tbf)[1]

    # Input weights side by side: (num_inputs, 3*num_units). One big matrix
    # product per step instead of three small ones.
    W_in_stacked = cgt.concatenate(
        [self.W_in_to_resetgate, self.W_in_to_updategate,
         self.W_in_to_hidden_update], axis=1)

    # Same layout for the hidden-to-hidden weights.
    W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_resetgate, self.W_hid_to_updategate,
         self.W_hid_to_hidden_update], axis=1)

    # And for the three gate biases.
    b_stacked = cgt.concatenate(
        [self.b_resetgate, self.b_updategate,
         self.b_hidden_update], axis=1)

    def gate_block(stacked, index):
        # Columns of `stacked` that belong to gate number `index`.
        width = self.num_units
        return stacked[:, index*width:(index+1)*width]

    # One recurrent step; input_n is the n'th slice of the input.
    def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        # W_{hr} h_{t-1}, W_{hu} h_{t-1} and W_{hc} h_{t-1} in one product.
        hid_input = cgt.dot(hid_previous, W_hid_stacked)
        # W_{xr} x_t + b_r, W_{xu} x_t + b_u and W_{xc} x_t + b_c.
        input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

        # Reset and update gates.
        resetgate = self.nonlinearity_resetgate(
            gate_block(hid_input, 0) + gate_block(input_n, 0))
        updategate = self.nonlinearity_updategate(
            gate_block(hid_input, 1) + gate_block(input_n, 1))

        # W_{xc} x_t + r_t \odot (W_{hc} h_{t-1})
        # NOTE(review): no nonlinearity is applied to this candidate state --
        # confirm that is intentional.
        hidden_update = gate_block(input_n, 2) + resetgate*gate_block(hid_input, 2)

        # (1 - u_t) h_{t-1} + u_t c_t
        return (1 - updategate)*hid_previous + updategate*hidden_update

    # Repeat the initial hidden state once per batch row.
    hid_init = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)

    # Handed to every call of `step` after the per-step arguments: the
    # hidden-to-hidden weights first, then the input weights and biases.
    non_seqs = [W_hid_stacked, W_in_stacked, b_stacked]

    hid_out = unroll_lstm(
        fn=step,
        sequences=[input_tbf],
        outputs_info=[hid_init],
        go_backwards=self.backwards,
        non_sequences=non_seqs,
        n_steps=self.timesteps)[0]

    # Back to (n_batch, n_time_steps, n_features).
    hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

    # A backward pass yields reversed output; flip it along the time axis.
    if not self.backwards:
        return hid_out
    return cgt.flip(hid_out, [1])
def __call__(self, nn_input_btf):
    """
    Run the LSTM over ``nn_input_btf`` and return the hidden state of every
    step, laid out as (n_batch, n_time_steps, n_features).

    Per step the four gate pre-activations are taken from
    ``x W_in + b + h_prev W_hid`` (column blocks 0..3: input gate, forget
    gate, cell input, output gate), then
        cell = forgetgate * cell_prev + ingate * cell_input
        hid  = outgate * nonlinearity(cell)
    Only the hidden states are returned; the cell states are discarded.
    """
    # The unrolling iterates over the first axis, so go to
    # (n_time_steps, n_batch, n_features).
    nn_input_tbf = cgt.dimshuffle(nn_input_btf, [1, 0, 2])
    _seq_len, num_batch = nn_input_tbf.shape[0], nn_input_tbf.shape[1]

    def gate_block(stacked, index):
        # Columns of `stacked` that belong to gate number `index`.
        width = self.num_units
        return stacked[:, index*width:(index+1)*width]

    # One recurrent step; input_n is the n'th slice of the input.
    def step(input_n, cell_previous, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        input_n = cgt.broadcast("+", cgt.dot(input_n, W_in_stacked), b_stacked, "xx,1x")

        # Pre-activations of all four gates in a single matrix.
        gates = input_n + cgt.dot(hid_previous, W_hid_stacked)

        # Slice out each gate and apply its nonlinearity.
        ingate = self.nonlinearity_ingate(gate_block(gates, 0))
        forgetgate = self.nonlinearity_forgetgate(gate_block(gates, 1))
        cell_input = self.nonlinearity_cell(gate_block(gates, 2))
        outgate = self.nonlinearity_outgate(gate_block(gates, 3))

        # New cell value, then new hidden activation.
        cell = forgetgate*cell_previous + ingate*cell_input
        hid = outgate*self.nonlinearity(cell)
        return [cell, hid]

    # Repeat the initial cell and hidden states once per batch row.
    ones = cgt.ones((num_batch, 1))
    cell_init = cgt.dot(ones, self.cell_init)
    hid_init = cgt.dot(ones, self.hid_init)

    # Handed to every call of `step` after the per-step arguments.
    non_seqs = [self.W_hid_stacked, self.W_in_stacked, self.b_stacked]

    # NOTE(review): `sequences` is passed as a bare node here, whereas the
    # GRU-style __call__ in this file wraps it in a list -- confirm
    # unroll_lstm accepts both forms.
    cell_out, hid_out = unroll_lstm(
        fn=step,
        sequences=nn_input_tbf,
        outputs_info=[cell_init, hid_init],
        go_backwards=self.backwards,
        non_sequences=non_seqs,
        n_steps=self.timesteps)

    # Back to (n_batch, n_time_steps, n_features).
    hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

    # A backward pass yields reversed output; flip it along the time axis.
    if not self.backwards:
        return hid_out
    return cgt.flip(hid_out, [1])
def dimshuffle(x, *pattern):
    """
    Call ``cgt.dimshuffle`` with the pattern given either as separate
    arguments or as one list/tuple.

    ``dimshuffle(x, 1, 0)`` and ``dimshuffle(x, [1, 0])`` are equivalent.
    When the first pattern argument is a list or tuple it is used as the
    whole pattern. Calling with no pattern arguments forwards an empty
    pattern instead of failing on the ``pattern[0]`` lookup.
    """
    # Guard against an empty *pattern before peeking at its first element.
    if pattern and isinstance(pattern[0], (list, tuple)):
        pattern = pattern[0]
    return cgt.dimshuffle(x, list(pattern))