def _build_baseline(self, emb_size, n_observations, lstm_n_cells, o, r, lr):
    # Baseline network: embedding -> LSTM -> linear MLP predicting the expected
    # return b(o) at each timestep, trained by squared error against r.
    # Assumes numpy as np, theano, theano.tensor as tt, and the project's
    # Embedding, LstmRecurrent, MLP and clip_norms helpers are imported at
    # module level.
    b_input = Embedding(name="bemb", size=emb_size,
                        n_features=n_observations, input=o)

    b_lstm = LstmRecurrent(name="blstm", size=lstm_n_cells, seq_output=True,
                           out_cells=False, peepholes=False,
                           output_initial_state=False, p_drop=0.0)
    b_lstm.connect(b_input)

    b_out = MLP([1], ['linear'], [0.0], name="mlpb")
    b_out.connect(b_lstm)

    # Drop the trailing singleton dimension: (time, seq_id, 1) -> (time, seq_id).
    b = b_out.output()
    b = b.reshape((b.shape[0], b.shape[1]))

    # Plain SGD on the squared-error loss between the baseline and the returns.
    params = b_out.get_params()
    loss = ((b - r) ** 2).sum()
    d_loss = theano.grad(loss, params)

    upd = [(p, p - lr * dp) for p, dp in zip(params, d_loss)]
    self.blearn = theano.function([o, r, lr], loss, updates=upd)

    return b
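# A minimal usage sketch (not part of the original code) showing how the
# compiled self.blearn function would be driven: one squared-error SGD step
# of the baseline on a random, time-major (time, seq_id) batch. The shapes
# and the learning rate are illustrative assumptions.
def _example_baseline_step(self, n_observations, seq_len=5, n_seqs=2, lr=0.01):
    o = np.random.randint(0, n_observations,
                          size=(seq_len, n_seqs)).astype('int32')
    r = np.random.randn(seq_len, n_seqs).astype(theano.config.floatX)
    return self.blearn(o, r, lr)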
def _build_model_o(self, emb_size, n_observations, lstm_n_cells, oclf_n_hidden,
                   oclf_n_layers, n_actions, oclf_activation):
    o = tt.imatrix(name='o')  # Observation ids; dimensions: (time, seq_id)
    a = tt.imatrix(name='a')  # Actions taken, (time, seq_id)
    r = tt.matrix(name='r')   # Per-timestep rewards (returns), (time, seq_id)
    lr = tt.scalar(name='lr')

    # Baseline value estimate; its LSTM size is hard-coded to 10 cells.
    b = self._build_baseline(emb_size, n_observations, 10, o, r, lr)

    # Policy network: embedding -> LSTM -> MLP ending in a softmax over actions.
    l_input = Embedding(name="emb", size=emb_size,
                        n_features=n_observations, input=o)
    prev_layer = l_input

    l_lstm = LstmRecurrent(name="lstm", size=lstm_n_cells, seq_output=True,
                           out_cells=False, peepholes=False,
                           output_initial_state=False, p_drop=0.0)
    l_lstm.connect(prev_layer)
    prev_layer = l_lstm

    l_action = MLP([oclf_n_hidden] * oclf_n_layers + [n_actions],
                   [oclf_activation] * oclf_n_layers + ['softmax'],
                   [0.0] * oclf_n_layers + [0.0],
                   name="mlp")
    l_action.connect(prev_layer)
    prev_layer = l_action

    pi = prev_layer.output()

    # Flatten the action distributions so that all timesteps are stacked
    # in one matrix, timestep by timestep.
    orig_shape = pi.shape
    pi = tt.reshape(pi, (pi.shape[0] * pi.shape[1], pi.shape[2]))
    col_actions = tt.reshape(a, (pi.shape[0],))
    col_rewards = tt.reshape(r, (pi.shape[0],))
    col_b = tt.reshape(b, (pi.shape[0],))

    params = list(l_action.get_params())
    # Optionally filter, e.g. [x for x in params if not x.name.startswith('mlp_0')].
    print(params)  # debug: list the trainable policy parameters

    # REINFORCE objective with a baseline: log-probability of the chosen
    # actions weighted by the advantage (return minus baseline).
    lin_actions_p = pi[tt.arange(pi.shape[0]), col_actions]  # + 1e-7
    objective = tt.sum(tt.log(lin_actions_p) * (col_rewards - col_b))  # * 1.0 / orig_shape[1]

    pi = tt.reshape(pi, orig_shape)

    d_objective = theano.grad(objective, params)
    d_objective = clip_norms(d_objective, 5.0)

    # Gradient ascent on the objective (hence the plus sign).
    upd = [(p, p + lr * dp) for p, dp in zip(params, d_objective)]

    self.learn = theano.function([o, a, r, lr],
                                 [pi, objective, b] + d_objective,
                                 updates=upd)
    self.pi = theano.function([o], pi)

    # Keep the parameters and a snapshot of their initial values.
    self.params = params
    self.orig_values = [np.copy(param.get_value()) for param in params]
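# A minimal sketch (an assumption, not part of the original training loop) of
# one full REINFORCE update using the functions compiled above: a baseline
# regression step via self.blearn, then a policy-gradient ascent step via
# self.learn. Random data stands in for collected episodes.
def _example_reinforce_step(self, n_observations, n_actions,
                            seq_len=5, n_seqs=2, lr=0.01):
    o = np.random.randint(0, n_observations,
                          size=(seq_len, n_seqs)).astype('int32')
    a = np.random.randint(0, n_actions,
                          size=(seq_len, n_seqs)).astype('int32')
    r = np.random.randn(seq_len, n_seqs).astype(theano.config.floatX)

    baseline_loss = self.blearn(o, r, lr)
    outputs = self.learn(o, a, r, lr)
    pi, objective = outputs[0], outputs[1]  # remaining outputs are gradients
    return baseline_loss, objective, pi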