def train(self): """train""" inputs = self.model.create_inputs(mode='train') output_dict = self.model.forward(inputs, mode='train') total_loss = 0 if 'click' in self._output_type: click_id = inputs['click_id'] click_prob = output_dict['click_prob'] click_loss = layers.reduce_mean( layers.cross_entropy(input=click_prob, label=click_id)) total_loss += click_loss if 'credit' in self._output_type: credit = inputs['credit'] * self._credit_scale credit_pred = output_dict['credit_pred'] credit_loss = layers.reduce_mean( layers.square_error_cost(input=credit_pred, label=credit)) total_loss += credit_loss if 'rate' in self._output_type: rate = layers.cast(inputs['click_id'], 'float32') * self._rate_scale rate_pred = output_dict['rate_pred'] rate_loss = layers.reduce_mean( layers.square_error_cost(input=rate_pred, label=rate)) total_loss += rate_loss if self.optimizer == 'Adam': optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-4) elif self.optimizer == 'SGD': optimizer = fluid.optimizer.SGD(learning_rate=self.lr) optimizer.minimize(total_loss) fetch_dict = OrderedDict() fetch_dict[ 'loss'] = total_loss # don't rename 'loss', which will be used in parallel exe in computational task if 'click' in self._output_type: fetch_dict['click_prob'] = click_prob fetch_dict['click_id'] = click_id fetch_dict['click_loss'] = click_loss if 'credit' in self._output_type: fetch_dict['credit_pred'] = credit_pred / self._credit_scale fetch_dict['credit'] = credit / self._credit_scale fetch_dict['credit_loss'] = credit_loss if 'rate' in self._output_type: fetch_dict['rate_pred'] = rate_pred / self._rate_scale fetch_dict['rate'] = rate / self._rate_scale fetch_dict['rate_loss'] = rate_loss return {'fetch_dict': fetch_dict}
def learn(self, obs, action, reward, next_obs, terminal): """ 使用DQN算法更新self.model的value网络 """ # 从target_model中获取 max Q' 的值,用于计算target_Q next_pred_value = self.target_model.value(next_obs) best_v = layers.reduce_max(next_pred_value, dim=1) best_v.stop_gradient = True # 阻止梯度传递 terminal = layers.cast(terminal, dtype='float32') target = reward + (1.0 - terminal) * self.gamma * best_v pred_value = self.model.value(obs) # 获取Q预测值 # 将action转onehot向量,比如:3 => [0,0,0,1,0],独热编码有好处 action_onehot = layers.one_hot(action, self.act_dim) action_onehot = layers.cast(action_onehot, dtype='float32') # 下面一行是逐元素相乘,拿到action对应的 Q(s,a) # 比如:pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]] # ==> pred_action_value = [[3.9]] pred_action_value = layers.reduce_sum(layers.elementwise_mul( action_onehot, pred_value), dim=1) # 计算 Q(s,a) 与 target_Q的均方差,得到loss cost = layers.square_error_cost(pred_action_value, target) cost = layers.reduce_mean(cost) optimizer = fluid.optimizer.Adam(learning_rate=self.lr) # 使用Adam优化器 optimizer.minimize(cost) return cost
def define_learn(self, obs, action, reward, next_obs, terminal, weight):
    # Q(s, a | θ)
    pred_value = self.model.value(obs)
    # Q(s', a' | θ')
    targetQ_predict_value = self.target_model.value(next_obs)
    # Q(s', a' | θ)
    next_s_predict_value = self.model.value(next_obs)
    # argmax_a' Q(s', a' | θ)
    greedy_action = fluid_argmax(next_s_predict_value)
    predict_onehot = fluid.layers.one_hot(greedy_action, self.action_dim)
    # Q(s', argmax_a' Q(s', a' | θ) | θ')
    best_v = fluid.layers.reduce_sum(
        fluid.layers.elementwise_mul(predict_onehot, targetQ_predict_value),
        dim=1)
    best_v.stop_gradient = True
    # TD target: R + γ * Q(s', argmax_a' Q(s', a' | θ) | θ')
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.action_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)

    # New TD error, used to update sample priorities.
    newTd = layers.abs(target - pred_action_value)
    cost = layers.square_error_cost(pred_action_value, target)
    # weight is the per-sample importance weight; it scales how much each
    # sample contributes to the loss update.
    cost = weight * cost
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost, newTd
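# fluid_argmax is not defined in this snippet. A minimal sketch of one
# possible implementation, assuming it should return the per-row greedy
# action indices with shape [N, 1], as fluid.layers.one_hot expects;
# layers.topk with k=1 returns both the max values and their indices.
def fluid_argmax(x):
    """Hypothetical helper: indices of the per-row maximum, shape [N, 1]."""
    _, max_index = fluid.layers.topk(x, k=1)
    return max_index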
def _ensemble_predict(self, obs):
    actor_outputs = []
    for i in range(self.ensemble_num):
        actor_outputs.append(self.actors[i].predict(obs))
    batch_actions = layers.concat(actor_outputs, axis=0)
    batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])

    critic_outputs = []
    for i in range(self.ensemble_num):
        critic_output = self.critics[i].predict(batch_obs, batch_actions)
        critic_output = layers.unsqueeze(critic_output, axes=[1])
        critic_outputs.append(critic_output)
    score_matrix = layers.concat(critic_outputs, axis=1)

    # Normalize scores given by each critic
    sum_critic_score = layers.reduce_sum(
        score_matrix, dim=0, keep_dim=True)
    sum_critic_score = layers.expand(
        sum_critic_score, expand_times=[self.ensemble_num, 1])
    norm_score_matrix = score_matrix / sum_critic_score

    actions_mean_score = layers.reduce_mean(
        norm_score_matrix, dim=1, keep_dim=True)
    best_score_id = layers.argmax(actions_mean_score, axis=0)
    best_score_id = layers.cast(best_score_id, dtype='int32')

    ensemble_predict_action = layers.gather(batch_actions, best_score_id)
    ensemble_predict_action = layers.squeeze(
        ensemble_predict_action, axes=[0])
    return ensemble_predict_action
def value(self, obs):
    obs = obs / 255.0
    out = self.conv1(obs)
    out = layers.pool2d(
        input=out, pool_size=2, pool_stride=2, pool_type='max')
    out = self.conv2(out)
    out = layers.pool2d(
        input=out, pool_size=2, pool_stride=2, pool_type='max')
    out = self.conv3(out)
    out = layers.pool2d(
        input=out, pool_size=2, pool_stride=2, pool_type='max')
    out = self.conv4(out)
    out = layers.flatten(out, axis=1)

    if self.algo == 'Dueling':
        As = self.fc2_adv(self.fc1_adv(out))
        V = self.fc2_val(self.fc1_val(out))
        Q = As + (V - layers.reduce_mean(As, dim=1, keep_dim=True))
    else:
        Q = self.fc1(out)
    return Q
def _actor_learn(self, obs):
    action = self.model.policy(obs)
    Q = self.model.value(obs, action)
    cost = layers.reduce_mean(-1.0 * Q)
    optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
    optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
    return cost
def value(self, obs):
    out = self.fc1(obs)
    out = self.fc2(out)
    # Q = self.fc3(h2)
    As = self.fc2_adv(self.fc1_adv(out))
    V = self.fc2_val(self.fc1_val(out))
    Q = As + (V - layers.reduce_mean(As, dim=1, keep_dim=True))
    return Q
def value(self, obs):
    if self.algo == 'Dueling':
        As = self.fc3_adv(self.fc2_adv(self.fc1_adv(obs)))
        V = self.fc3_val(self.fc2_val(self.fc1_val(obs)))
        Q = As + (V - layers.reduce_mean(As, dim=1, keep_dim=True))
    else:
        h1 = self.fc1(obs)
        h2 = self.fc2(h1)
        Q = self.fc3(h2)
    return Q
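# A minimal NumPy sketch of the dueling aggregation used in the value()
# methods above, Q = A + (V - mean(A)), with illustrative shapes
# (batch size 1, 3 actions). Subtracting the mean advantage keeps the
# state-value stream V identifiable.
import numpy as np

V = np.array([[1.5]], dtype='float32')              # state value, [B, 1]
As = np.array([[0.2, -0.1, 0.5]], dtype='float32')  # advantages, [B, act_dim]
Q = As + (V - As.mean(axis=1, keepdims=True))
print(Q)  # [[1.5, 1.2, 1.8]]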
def train_actor(inputs):
    output_dict = self.model.forward(inputs, output_type='max_Q')
    max_Q = output_dict['Q']
    actor_loss = layers.reduce_mean(-1.0 * max_Q)
    # actor lr should be smaller than critic lr, so critic can learn faster
    actor_lr = self.lr * 0.1
    if self.optimizer == 'Adam':
        optimizer = fluid.optimizer.Adam(
            learning_rate=actor_lr, epsilon=1e-4)
    elif self.optimizer == 'SGD':
        optimizer = fluid.optimizer.SGD(learning_rate=actor_lr)
    optimizer.minimize(
        actor_loss, parameter_list=self.model.actor_param_names)
    return actor_loss
def test(self): """test""" inputs = self.model.create_inputs(mode='test') click_prob = self.model.forward(inputs) fetch_dict = OrderedDict() fetch_dict['click_prob'] = click_prob fetch_dict['click_id'] = inputs['click_id'] + layers.reduce_mean( click_prob ) * 0 # IMPORTANT!!! equals to label = label, otherwise parallel executor won't get this variable return {'fetch_dict': fetch_dict}
def train_critic(inputs, click_id):
    output_dict = self.model.forward(inputs, output_type='c_Q')
    c_Q = output_dict['Q']
    target_Q = self.get_target_Q(inputs, click_id)
    target_Q.stop_gradient = True
    critic_loss = layers.reduce_mean(
        layers.square_error_cost(c_Q, target_Q))
    if self.optimizer == 'Adam':
        optimizer = fluid.optimizer.Adam(
            learning_rate=self.lr, epsilon=1e-4)
    elif self.optimizer == 'SGD':
        optimizer = fluid.optimizer.SGD(learning_rate=self.lr)
    optimizer.minimize(critic_loss)
    return critic_loss
def test_sync_weights_with_batch_norm(self):
    model = TestModel3()
    target_model = deepcopy(model)

    program1 = fluid.Program()
    program2 = fluid.Program()
    with fluid.program_guard(program1):
        obs = layers.data(name='obs', shape=[32, 128, 128], dtype="float32")
        model_output = model.predict(obs)
        loss = layers.reduce_mean(model_output)
        optimizer = fluid.optimizer.AdamOptimizer(1e-3)
        optimizer.minimize(loss)
    with fluid.program_guard(program2):
        obs = layers.data(name='obs', shape=[32, 128, 128], dtype="float32")
        model_output = model.predict(obs)
        target_model_output = target_model.predict(obs)
    self.executor.run(fluid.default_startup_program())

    N = 10
    random_obs = np.random.random(size=(N, 32, 128, 128)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        outputs = self.executor.run(
            program2,
            feed={'obs': x},
            fetch_list=[model_output, target_model_output])
        self.assertNotEqual(
            np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten()))

    # Run the optimizer so that the batch_norm parameters of model and
    # target_model become different.
    N = 100
    random_obs = np.random.random(size=(N, 32, 128, 128)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        self.executor.run(program1, feed={'obs': x})

    model.sync_weights_to(target_model)

    random_obs = np.random.random(size=(N, 32, 128, 128)).astype('float32')
    for i in range(N):
        x = np.expand_dims(random_obs[i], axis=0)
        outputs = self.executor.run(
            program2,
            feed={'obs': x},
            fetch_list=[model_output, target_model_output])
        self.assertEqual(
            np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten()))
def learn(self, obs, action, reward):
    act_prob = self.model(obs)
    # log_prob = layers.cross_entropy(act_prob, action)
    log_prob = layers.reduce_sum(
        -1.0 * layers.log(act_prob) * layers.one_hot(
            action, act_prob.shape[1]),
        dim=1)
    cost = log_prob * reward
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(cost)
    return cost
def _actor_learn(self, obs):
    action = self.model.policy(obs)
    Q = self.model.value(obs, action)
    cost = layers.reduce_mean(-1.0 * Q)
    # optimizer = fluid.optimizer.AdamOptimizer(self.actor_lrvalue)
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=fluid.layers.piecewise_decay(
            boundaries=self.boundaries, values=self.actor_lrvalue),
        regularization=fluid.regularizer.L2Decay(1e-4))
    optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
    return cost
def learn(self, obs, action, reward): """ 用policy gradient 算法更新policy model """ act_prob = self.model(obs) # 获取输出动作概率 log_prob = layers.cross_entropy(act_prob, action) # 交叉熵 #log_prob = layers.reduce_sum(-1.0 * layers.log(act_prob) * layers.one_hot(action, act_prob.shape[1]),dim=1) cost = log_prob * reward cost = layers.reduce_mean(cost) optimizer = fluid.optimizer.Adam(self.lr) optimizer.minimize(cost) return cost
def test(self): """test""" inputs = self.model.create_inputs(mode='test') output_dict = self.model.forward(inputs, mode='test') fetch_dict = OrderedDict() if 'click' in self._output_type: fetch_dict['click_prob'] = output_dict['click_prob'] fetch_dict['click_id'] = inputs['click_id'] + layers.reduce_mean( output_dict['click_prob'] ) * 0 # IMPORTANT!!! equals to label = label, otherwise parallel executor won't get this variable if 'credit' in self._output_type: fetch_dict['credit_pred'] = output_dict[ 'credit_pred'] / self._credit_scale fetch_dict['credit'] = inputs['credit'] + layers.reduce_mean( output_dict['credit_pred']) * 0 if 'rate' in self._output_type: fetch_dict[ 'rate_pred'] = output_dict['rate_pred'] / self._rate_scale fetch_dict['rate'] = layers.cast(inputs['click_id'], 'float32') \ + layers.reduce_mean(output_dict['rate_pred']) * 0 return {'fetch_dict': fetch_dict}
def _critic_learn(self, obs, action, reward, next_obs, terminal):
    next_action = self.target_model.policy(next_obs)
    next_Q = self.target_model.value(next_obs, next_action)

    terminal = layers.cast(terminal, dtype='float32')
    target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
    target_Q.stop_gradient = True

    Q = self.model.value(obs, action)
    cost = layers.square_error_cost(Q, target_Q)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
    optimizer.minimize(cost)
    return cost
def test(self): """test""" inputs = self.model.create_inputs(mode='train') click_id = layers.cast(inputs['click_id'], 'float32') * self._reward_scale output_dict = self.model.forward(inputs, output_type='c_Q') c_Q = output_dict['Q'] target_Q = self.get_target_Q(inputs, click_id) loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q)) fetch_dict = OrderedDict() fetch_dict['loss'] = loss fetch_dict['c_Q'] = c_Q / self._reward_scale fetch_dict['click_id'] = click_id / self._reward_scale return {'fetch_dict': fetch_dict}
def test(self): """test""" inputs = self.model.create_inputs(mode='train') reward = layers.cast(inputs['reward'], 'float32') c_Q = self.model.forward(inputs, output_type='c_Q') max_Q = self.target_model.forward(inputs, output_type='max_Q') target_Q = self.get_target_Q(max_Q, reward) loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q)) fetch_dict = OrderedDict() fetch_dict['loss'] = loss fetch_dict['c_Q'] = c_Q fetch_dict['reward'] = reward return {'fetch_dict': fetch_dict}
def learn(self, obs, action, reward, next_obs, terminal):
    '''
    :param obs: St
    :param action: At
    :param reward: Rt+1
    :param next_obs: St+1
    :param terminal: done, True means the episode has ended
    :return: value of the loss function
    '''
    # Compute target_Q with the target network.
    target_Q_tensor = self.target_model.value(next_obs)  # value vector for St+1
    max_Q = layers.reduce_max(target_Q_tensor, dim=1)  # row-wise max, reduced along dim=1
    max_Q.stop_gradient = True  # stop gradient updates

    # terminal is not a scalar, so it cannot be used in a condition directly.
    terminal = layers.cast(terminal, dtype="float32")
    target_Q = reward + (1.0 - terminal) * self.gamma * max_Q

    # Compute predict_Q with the main network.
    predict_Q_tensor = self.model.value(obs)
    # Convert action to a one-hot vector and cast every element to float.
    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype="float32")
    # Element-wise multiply, then reduce the tensor rank.
    # e.g. predict_Q_tensor = [[2.3, 5.7, 1.2, 3.9, 1.4],  action_onehot = [[0, 0, 0, 1, 0],
    #                          [2.1, 3.7, 4.5, 6.7, 7.1]]                   [0, 1, 0, 0, 0]]
    # the element-wise product is                          [[0, 0, 0, 3.9, 0],
    #                                                       [0, 3.7, 0, 0, 0]]
    # and reduce_sum along dim=1 then gives                 [3.9, 3.7]
    predict_Q = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, predict_Q_tensor), dim=1)

    # Average loss of each sample over the batch.
    cost = layers.square_error_cost(predict_Q, target_Q)
    cost = layers.reduce_mean(cost)

    # Declare the optimizer (Adam).
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
    optimizer.minimize(cost)  # specify the optimization target
    return cost
def learn(self, obs, action, reward, next_obs, terminal):
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=-1)
    best_v.stop_gradient = True
    terminal = layers.cast(terminal, dtype="float32")
    target = reward + (1.0 - terminal) * self.gamma * best_v

    pred_value = self.model.value(obs)
    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype="float32")
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(pred_value, action_onehot), dim=-1)

    cost = layers.square_error_cost(target, pred_action_value)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
    optimizer.minimize(cost)
    return cost
def _critic_learn(self, obs, action, reward, next_obs, terminal):
    next_action = self.target_model.policy(next_obs)
    next_Q = self.target_model.value(next_obs, next_action)

    terminal = layers.cast(terminal, dtype='float32')
    target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
    target_Q.stop_gradient = True

    Q = self.model.value(obs, action)
    cost = layers.square_error_cost(Q, target_Q)
    cost = layers.reduce_mean(cost)
    # optimizer = fluid.optimizer.AdamOptimizer(self.critic_lrvalue)
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=fluid.layers.piecewise_decay(
            boundaries=self.boundaries, values=self.critic_lrvalue),
        regularization=fluid.regularizer.L2Decay(1e-4))
    optimizer.minimize(cost)
    return cost
def learn(self, obs, action, reward): """ :param obs: [B,4] :param action: [B,1] :param reward: [B,] :return: """ act_prob = self.model(obs) # [B,2] # [B, 2] -> [B, ] log_prob = layers.reduce_sum( -1.0 * layers.log(act_prob) * layers.one_hot(action, depth=act_prob.shape[1]), dim=1, keep_dim=False) cost = log_prob * reward cost = layers.reduce_mean(cost) optimizer = fluid.optimizer.Adam(self.lr) optimizer.minimize(cost) return cost
def train(self): """train""" inputs = self.model.create_inputs(mode='train') reward = layers.cast(inputs['reward'], 'float32') c_Q = self.model.forward(inputs, output_type='c_Q') max_Q = self.target_model.forward(inputs, output_type='max_Q') target_Q = self.get_target_Q(max_Q, reward) loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q)) if self.optimizer == 'Adam': optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-4) elif self.optimizer == 'SGD': optimizer = fluid.optimizer.SGD(learning_rate=self.lr) optimizer.minimize(loss) fetch_dict = OrderedDict() fetch_dict['loss'] = loss # don't rename 'loss', which will be used in parallel exe in computational task fetch_dict['c_Q'] = c_Q fetch_dict['reward'] = reward return {'fetch_dict': fetch_dict}
def train(self): """train""" inputs = self.model.create_inputs(mode='train') click_id = inputs['click_id'] click_prob = self.model.forward(inputs, mode='train') loss = layers.reduce_mean( layers.cross_entropy(input=click_prob, label=click_id)) if self.optimizer == 'Adam': optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-4) elif self.optimizer == 'SGD': optimizer = fluid.optimizer.SGD(learning_rate=self.lr) optimizer.minimize(loss) fetch_dict = OrderedDict() fetch_dict[ 'loss'] = loss # don't rename 'loss', which will be used in parallel exe in computational task fetch_dict['click_prob'] = click_prob fetch_dict['click_id'] = click_id return {'fetch_dict': fetch_dict}
def train(self): """train""" inputs = self.model.create_inputs(mode='train') click_id = layers.cast(inputs['click_id'], 'float32') * self._reward_scale output_dict = self.model.forward(inputs, output_type='c_Q') c_Q = output_dict['Q'] target_Q = self.get_target_Q(inputs, click_id) target_Q.stop_gradient = True loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q)) if self.optimizer == 'Adam': optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-4) elif self.optimizer == 'SGD': optimizer = fluid.optimizer.SGD(learning_rate=self.lr) optimizer.minimize(loss) fetch_dict = OrderedDict() fetch_dict['loss'] = loss # don't rename 'loss', which will be used in parallel exe in computational task fetch_dict['c_Q'] = c_Q / self._reward_scale fetch_dict['click_id'] = click_id / self._reward_scale return {'fetch_dict': fetch_dict}
def ensemble_predict(self, obs): """ ensemble predict: 1. For actions of all actors, each critic will score them and normalize its scores; 2. For each actor, will calculate its score by average scores given by all critics 3. choose action of the actor whose score is best """ actor_outputs = [] for i in range(self.ensemble_num): actor_outputs.append(self.models[i].policy(obs)) batch_actions = layers.concat(actor_outputs, axis=0) batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1]) critic_outputs = [] for i in range(self.ensemble_num): critic_output = self.models[i].value(batch_obs, batch_actions) critic_output = layers.unsqueeze(critic_output, axes=[1]) critic_outputs.append(critic_output) score_matrix = layers.concat(critic_outputs, axis=1) # Normalize scores given by each critic sum_critic_score = layers.reduce_sum(score_matrix, dim=0, keep_dim=True) sum_critic_score = layers.expand(sum_critic_score, expand_times=[self.ensemble_num, 1]) norm_score_matrix = score_matrix / sum_critic_score actions_mean_score = layers.reduce_mean(norm_score_matrix, dim=1, keep_dim=True) best_score_id = layers.argmax(actions_mean_score, axis=0) best_score_id = layers.cast(best_score_id, dtype='int32') ensemble_predict_action = layers.gather(batch_actions, best_score_id) return ensemble_predict_action