def build_program(self):
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()
    self.next_q_program = fluid.Program()
    self.next_a_program = fluid.Program()

    with fluid.program_guard(self.pred_program):
        obs = layers.data(
            name='obs',
            shape=[self.obs_dim_n[self.agent_index]],
            dtype='float32')
        self.pred_act = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs_n = [
            layers.data(
                name='obs' + str(i),
                shape=[self.obs_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        act_n = [
            layers.data(
                name='act' + str(i),
                shape=[self.act_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        target_q = layers.data(name='target_q', shape=[], dtype='float32')
        self.critic_cost = self.alg.learn(obs_n, act_n, target_q)

    with fluid.program_guard(self.next_q_program):
        obs_n = [
            layers.data(
                name='obs' + str(i),
                shape=[self.obs_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        act_n = [
            layers.data(
                name='act' + str(i),
                shape=[self.act_dim_n[i]],
                dtype='float32') for i in range(self.n)
        ]
        self.next_Q = self.alg.Q_next(obs_n, act_n)

    with fluid.program_guard(self.next_a_program):
        obs = layers.data(
            name='obs',
            shape=[self.obs_dim_n[self.agent_index]],
            dtype='float32')
        self.next_action = self.alg.predict_next(obs)

    if self.speedup:
        # compile the programs so they can run in parallel across devices
        self.pred_program = parl.compile(self.pred_program)
        self.learn_program = parl.compile(self.learn_program,
                                          self.critic_cost)
        self.next_q_program = parl.compile(self.next_q_program)
        self.next_a_program = parl.compile(self.next_a_program)
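# A minimal sketch (not part of the original file) of how the compiled
# pred_program above might be executed. It assumes the standard PARL fluid
# Agent attribute `self.fluid_executor` and that numpy is imported as np;
# the method name `predict` is illustrative.
def predict(self, obs):
    obs = np.expand_dims(obs, axis=0)  # add a batch dimension
    act = self.fluid_executor.run(
        self.pred_program,
        feed={'obs': obs.astype('float32')},
        fetch_list=[self.pred_act])[0]
    return act[0]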
def test_compiled_restore(self):
    agent = TestAgent(self.alg)
    agent.learn_program = parl.compile(agent.learn_program)
    obs = np.random.random([3, 10]).astype('float32')
    previous_output = agent.predict(obs)
    save_path1 = 'model.ckpt'
    agent.save(save_path1)
    agent.restore(save_path1)

    # a new agent instance built from the same algorithm should reproduce
    # the original predictions after restoring the saved parameters
    another_agent = TestAgent(self.alg)
    another_agent.learn_program = parl.compile(another_agent.learn_program)
    another_agent.restore(save_path1)
    current_output = another_agent.predict(obs)
    np.testing.assert_equal(current_output, previous_output)
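# Hedged sketch of the minimal TestAgent the test above assumes; the real
# class lives elsewhere in the test file, so every body here is
# illustrative. The obs placeholder shape [10] matches the [3, 10] batch
# fed by the test.
class TestAgent(parl.Agent):
    def build_program(self):
        self.predict_program = fluid.Program()
        self.learn_program = fluid.Program()
        with fluid.program_guard(self.predict_program):
            obs = layers.data(name='obs', shape=[10], dtype='float32')
            self.predict_output = self.alg.predict(obs)

    def predict(self, obs):
        return self.fluid_executor.run(
            self.predict_program,
            feed={'obs': obs},
            fetch_list=[self.predict_output])[0]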
def build_program(self):
    self.predict_program = fluid.Program()
    with fluid.program_guard(self.predict_program):
        obs = layers.data(
            name='obs', shape=[self.config['obs_dim']], dtype='float32')
        self.predict_action = self.alg.predict(obs)
    self.predict_program = parl.compile(self.predict_program)
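# Illustrative only: a hedged predict() showing how the compiled
# predict_program could be run. `self.fluid_executor` comes from the PARL
# fluid Agent base class; the reshape matches the obs placeholder above.
def predict(self, obs):
    obs = obs.reshape(1, -1).astype('float32')
    act = self.fluid_executor.run(
        self.predict_program,
        feed={'obs': obs},
        fetch_list=[self.predict_action])[0]
    return act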
def build_program(self):
    self.sample_program = fluid.Program()
    self.predict_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.sample_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        self.sample_actions, self.behaviour_logits = self.alg.sample(obs)

    with fluid.program_guard(self.predict_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        self.predict_actions = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        actions = layers.data(name='actions', shape=[], dtype='int64')
        behaviour_logits = layers.data(
            name='behaviour_logits', shape=[self.act_dim], dtype='float32')
        rewards = layers.data(name='rewards', shape=[], dtype='float32')
        dones = layers.data(name='dones', shape=[], dtype='float32')
        lr = layers.data(
            name='lr', shape=[1], dtype='float32', append_batch_size=False)
        entropy_coeff = layers.data(
            name='entropy_coeff',
            shape=[1],
            dtype='float32',
            append_batch_size=False)

        # training data is fed through a py_reader, so learn steps run
        # without a feed dict
        self.learn_reader = fluid.layers.create_py_reader_by_data(
            capacity=32,
            feed_list=[
                obs, actions, behaviour_logits, rewards, dones, lr,
                entropy_coeff
            ])
        obs, actions, behaviour_logits, rewards, dones, lr, entropy_coeff = \
            fluid.layers.read_file(self.learn_reader)

        vtrace_loss, kl = self.alg.learn(obs, actions, behaviour_logits,
                                         rewards, dones, lr, entropy_coeff)
        self.learn_outputs = [
            vtrace_loss.total_loss, vtrace_loss.pi_loss, vtrace_loss.vf_loss,
            vtrace_loss.entropy, kl
        ]
    self.learn_program = parl.compile(self.learn_program,
                                      vtrace_loss.total_loss)
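# Hedged sketch (not from the original file) of how the py_reader-driven
# learn_program above is typically executed: the reader is decorated with a
# batch generator once, started, and then each learn step runs the compiled
# program with no feed dict because the reader supplies the batch. The name
# `learn_data_provider` is a hypothetical generator yielding arrays in the
# feed_list order.
def start_learn_reader(self):
    self.learn_reader.decorate_tensor_provider(self.learn_data_provider)
    self.learn_reader.start()

def learn(self):
    total_loss, pi_loss, vf_loss, entropy, kl = self.fluid_executor.run(
        self.learn_program, fetch_list=self.learn_outputs)
    return total_loss, pi_loss, vf_loss, entropy, kl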
def build_program(self):
    self.sample_program = fluid.Program()
    self.predict_program = fluid.Program()
    self.value_program = fluid.Program()
    self.learn_program = fluid.Program()

    with fluid.program_guard(self.sample_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        sample_actions, values = self.alg.sample(obs)
        self.sample_outputs = [sample_actions, values]

    with fluid.program_guard(self.predict_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        self.predict_actions = self.alg.predict(obs)

    with fluid.program_guard(self.value_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        self.values = self.alg.value(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(name='obs', shape=self.obs_shape, dtype='float32')
        actions = layers.data(name='actions', shape=[], dtype='int64')
        advantages = layers.data(
            name='advantages', shape=[], dtype='float32')
        target_values = layers.data(
            name='target_values', shape=[], dtype='float32')
        lr = layers.data(
            name='lr', shape=[1], dtype='float32', append_batch_size=False)
        entropy_coeff = layers.data(
            name='entropy_coeff',
            shape=[1],
            dtype='float32',
            append_batch_size=False)

        total_loss, pi_loss, vf_loss, entropy = self.alg.learn(
            obs, actions, advantages, target_values, lr, entropy_coeff)
        self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy]
    self.learn_program = parl.compile(self.learn_program, total_loss)
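# Hedged sketch of a matching learn() call: it feeds the placeholders
# defined in learn_program above and fetches the four losses. Argument
# names mirror the layers.data names; the method itself is illustrative and
# assumes numpy is imported as np.
def learn(self, obs, actions, advantages, target_values, lr, entropy_coeff):
    total_loss, pi_loss, vf_loss, entropy = self.fluid_executor.run(
        self.learn_program,
        feed={
            'obs': obs,
            'actions': actions,
            'advantages': advantages,
            'target_values': target_values,
            # lr and entropy_coeff have append_batch_size=False, shape [1]
            'lr': np.array([lr], dtype='float32'),
            'entropy_coeff': np.array([entropy_coeff], dtype='float32')
        },
        fetch_list=self.learn_outputs)
    return total_loss, pi_loss, vf_loss, entropy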
def build_program(self):
    self.pred_program = fluid.Program()
    self.learn_program = fluid.Program()
    self.supervised_eval_program = fluid.Program()

    with fluid.program_guard(self.pred_program):
        obs = layers.data(
            name='obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        self.value = self.alg.predict(obs)

    with fluid.program_guard(self.learn_program):
        obs = layers.data(
            name='obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        action = layers.data(name='act', shape=[1], dtype='int32')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        next_obs = layers.data(
            name='next_obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        terminal = layers.data(name='terminal', shape=[], dtype='bool')
        self.cost = self.alg.learn(obs, action, reward, next_obs, terminal)

    # use parl.compile to distribute data and model to GPUs
    self.learn_program = parl.compile(self.learn_program, loss=self.cost)

    with fluid.program_guard(self.supervised_eval_program):
        obs = layers.data(
            name='obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        action = layers.data(name='act', shape=[1], dtype='int32')
        reward = layers.data(name='reward', shape=[], dtype='float32')
        next_obs = layers.data(
            name='next_obs',
            shape=[CONTEXT_LEN, IMAGE_SIZE[0], IMAGE_SIZE[1]],
            dtype='float32')
        terminal = layers.data(name='terminal', shape=[], dtype='bool')
        self.supervised_cost = self.alg.supervised_eval(
            obs, action, reward, next_obs, terminal)
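# Illustrative learn() for the compiled learn_program above (a sketch, not
# the original file's method). It assumes `self.fluid_executor` from the
# PARL Agent base class; note `act` must be int32 with shape [batch, 1] to
# match the placeholder.
def learn(self, obs, act, reward, next_obs, terminal):
    feed = {
        'obs': obs.astype('float32'),
        'act': act.astype('int32'),
        'reward': reward.astype('float32'),
        'next_obs': next_obs.astype('float32'),
        'terminal': terminal.astype('bool')
    }
    cost = self.fluid_executor.run(
        self.learn_program, feed=feed, fetch_list=[self.cost])[0]
    return cost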