import collections
import random
import time

import numpy as np

# Assumes `env`, `flags`, `ActorCritic`, and `empty_goal_action` are defined
# at module scope (environment, hyperparameter container, model class, and
# the action taken for an empty goal, respectively).


def train():
    memory = []
    Transition = collections.namedtuple(
        "Transition",
        ["state", "action", "reward", "next_state", "next_action"])
    model = ActorCritic(flags.n_actions, flags.n_features, flags.lr_C,
                        flags.lr_A, flags.gamma, empty_goal_action)
    loss_his = []
    entropy_his = []
    reward_his = []
    for ii in range(flags.max_epoch):
        state = env.reset()
        init_state = state.copy()
        reward_all = 0
        done = False
        steps = 0
        loss = 0
        t_start = time.time()
        # SARSA-style rollout: the next action is chosen on-policy and
        # stored together with the transition it follows.
        action = model.choose_action(state)
        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = model.choose_action(next_state)
            reward_all += reward
            steps += 1
            # Bounded replay buffer: evict the oldest transition first.
            if len(memory) > flags.memory_size:
                memory.pop(0)
            memory.append(
                Transition(state, action, reward, next_state, next_action))
            state = next_state
            action = next_action
            if len(memory) > flags.batch_size:
                batch_transition = random.sample(memory, flags.batch_size)
                (batch_state, batch_action, batch_reward, batch_next_state,
                 batch_next_action) = map(np.array, zip(*batch_transition))
                loss, _ = model.train(state=batch_state,
                                      action=batch_action,
                                      reward=batch_reward,
                                      state_=batch_next_state,
                                      action_=batch_next_action)
        entropy = model.compute_entropy(init_state)
        # Skip logging until the buffer is full enough for a first update.
        if loss != 0:
            loss_his.append(loss)
            entropy_his.append(entropy)
            reward_his.append(reward_all)
        print("epoch=", ii, "/time=", time.time() - t_start, "/loss=", loss,
              "/entropy=", entropy, "/reward=", reward_all)
    return loss_his, entropy_his, reward_his
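The batching idiom above is compact but easy to misread: zip(*batch_transition) transposes a list of Transition tuples into per-field tuples, and map(np.array, ...) stacks each field into a batch array. Below is a minimal standalone sketch of that mechanism; the 4-dimensional states, buffer size, and batch size are illustrative values, not taken from the code above.

import collections
import random

import numpy as np

Transition = collections.namedtuple(
    "Transition", ["state", "action", "reward", "next_state", "next_action"])

# A toy buffer of 32 transitions with 4-dimensional states.
memory = [Transition(np.zeros(4), 0, 1.0, np.ones(4), 1) for _ in range(32)]

batch = random.sample(memory, 8)
states, actions, rewards, next_states, next_actions = map(
    np.array, zip(*batch))
print(states.shape)   # (8, 4): states stacked along the batch axis
print(rewards.shape)  # (8,): one scalar reward per sampled transition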
import gym
import torch.multiprocessing as mp

# Assumes the ActorCritic network class plus the module-level globals
# EPISODES (total episode budget shared by all workers) and T_MAX (number
# of steps between gradient updates).


class Agent(mp.Process):
    def __init__(self, global_actor_critic, optimizer, input_dims, nb_actions,
                 gamma, lr, name, global_ep_index, env_id):
        super(Agent, self).__init__()
        self.local_actor_critic = ActorCritic(input_dims, nb_actions, gamma)
        self.global_actor_critic = global_actor_critic
        self.name = "w%02i" % name
        self.episode_index = global_ep_index
        self.env = gym.make(env_id)
        self.optimizer = optimizer

    def run(self):
        t_step = 1
        while self.episode_index.value < EPISODES:
            done = False
            observation = self.env.reset()  # old (pre-0.26) Gym API
            score = 0
            self.local_actor_critic.clear_memory()
            while not done:
                action = self.local_actor_critic.choose_action(observation)
                observation_, reward, done, info = self.env.step(action)
                score += reward
                self.local_actor_critic.remember(observation, action, reward)
                # Every T_MAX steps (or at episode end): compute the loss on
                # the local rollout, copy local gradients onto the shared
                # global network, step the shared optimizer, then re-sync the
                # local network from the global one.
                if (t_step % T_MAX) == 0 or done:
                    loss = self.local_actor_critic.calc_loss(done)
                    self.optimizer.zero_grad()
                    loss.backward()
                    for local_param, global_param in zip(
                            self.local_actor_critic.parameters(),
                            self.global_actor_critic.parameters()):
                        global_param._grad = local_param.grad
                    self.optimizer.step()
                    self.local_actor_critic.load_state_dict(
                        self.global_actor_critic.state_dict())
                    self.local_actor_critic.clear_memory()
                t_step += 1
                observation = observation_
            # Atomically bump the shared episode counter.
            with self.episode_index.get_lock():
                self.episode_index.value += 1
            print(self.name, 'episode ', self.episode_index.value,
                  'reward %.1f' % score)
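For completeness, here is a minimal launch sketch for these workers. Everything in it is an assumption layered on top of the class above: the environment id, network sizes, and hyperparameters are hypothetical, and a stock torch.optim.Adam is used even though reference A3C implementations usually substitute a shared-state variant (often called SharedAdam, not part of PyTorch itself) so that the optimizer's moment estimates also live in shared memory.

import torch.multiprocessing as mp
import torch.optim as optim

EPISODES = 3000  # assumed global episode budget, read by Agent.run()
T_MAX = 5        # assumed update interval, read by Agent.run()

if __name__ == "__main__":
    env_id = "CartPole-v0"  # hypothetical environment
    input_dims = [4]        # hypothetical observation size
    nb_actions = 2          # hypothetical action count
    lr = 1e-4
    global_actor_critic = ActorCritic(input_dims, nb_actions, gamma=0.99)
    global_actor_critic.share_memory()  # move parameters into shared memory
    optimizer = optim.Adam(global_actor_critic.parameters(), lr=lr)
    global_ep = mp.Value("i", 0)        # shared episode counter

    workers = [
        Agent(global_actor_critic, optimizer, input_dims, nb_actions,
              gamma=0.99, lr=lr, name=i, global_ep_index=global_ep,
              env_id=env_id)
        for i in range(mp.cpu_count())
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()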