def __init__(self):
    self.model = ActorCritic(num_actions=len(ACTIONS))
    self.model.to("cuda")
    self.worker = RolloutWorker(self, ENV_ID, NUM_WORKERS, T)
    self.train_history = dict()
    self.train_history['frames_trained'] = 0
    self.train_history['average_entropy'] = []
    self.train_history['average_values'] = []
def train(num=2000):
    agent = ActorCritic(env.observation_space.shape[0], env.action_space.n)
    # agent.load_model()
    steps = []
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = agent.get_action(
            np.reshape(old_observation, [1, env.observation_space.shape[0]]))
        done = False
        step = 0
        while not done:
            step += 1
            # env.render()
            observation, reward, done, info = env.step(old_action)
            if done:
                reward = -20
            td_error = agent.train_critic(
                reward,
                np.reshape(old_observation, [1, env.observation_space.shape[0]]),
                np.reshape(observation, [1, env.observation_space.shape[0]]))
            agent.train_actor(
                td_error,
                np.reshape(old_observation, [1, env.observation_space.shape[0]]),
                old_action)
            old_observation = observation
            old_action = agent.get_action(
                np.reshape(old_observation, [1, env.observation_space.shape[0]]))
            if done:
                steps.append(step)
                print("{}:{} steps".format(i_episode, step))
                agent.save_model()
                break
def main(args):
    training = int(args[1])
    test_interval = int(args[2])
    load = int(args[3])
    env = gym.make('BipedalWalker-v2')
    memory = None
    if training == 1:
        memory = Memory(MAX_BUFFER)
        prepopulate_memory(memory, env)
    rewards = []
    start_time = time.time()
    max_reward = 0
    trainer = ActorCritic(env.observation_space.shape[0],
                          env.action_space.shape[0], memory, load)
    for episode in np.arange(MAX_EPISODES):
        if training == 1:
            env_run(env, episode, trainer, memory, True)
        if episode % test_interval == 0:
            max_reward += env_run(env, episode, trainer, None, False)
            rewards.append(max_reward / ((episode / test_interval) + 1))
    plt.plot(rewards)
    plt.show()
def train(num=500):
    agent = ActorCritic(env.observation_space.shape[0], [-A_BOUND, A_BOUND])
    # agent.load_model()
    steps = []
    RENDER = False
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = agent.get_action(
            np.reshape(old_observation, [1, env.observation_space.shape[0]]))
        done = False
        step = 0
        ep_r = 0
        while not done:
            step += 1
            if RENDER:
                env.render()
            observation, reward, done, info = env.step(old_action)
            reward /= 10
            td_error = agent.train_critic(
                reward,
                np.reshape(old_observation, [1, env.observation_space.shape[0]]),
                np.reshape(observation, [1, env.observation_space.shape[0]]))
            agent.train_actor(
                td_error,
                np.reshape(old_observation, [1, env.observation_space.shape[0]]),
                old_action)
            old_observation = observation
            old_action = agent.get_action(
                np.reshape(old_observation, [1, env.observation_space.shape[0]]))
            ep_r += reward
            if done:
                print("{} {}".format(i_episode, ep_r))
                if ep_r > -50:
                    RENDER = True
                break
def main():
    env = gym.make('CartPole-v1')
    model = ActorCritic(LEARNING_RATE, GAMMA)
    score = 0.0
    print_interval = 20
    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            for i in range(n_rollout):
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()
                s_prime, r, done, info = env.step(a)
                model.put_data((s, a, r, s_prime, done))
                s = s_prime
                score += r
                if done:
                    break
            model.train_net()
        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score: {}".format(
                n_epi, score / print_interval))
            score = 0.0
    env.close()
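# The CartPole loop above assumes an ActorCritic module exposing pi(),
# put_data(), and train_net(). Below is a minimal sketch of one plausible
# implementation; the hidden size, the buffering scheme, and the TD
# actor-critic update are assumptions, not the original author's code.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class ActorCritic(nn.Module):
    def __init__(self, learning_rate, gamma):
        super().__init__()
        self.gamma = gamma
        self.data = []  # buffer of (s, a, r, s', done) transitions
        self.fc1 = nn.Linear(4, 256)    # CartPole-v1 has 4 state dims
        self.fc_pi = nn.Linear(256, 2)  # and 2 discrete actions
        self.fc_v = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim=0):
        return F.softmax(self.fc_pi(F.relu(self.fc1(x))), dim=softmax_dim)

    def v(self, x):
        return self.fc_v(F.relu(self.fc1(x)))

    def put_data(self, transition):
        self.data.append(transition)

    def train_net(self):
        # Build tensors from the buffered n-step rollout, then apply a
        # one-step TD actor-critic update.
        s, a, r, s_prime, done = zip(*self.data)
        self.data = []
        s = torch.tensor(np.array(s), dtype=torch.float)
        a = torch.tensor(a).unsqueeze(1)
        r = torch.tensor(r, dtype=torch.float).unsqueeze(1)
        s_prime = torch.tensor(np.array(s_prime), dtype=torch.float)
        done_mask = torch.tensor([0.0 if d else 1.0 for d in done]).unsqueeze(1)
        td_target = r + self.gamma * self.v(s_prime) * done_mask
        delta = td_target - self.v(s)
        pi_a = self.pi(s, softmax_dim=1).gather(1, a)
        loss = -torch.log(pi_a) * delta.detach() + \
            F.smooth_l1_loss(self.v(s), td_target.detach())
        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()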
def test(num=500):
    agent = ActorCritic(env.observation_space.shape[0], env.action_space.n)
    agent.load_model()
    steps = []
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = agent.get_action(
            np.reshape(old_observation, [1, env.observation_space.shape[0]]))
        done = False
        step = 0
        while not done:
            step += 1
            # env.render()
            observation, reward, done, info = env.step(old_action)
            if done:
                reward = -20
            old_observation = observation
            old_action = agent.get_action(
                np.reshape(old_observation, [1, env.observation_space.shape[0]]))
            if done:
                steps.append(step)
                print("{}:{} steps".format(i_episode, step))
                break
        # If the average steps over the last 200 consecutive games reach the
        # standard, we consider the method to have passed the game.
        if len(steps) > 200 and sum(steps[-200:]) / 200 >= 195:
            print(sum(steps[-200:]) / 200)
            break
def agent(net_params_queue, exp_queues, config, id):
    torch.set_num_threads(1)
    env = GymEnv(env_id=id, config=config)
    net = ActorCritic(False, config)
    send_rate_list = config['sending_rate']
    default_bwe_idx = config['default_bwe']
    # experience RTC if not forced to stop
    while True:
        env.reset()
        action = default_bwe_idx
        bwe = send_rate_list[action]
        s_batch = []
        a_batch = []
        r_batch = []
        entropy_batch = []
        done = False
        actor_network_params = net_params_queue.get()
        for target_param, source_param in zip(net.ActorNetwork.parameters(),
                                              actor_network_params):
            target_param.data.copy_(source_param.data)
        # todo: Agent interacts with gym
        while not done:
            # todo: the shape of state needs to be regulated
            state, reward, done, _ = env.step(bwe)
            r_batch.append(reward)
            action, entropy = net.predict(state)
            bwe = send_rate_list[action]
            s_batch.append(state)
            a_batch.append(action)
            entropy_batch.append(entropy)
        # Ignore the first bwe and state since we don't have the ability
        # to control them.
        exp_queues.put([s_batch[1:], a_batch[1:], r_batch[1:], done,
                        entropy_batch[1:]])
def Evaluation(Eseed, lseed):
    seed = Eseed + 10  # change the seed per trial (offset it so it differs from training)
    ######### parameters #########
    env_name = "Pendulum-v0"
    save_interval = 10
    lr = 3 * pow(10, -4)
    gamma = 0.99  # discount rate
    batch_size = 256
    max_timesteps = 200
    max_episodes = 500  # maximum number of episodes
    num_step = max_timesteps * max_episodes
    save_step = save_interval * max_timesteps
    directory = "./preTrained/actorcritic/{}".format(env_name)  # save trained models
    filename = "ActorCritic_{}_{}".format(env_name, lseed)
    #############################
    env = gym.make(env_name)
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    min_action = float(env.action_space.low[0])
    policy = ActorCritic(lr, state_dim, action_dim, max_action, min_action,
                         batch_size, gamma)
    temp = np.zeros(1)
    sumR_ave = 0.0
    # +1 so the final checkpoint is also loaded
    for st in range(save_step, num_step + 1, save_step):
        policy.load_models(directory, filename, st)
        sumR = 0.0
        for e in range(10):  # average over 10 episodes
            state = env.reset()
            action = policy.select_action(state)
            for s in range(200):  # not done
                # env.render()
                action = policy.select_action(state)
                next_state, reward, done, _ = env.step(action)
                sumR += reward
                state = next_state
        sumR_ave = sumR / 10
        if st == save_step:
            temp = sumR_ave
        else:
            temp = np.vstack((temp, sumR_ave))
    env.close()
    return temp
def main(method='DQN', isTrain=True):
    global FPSCLOCK, DISPLAYSURF, GEMIMAGES, GAMESOUNDS, BASICFONT, \
        BOARDRECTS, RL
    # import the reinforcement learning agent
    if method == "ActorCritic":
        from ActorCritic import ActorCritic
        RL = ActorCritic(getBlankBoard(), [BOARDSLOTS, 2])
    elif method == "DQN":
        from DQN import DQN
        observation = getFeatureVec(getBlankBoard())
        RL = DQN(observation, [BOARDSLOTS, 2])
    # Initial set up.
    pygame.init()
    FPSCLOCK = pygame.time.Clock()
    DISPLAYSURF = pygame.display.set_mode((WINDOWWIDTH, WINDOWHEIGHT))
    pygame.display.set_caption('Gemgem')
    BASICFONT = pygame.font.Font('freesansbold.ttf', 36)
    # Load the images.
    GEMIMAGES = []
    for i in range(1, NUMGEMIMAGES + 1):
        gemImage = pygame.image.load('gem%s.png' % i)
        if gemImage.get_size() != (GEMIMAGESIZE, GEMIMAGESIZE):
            gemImage = pygame.transform.smoothscale(
                gemImage, (GEMIMAGESIZE, GEMIMAGESIZE))
        GEMIMAGES.append(gemImage)
    # Load the sounds.
    GAMESOUNDS = {}
    GAMESOUNDS['bad swap'] = pygame.mixer.Sound('badswap.wav')
    GAMESOUNDS['match'] = []
    for i in range(NUMMATCHSOUNDS):
        GAMESOUNDS['match'].append(pygame.mixer.Sound('match%s.wav' % i))
    # Create pygame.Rect objects for each board space to do
    # board-coordinate-to-pixel-coordinate conversions.
    BOARDRECTS = []
    for x in range(BOARDWIDTH):
        BOARDRECTS.append([])
        for y in range(BOARDHEIGHT):
            r = pygame.Rect((XMARGIN + (x * GEMIMAGESIZE),
                             YMARGIN + (y * GEMIMAGESIZE),
                             GEMIMAGESIZE, GEMIMAGESIZE))
            BOARDRECTS[x].append(r)
    runGame(300000000, isTrain)
def trainAC(env, state_size, action_size, lr, n_agents, dim_act, dim_actprob,
            batch_size, setting):
    # ifload = setting["ifload"]  # False
    n_iters = setting["iter"]
    AC = ActorCritic(n_agents, state_size, dim_act, dim_actprob, batch_size,
                     device, setting)
    step_n = 10
    for iter in range(n_iters):
        state = env.reset()
        state = np.stack(state)
        done = False
        for i in range(step_n):
            state = torch.FloatTensor(state).to(device)
            dist, actions, log_probs, act_prob = AC.select_action(state)
            acts = [act.detach() for act in actions]
            obs_n, reward_n, _, _ = env.step(acts)
            if i == step_n - 1:
                done = True
            next_state = obs_n
            AC.storeSample(state, log_probs, reward_n, 1 - done, acts)
            state = next_state
            if done:
                if iter % 20 == 0:
                    print('Iteration: {}, Score: {}'.format(
                        iter, np.sum(np.array(AC.rewards))))
                break
        next_state, thact = transTensor(next_state, acts, n_agents)
        for ag in range(AC.n_agents):
            next_value = AC.critics[ag](next_state, thact)
            AC.next_value.append(next_value)
        actloss = AC.update()
        if iter % 20 == 0:
            print("action_loss ", actloss)
        ifsave = setting["ifsave"]  # True
        if iter % 250 == 0 and ifsave:
            torch.save(AC.actor,
                       "model_rule/actor_" + setting["actor_name"] + ".pkl")
            for j in range(AC.n_agents):
                torch.save(AC.critics[j],
                           "model_rule/critic_" + setting["critic_name"] +
                           str(j) + '.pkl')
    return AC
def trainIters(n_iters):
    AC = ActorCritic(2, state_size, 1, 2, 32, device)
    step_n = 50
    for iter in range(n_iters):
        state = env.reset()
        state = np.stack(state)
        done = False
        for i in range(step_n):
            state = torch.FloatTensor(state).to(device)
            dist, action, log_prob, act_prob = AC.select_action(state)
            acts = [action.detach() for ag in range(2)]
            obs_n, reward_n, _, _ = env.step(acts)
            if i == step_n - 1:
                done = True
            next_state = obs_n
            AC.storeSample(state, log_prob, reward_n, 1 - done, acts)
            state = next_state
            if done:
                if iter % 20 == 0:
                    print('Iteration: {}, Score: {}'.format(
                        iter, np.sum(np.array(AC.rewards))))
                break
        next_state, thact = transTensor(next_state, acts)
        for ag in range(AC.n_agents):
            next_value = AC.critics[ag](next_state, thact)
            AC.next_value.append(next_value)
        actloss = AC.update()
        if iter % 20 == 0:
            print("action_loss ", actloss)
        ifsave = False
        if iter % 250 == 0 and ifsave:
            torch.save(AC.actor, 'model/actor_v2.pkl')
            for j in range(AC.n_agents):
                torch.save(AC.critics[j], 'model/criticv2' + str(j) + '.pkl')
    return AC
    'action_shape': action_dim,
    'action_scale': action_max,
    'tau': 1e-3
}
actor_dict = {
    'layer_sizes': [480, 360],
    'activation': 'selu',
    'pool_size': 2,
    'dropout_rate': 0.3,
    'use_bn': False,
    'use_do': True
}
# Create actor and critic objects based on environment information.
ActorObj = ActorCritic(actor_type=True, **ac_dict, lr=1e-3, **actor_dict)
CriticObj = ActorCritic(actor_type=False, **ac_dict, lr=1e-3)
# Make experience buffer and noise.
buffer_size = int(5e4)
BufferObj = IndividualBuffers(buffer_size, state_dim, action_dim)
NoiseObj = makeOUNoise(noise_type='none', mu=np.zeros(action_dim),
                       sigma=np.full(action_dim, 0.2))
# Training arguments.
arg_dict = {
    'ActorObj': ActorObj,
    'CriticObj': CriticObj,
    'buffer': BufferObj,
    'noise': NoiseObj,
def main():
    env = gym.make('Pendulum-v0')
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]
    agent = ActorCritic(state_dim, action_dim)
    state = env.reset()
    timestep_limit = min(250, env.spec.timestep_limit)
    print("timestep limit set to : ", timestep_limit)
    # timestep_limit = env.spec.timestep_limit
    # For checking purposes; make it proper for the real run.
    # Initial data build-up.
    done_flag = 0
    for i in range(REPLAY_MEMORY):
        if done_flag == True:
            state = env.reset()
        action = env.action_space.sample()
        next_state, reward, done_flag, info = env.step(action)
        agent.append_memory(state, action, reward, next_state, done_flag)
        state = next_state
    print("Initial memory built!!")
    # Initial training for a few steps.
    for _ in range(5):
        agent.update_networks()
        agent.update_target_networks()
    print("Initial network performance = ", policy_evaluation(agent, env, 2))
    # =========================================================================
    print("******** Starting learning process *************")
    num_episodes = 5
    update_freq = 1  # update after how many steps (within each episode)
    print_freq = 1   # how often to print (episodes)
    performance = np.zeros(num_episodes)
    best_ep = 0
    best_agent = copy.deepcopy(agent)
    start_time = t.time()
    for ep in range(num_episodes):
        done_flag = 0
        state = env.reset()
        time = 0
        while done_flag != True and time <= timestep_limit:
            actor_out = agent.learner.actor.predict(state.reshape(1, -1))[0]
            action = actor_out  # need to add exploration here
            next_state, reward, done_flag, _ = env.step(action)
            agent.append_memory(state, action, reward, next_state, done_flag)
            state = next_state
            if time % update_freq == 0:
                agent.update_networks(epochs=5)
                # agent.update_target_networks()
                # --> Ideally I should update here, but it's way too slow.
            time += 1
        performance[ep] = policy_evaluation(agent, env, 5)
        # Update the target networks (I'll use a larger tau here).
        agent.update_target_networks(tau=0.01)
        if ep % print_freq == 0:
            print("Now in episode: ", ep + 1, " of ", num_episodes)
            print("Agent performance = ", performance[ep])
        if performance[ep] > performance[best_ep]:
            best_agent = copy.deepcopy(agent)
            best_ep = ep
    end_time = t.time()
    print("Total time", (end_time - start_time))
    plt.plot(performance[-100:])
    plt.show()
}
actor_dict = {
    'layer_sizes': [480, 360],
    'activation': activation_name,
    'pool_size': 2,
    'dropout_rate': 0.5,
    'use_bn': False,
    'use_do': True,
    'DR': DR,
    'total_tasks': len(env_names),
    'cpack': False
}
# Create actor and critic objects based on environment information.
ActorObj = ActorCritic(actor_type=True, **ac_dict, lr=1e-3, **actor_dict)
# The critic object has a simpler version when PackNet is not used on it,
# which is only needed for multi-task PackNet.
if not CPACK:
    CriticObj = ActorCritic(actor_type=False, **ac_dict, lr=1e-3,
                            cpack=CPACK)
else:
    critic_dict = {
        'layer_sizes': [480, 360],
        'activation': activation_name,
        'pool_size': 2,
        'dropout_rate': 0.5,
        'use_bn': False,
def single_agent():
    config = load_config()
    # num_agents = config['num_agents']
    torch.set_num_threads(1)
    env = GymEnv(config=config)
    env.reset()
    net = ActorCritic(True, config)
    net.ActorNetwork.init_params()
    net.CriticNetwork.init_params()
    bwe = config['sending_rate'][config['default_bwe']]
    i = 1
    s_batch = []
    r_batch = []
    a_batch = []
    # experience RTC if not forced to stop
    ax = []
    ay = []
    plt.ion()
    while True:
        # todo: Agent interacts with gym
        state, reward, done, _ = env.step(bwe)
        r_batch.append(reward)
        action = net.predict(state)
        bwe = config['sending_rate'][action]
        a_batch.append(action)
        s_batch.append(state)
        # todo: needs to be fixed
        if done:
            action = config['default_bwe']
            bwe = config['sending_rate'][action]
            # update network
            net.getNetworkGradient(s_batch, a_batch, r_batch, done)
            net.updateNetwork()
            print('Network update.')
            i += 1
            ax.append(i)
            # ay.append(entropy)
            ay.append(reward)
            plt.clf()
            plt.plot(ax, ay)
            plt.pause(0.1)
            # s_batch.append(np.zeros(config['state_dim'], config['state_length']))
            # a_batch.append(action)
            env.reset()
            print('Environment has been reset.')
            print('Epoch {}, Reward: {}'.format(i - 1, reward))
            if i % 100 == 0:
                # print('Current BWE: ' + str(bwe))
                torch.save(net.ActorNetwork.state_dict(),
                           config['model_dir'] + '/actor1_{}.pt'.format(str(i)))
                torch.save(net.CriticNetwork.state_dict(),
                           config['model_dir'] + '/critic13m_{}.pt'.format(str(i)))
                print('Model saved.')
class ICMPPO:
    def __init__(self, writer, state_dim=172, action_dim=5, n_latent_var=512,
                 lr=3e-4, betas=(0.9, 0.999), gamma=0.99, ppo_epochs=3,
                 icm_epochs=1, eps_clip=0.2, ppo_batch_size=128,
                 icm_batch_size=16, intr_reward_strength=0.02, lamb=0.95,
                 device='cpu'):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.lambd = lamb
        self.eps_clip = eps_clip
        self.ppo_epochs = ppo_epochs
        self.icm_epochs = icm_epochs
        self.ppo_batch_size = ppo_batch_size
        self.icm_batch_size = icm_batch_size
        self.intr_reward_strength = intr_reward_strength
        self.device = device
        self.writer = writer
        self.timestep = 0
        self.icm = ICM(activation=Swish()).to(self.device)
        self.policy = ActorCritic(state_dim=state_dim,
                                  action_dim=action_dim,
                                  n_latent_var=n_latent_var,
                                  activation=Swish(),
                                  device=self.device).to(self.device)
        self.policy_old = ActorCritic(state_dim, action_dim, n_latent_var,
                                      activation=Swish(),
                                      device=self.device).to(self.device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=lr, betas=betas)
        self.optimizer_icm = torch.optim.Adam(self.icm.parameters(),
                                              lr=lr, betas=betas)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.MseLoss = nn.MSELoss(reduction='none')

    def update(self, memory, timestep):
        # Convert lists from memory to tensors.
        self.timestep = timestep
        old_states = torch.stack(memory.states).to(self.device).detach()
        old_states = torch.transpose(old_states, 0, 1)
        old_actions = torch.stack(memory.actions).T.to(self.device).detach()
        old_logprobs = torch.stack(memory.logprobs).T.to(self.device).detach()
        # Find s, n_s, a, done, reward:
        curr_states = old_states[:, :-1, :]
        next_states = old_states[:, 1:, :]
        actions = old_actions[:, :-1].long()
        rewards = torch.tensor(memory.rewards[:-1]).T.to(self.device).detach()
        mask = (~torch.tensor(memory.is_terminals).T.to(self.device)
                .detach()[:, :-1]).type(torch.long)
        with torch.no_grad():
            intr_reward, _, _ = self.icm(actions, curr_states, next_states,
                                         mask)
        intr_rewards = torch.clamp(self.intr_reward_strength * intr_reward,
                                   0, 1)
        self.writer.add_scalar('Mean_intr_reward_per_1000_steps',
                               intr_rewards.mean() * 1000,
                               self.timestep)
        # Find the cumulative advantage.
        with torch.no_grad():
            state_values = torch.squeeze(self.policy.value_layer(curr_states))
            next_state_values = torch.squeeze(
                self.policy.value_layer(next_states))
            td_target = (rewards + intr_rewards) / 2 + \
                self.gamma * next_state_values * mask
            delta = td_target - state_values
        self.writer.add_scalar('maxValue', state_values.max(), timestep)
        self.writer.add_scalar('meanValue', state_values.mean(), self.timestep)
        advantage = torch.zeros(1, 16).to(self.device)
        advantage_lst = []
        for i in range(delta.size(1) - 1, -1, -1):
            delta_t, mask_t = delta[:, i], mask[:, i]
            advantage = delta_t + (self.gamma * self.lambd * advantage) * mask_t
            advantage_lst.insert(0, advantage)
        advantage_lst = torch.cat(advantage_lst, dim=0).T
        # Get the local advantage to train the value function.
        local_advantages = state_values + advantage_lst
        # Normalize the advantage.
        advantages = (advantage_lst - advantage_lst.mean()) / \
            (advantage_lst.std() + 1e-10)
        # Optimize the policy for ppo_epochs epochs:
        epoch_surr_loss = 0
        for _ in range(self.ppo_epochs):
            indexes = np.random.permutation(actions.size(1))
            # Train PPO and ICM.
            for i in range(0, len(indexes), self.ppo_batch_size):
                batch_ind = indexes[i:i + self.ppo_batch_size]
                batch_curr_states = curr_states[:, batch_ind, :]
                batch_actions = actions[:, batch_ind]
                batch_mask = mask[:, batch_ind]
                batch_advantages = advantages[:, batch_ind]
                batch_local_advantages = local_advantages[:, batch_ind]
                batch_old_logprobs = old_logprobs[:, batch_ind]
                # Find action logprobs and state values.
                batch_logprobs, batch_state_values, batch_dist_entropy = \
                    self.policy.evaluate(batch_curr_states, batch_actions)
                # Find the ratio (pi_theta / pi_theta__old):
                ratios = torch.exp(batch_logprobs -
                                   batch_old_logprobs.detach())
                # Apply linear decay, scaled by 16 since the agent batch
                # is 16 long.
                decay_epsilon = linear_decay_eps(self.timestep * 16)
                decay_beta = linear_decay_beta(self.timestep * 16)
                # Find the surrogate loss:
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - decay_epsilon,
                                    1 + decay_epsilon) * batch_advantages
                loss = -torch.min(surr1, surr2) * batch_mask + \
                    0.5 * nn.MSELoss(reduction='none')(
                        batch_state_values,
                        batch_local_advantages.detach()) * batch_mask - \
                    decay_beta * batch_dist_entropy * batch_mask
                loss = loss.mean()
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                linear_decay_lr(self.optimizer, self.timestep * 16)
                epoch_surr_loss += loss.item()
        self._icm_update(self.icm_epochs, self.icm_batch_size, curr_states,
                         next_states, actions, mask)
        self.writer.add_scalar('Lr', self.optimizer.param_groups[0]['lr'],
                               self.timestep)
        self.writer.add_scalar(
            'Surrogate_loss',
            epoch_surr_loss / (self.ppo_epochs *
                               (len(indexes) // self.ppo_batch_size + 1)),
            self.timestep)
        # Copy new weights into the old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())

    def _icm_update(self, epochs, batch_size, curr_states, next_states,
                    actions, mask):
        epoch_forw_loss = 0
        epoch_inv_loss = 0
        for _ in range(epochs):
            indexes = np.random.permutation(actions.size(1))
            for i in range(0, len(indexes), batch_size):
                batch_ind = indexes[i:i + batch_size]
                batch_curr_states = curr_states[:, batch_ind, :]
                batch_next_states = next_states[:, batch_ind, :]
                batch_actions = actions[:, batch_ind]
                batch_mask = mask[:, batch_ind]
                _, inv_loss, forw_loss = self.icm(batch_actions,
                                                  batch_curr_states,
                                                  batch_next_states,
                                                  batch_mask)
                epoch_forw_loss += forw_loss.item()
                epoch_inv_loss += inv_loss.item()
                unclip_intr_loss = 10 * (0.2 * forw_loss + 0.8 * inv_loss)
                # Take a gradient step.
                self.optimizer_icm.zero_grad()
                unclip_intr_loss.backward()
                self.optimizer_icm.step()
                linear_decay_lr(self.optimizer_icm, self.timestep * 16)
        self.writer.add_scalar(
            'Forward_loss',
            epoch_forw_loss / (epochs * (len(indexes) // batch_size + 1)),
            self.timestep)
        self.writer.add_scalar(
            'Inv_loss',
            epoch_inv_loss / (epochs * (len(indexes) // batch_size + 1)),
            self.timestep)
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)
    env = GeneralEnvironment('policy.mdl')
    model = ActorCritic()
    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    model.train()
    state = env.reset()
    state = torch.Tensor(state)
    model.init_hidden(env.map_height, env.map_width)
    done = True
    episode_length = 0
    while True:
        # Sync with the shared model.
        model.load_state_dict(shared_model.state_dict())
        values = []
        log_probs = []
        rewards = []
        entropies = []
        off_targets = []
        for step in range(args.num_steps):
            episode_length += 1
            value, logit = model(Variable(state.unsqueeze(0)))
            prob = F.softmax(logit)
            old_prob = prob
            # Zero out the probability of all tiles not owned by the user.
            army_map = state[0, ...]
            label_map = (army_map > 0)
            label_map = label_map.view(1, env.map_height, env.map_width)
            label_map = label_map.expand(8, env.map_height, env.map_width)
            label_map = label_map.contiguous()
            label_map = label_map.view(-1)
            # prob[~label_map] = 0
            prob = old_prob * Variable(label_map.float())
            # Penalize the model for predicting off-target tiles.
            off_prob = old_prob * Variable((~label_map).float())
            off_targets.append(off_prob.sum(1))
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)
            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))
            state, reward, done, _ = env.step(action.numpy().flat[0])
            done = done or episode_length >= args.max_episode_length
            if done:
                episode_length = 0
                state = env.reset()
                model.init_hidden(env.map_height, env.map_width)
            state = torch.Tensor(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            if done:
                break
        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(Variable(state.unsqueeze(0)))
            R = value.data
        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1].data - \
                values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - \
                args.entropy_coef * entropies[i] + \
                args.off_tile_coef * off_targets[i]
        optimizer.zero_grad()
        loss = policy_loss + args.value_loss_coef * value_loss
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
        model.reset_hidden()
        gc.collect()
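# train() above calls ensure_shared_grads() without defining it. Below is a
# sketch of the usual pytorch-a3c helper for context; this project's own
# version may differ.
def ensure_shared_grads(model, shared_model):
    # Hand the local worker's gradients to the shared model; once the shared
    # grad buffers exist they are reused, so later calls return early.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad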
def main():
    env = PendulumEnv()
    num_episodes = 5
    num_of_time_steps = 200

    # 1
    policy1 = ActorCritic(env, alpha_value=0.0001, alpha_policy=0.01,
                          gamma=0.99, sigma=1.5)
    reward_plot1 = train(env, policy1, num_episodes, num_of_time_steps)
    plt.figure()
    plt.plot(reward_plot1)
    plt.xlabel("Every 10th episode")
    plt.ylabel("Sum of rewards in the episode")
    plt.title("1 > Policy Step Size > Value Step Size")
    # plt.savefig('1-3_1.png')
    plt.show()

    # 2
    policy2 = ActorCritic(env, alpha_value=0.1, alpha_policy=0.01,
                          gamma=0.99, sigma=1.5)
    reward_plot2 = train(env, policy2, num_episodes, num_of_time_steps)
    plt.figure()
    plt.plot(reward_plot2)
    plt.xlabel("Every 10th episode")
    plt.ylabel("Sum of rewards in the episode")
    plt.title("1 > Policy Step Size < Value Step Size")
    # plt.savefig('1-3_2.png')
    plt.show()

    # 3
    policy3 = ActorCritic(env, alpha_value=0.001, alpha_policy=0.001,
                          gamma=0.99, sigma=1.5)
    reward_plot3 = train(env, policy3, num_episodes, num_of_time_steps)
    plt.figure()
    plt.plot(reward_plot3)
    plt.xlabel("Every 10th episode")
    plt.ylabel("Sum of rewards in the episode")
    plt.title("1 > Policy Step Size = Value Step Size")
    # plt.savefig('1-3_3.png')
    plt.show()

    # 4
    policy4 = ActorCritic(env, alpha_value=1, alpha_policy=0.1,
                          gamma=0.99, sigma=1.5)
    reward_plot4 = train(env, policy4, num_episodes, num_of_time_steps)
    plt.figure()
    plt.plot(reward_plot4)
    plt.xlabel("Every 10th episode")
    plt.ylabel("Sum of rewards in the episode")
    plt.title("Policy Step Size > Value Step Size > 1")
    plt.show()
    # plt.savefig('1-3_4.png')

    # 5
    plt.figure()
    plt.plot(reward_plot1, label="1 > alpha_Policy > alpha_Value")
    plt.plot(reward_plot2, label="1 > alpha_Policy < alpha_Value")
    plt.plot(reward_plot3, label="1 > alpha_Policy = alpha_Value")
    plt.plot(reward_plot4, label="alpha_Policy > alpha_Value > 1")
    plt.title("Performance for different values of policy and value step size")
    plt.xlabel("Every 10th episode")
    plt.ylabel("Sum of rewards in the episode")
    plt.legend()
    # plt.savefig("all_plots.png")
    plt.show()
parser.add_argument('--off-tile-coef', type=float, default=10,
                    help='weight to penalize bad movement')
parser.add_argument('--checkpoint-interval', type=float, default=None,
                    help='interval to save model')

if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'
    args = parser.parse_args()
    env = GeneralEnvironment('2_epoch.mdl')
    shared_model = ActorCritic()
    shared_model.share_memory()
    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()
    processes = []
    p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
    p.start()
    processes.append(p)
class PPO:
    def __init__(self, state_dim, action_dim, n_agents, lr, betas, gamma,
                 K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.policy = ActorCritic(state_dim, action_dim, n_agents).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim,
                                      n_agents).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        # Monte Carlo estimate of state rewards (one running return per agent):
        rewards = []
        discounted_reward = [0, 0]
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if all(is_terminal):
                discounted_reward = [0, 0]
            elif is_terminal[0]:
                discounted_reward[0] = 0
            elif is_terminal[1]:
                discounted_reward[1] = 0
            discounted_reward[0] = reward[0] + self.gamma * discounted_reward[0]
            discounted_reward[1] = reward[1] + self.gamma * discounted_reward[1]
            # Insert a copy; inserting the list itself would alias every
            # entry to the same object.
            rewards.insert(0, list(discounted_reward))
        # Normalize the rewards:
        rewards = torch.tensor(rewards).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
        # Convert lists to tensors.
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()
        # Optimize the policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluate old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(
                old_states, old_actions)
            # Find the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs.detach())
            # Find the surrogate loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                1 + self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + \
                0.5 * self.MseLoss(state_values, rewards) - \
                0.01 * dist_entropy
            # Take a gradient step.
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
        # Copy new weights into the old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
        return loss.mean()
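# PPO.update() above reads memory.states/actions/logprobs/rewards/is_terminals,
# with a two-agent reward and terminal pair recorded per step. A minimal
# container matching that shape (field names taken from the snippet,
# everything else assumed) could look like this:
class Memory:
    def __init__(self):
        self.states = []        # one state tensor per step
        self.actions = []       # one action tensor per step
        self.logprobs = []      # log-probs recorded by policy_old
        self.rewards = []       # [r_agent0, r_agent1] per step
        self.is_terminals = []  # [done_agent0, done_agent1] per step

    def clear(self):
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]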
class PPOAgent:
    def __init__(self):
        self.train_history = dict()
        if os.path.isfile("./checkpoints/model.pt"):
            self.model = torch.load("./checkpoints/model.pt")
            self.train_history['frames_trained'] = torch.load(
                "./checkpoints/frames_trained.pt")
            self.train_history['average_entropy'] = torch.load(
                "./checkpoints/average_entropy.pt")
            self.train_history['average_values'] = torch.load(
                "./checkpoints/average_values.pt")
            print("Model loaded from last checkpoint.")
        else:
            self.model = ActorCritic(num_actions=len(ACTIONS))
            self.train_history['frames_trained'] = torch.tensor(0)
            self.train_history['average_entropy'] = torch.tensor(
                [0], dtype=torch.float)
            self.train_history['average_values'] = torch.tensor(
                [0], dtype=torch.float)
            print("New model created.")
        self.model.to("cuda")
        self.worker = RolloutWorker(self, ENV_ID, NUM_WORKERS, T)

    def select_act(self, states, train_mode=True):
        states = torch.tensor(states).to("cuda")
        prob_dists, values = self.model(states)
        if train_mode:
            actions = prob_dists.sample()
        else:
            actions = torch.argmax(prob_dists.probs, dim=1)
        action_log_probs = prob_dists.log_prob(actions)
        values = values.data.cpu().numpy()
        actions = actions.data.cpu().numpy()
        action_log_probs = action_log_probs.data.cpu().numpy()
        return values, actions, action_log_probs

    def train_step(self):
        states, actions, old_action_log_probs, returns, advantages = \
            self.worker.rollout()
        states = torch.tensor(states).to("cuda")
        actions = torch.tensor(actions).to("cuda")
        old_action_log_probs = torch.tensor(old_action_log_probs).to("cuda")
        returns = torch.tensor(returns).to("cuda")
        advantages = torch.tensor(advantages).to("cuda")
        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=LEARNING_RATE, eps=1e-5)
        loss_surr = self._surrogate_loss(states, actions,
                                         old_action_log_probs, advantages)
        loss_surr_before = loss_surr.data.cpu().numpy()
        loss_value = self._value_loss(states, returns)
        loss_value_before = loss_value.data.cpu().numpy()
        loss_ent = self._entropy_loss(states)
        loss_ent_before = loss_ent.data.cpu().numpy()
        for epoch in range(NUM_EPOCHS):
            dataset_size = states.shape[0]
            batch_size = dataset_size // NUM_MINIBATCHES
            random_indices = torch.randperm(dataset_size,
                                            device="cpu").to("cuda")
            for n in range(NUM_MINIBATCHES):
                batch_indices = random_indices[n * batch_size:
                                               n * batch_size + batch_size]
                states_batch = states[batch_indices]
                old_action_log_probs_batch = old_action_log_probs[batch_indices]
                actions_batch = actions[batch_indices]
                advantages_batch = advantages[batch_indices]
                returns_batch = returns[batch_indices]
                advantages_batch = (advantages_batch -
                                    advantages_batch.mean()) / \
                    (advantages_batch.std() + 1e-6)
                loss_surr_batch = self._surrogate_loss(
                    states_batch, actions_batch,
                    old_action_log_probs_batch, advantages_batch)
                loss_value_batch = self._value_loss(states_batch,
                                                    returns_batch)
                loss_ent_batch = self._entropy_loss(states_batch)
                loss_batch = loss_surr_batch + C1 * loss_value_batch + \
                    C2 * loss_ent_batch
                optimizer.zero_grad()
                loss_batch.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               MAX_GRAD_NORM)
                optimizer.step()
        loss_surr = self._surrogate_loss(states, actions,
                                         old_action_log_probs, advantages)
        loss_surr_after = loss_surr.data.cpu().numpy()
        loss_value = self._value_loss(states, returns)
        loss_value_after = loss_value.data.cpu().numpy()
        loss_ent = self._entropy_loss(states)
        loss_ent_after = loss_ent.data.cpu().numpy()
        self.train_history['frames_trained'] += 4 * states.shape[0]
        self.train_history['average_entropy'] = torch.cat(
            (self.train_history['average_entropy'],
             torch.tensor([float(loss_ent_after)])))
        self.train_history['average_values'] = torch.cat(
            (self.train_history['average_values'],
             torch.tensor([returns.mean()])))
        print("Frames trained: ",
              self.train_history['frames_trained'].cpu().numpy())
        print("Loss before: {: .6f} {:.6f} {:.6f}".format(
            loss_surr_before, loss_value_before, loss_ent_before))
        print("Loss after : {: .6f} {:.6f} {:.6f}".format(
            loss_surr_after, loss_value_after, loss_ent_after))
        torch.save(self.model, "./checkpoints/model.pt")
        torch.save(self.train_history['frames_trained'],
                   "./checkpoints/frames_trained.pt")
        torch.save(self.train_history['average_entropy'],
                   "./checkpoints/average_entropy.pt")
        torch.save(self.train_history['average_values'],
                   "./checkpoints/average_values.pt")

    def test_step(self):
        self.worker.rollout(train_mode=False)

    def _surrogate_loss(self, states, actions, old_action_log_probs,
                        advantages):
        # Normalize advantages (reassigned rather than mutated in place, so
        # the caller's tensor is left untouched).
        advantages = (advantages - advantages.mean()) / \
            (advantages.std() + 1e-8)
        pd, _ = self.model(states)
        action_log_probs = pd.log_prob(actions)
        r = torch.exp(action_log_probs - old_action_log_probs)
        r_clip = torch.clamp(r, 1 - EPSILON, 1 + EPSILON)
        surr1 = r * advantages
        surr2 = r_clip * advantages
        loss_policy_batch = -torch.min(surr1, surr2)
        loss_policy = torch.mean(loss_policy_batch, dim=0)
        return loss_policy

    def _entropy_loss(self, states):
        pd, _ = self.model(states)
        loss_entropy_batch = -pd.entropy()
        loss_entropy = torch.mean(loss_entropy_batch)
        return loss_entropy

    def _value_loss(self, states, returns):
        _, values = self.model(states)
        loss_value_batch = (returns - values) ** 2
        loss_value = 0.5 * torch.mean(loss_value_batch, dim=0)
        return loss_value
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)
    env = GeneralEnvironment('2_epoch.mdl')
    model = ActorCritic()
    model.eval()
    state = env.reset()
    model.init_hidden(env.map_height, env.map_width)
    state = torch.Tensor(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    checkpoint_interval = 1
    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model.
        if done:
            model.load_state_dict(shared_model.state_dict())
        value, logit = model(Variable(state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)
        # Zero out the probability of all tiles not owned by the user.
        army_map = state[0, ...]
        label_map = (army_map > 0)
        label_map = label_map.view(1, env.map_height, env.map_width)
        label_map = label_map.expand(8, env.map_height, env.map_width)
        label_map = label_map.contiguous()
        label_map = label_map.view(-1)
        prob = prob * Variable(label_map.float())
        action = prob.max(1, keepdim=True)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            model.init_hidden(env.map_height, env.map_width)
            time.sleep(60)
            checkpoint_interval += 1
            if checkpoint_interval % args.checkpoint_interval == 0:
                torch.save(model.cpu().state_dict(), 'reinforce_trained.mdl')
        state = torch.Tensor(state)
class MultiAgent():
    def __init__(self, state_size, action_size, num_agents, hidden_size=64,
                 lr=1e-4):
        self.model = ActorCritic(state_size, action_size,
                                 hidden_size=hidden_size)
        self.optimizer = Adam(self.model.parameters(), lr=lr)
        self.agents = [PPO_Agent() for _ in range(num_agents)]

    def save_checkpoint(self, filepath=None):
        if filepath is None:
            filepath = 'checkpoint.pth'
        torch.save(self.model.state_dict(), filepath)

    def load_checkpoint(self, filepath):
        self.model.load_state_dict(torch.load(filepath))

    def act(self, states):
        results = zip(*[agent.chooce_action(self.model, state)
                        for agent, state in zip(self.agents, states)])
        actions, log_probs, values = map(lambda x: np.array(x).squeeze(1),
                                         results)
        return actions, log_probs, values

    def step(self, states, actions, rewards, dones, log_probs, values,
             is_terminal=False):
        for i, agent in enumerate(self.agents):
            if is_terminal:
                agent.register_trajectories(states[i], None, None, None, None,
                                            values[i],
                                            is_terminal=is_terminal)
            else:
                agent.register_trajectories(states[i], actions[i], rewards[i],
                                            dones[i], log_probs[i], values[i])

    def process_trajectories(self, gamma=0.99, gae_tau=0.95):
        for agent in self.agents:
            agent.calculate_gae_returns(gamma=gamma, gae_tau=gae_tau)

    def maybe_learn(self, i_episode, update_every=4):
        if i_episode % update_every == 0:
            accumulated_trajectories = []
            for agent in self.agents:
                accumulated_trajectories += agent.processed_trajectories
            self.learn(accumulated_trajectories)

    def learn(self, accumulated_trajectories, batch_size=64,
              epsilon_clip=0.2, gradient_clip=10, beta=0.001,
              critic_discount=1., num_epochs=5):
        # Unroll and convert accumulated trajectories to tensors.
        states, actions, old_log_probs, returns, advantages = map(
            torch.FloatTensor, zip(*accumulated_trajectories))
        # Normalize advantages.
        advantages = (advantages - advantages.mean()) / \
            (advantages.std() + 1e-7)
        # Get random batches from the accumulated trajectories.
        batcher = DataLoader(Batcher(states, actions, old_log_probs, returns,
                                     advantages),
                             batch_size=batch_size, shuffle=True)
        self.model.train()
        for _ in range(num_epochs):
            for states, actions, old_log_probs, returns, advantages in batcher:
                # Get updated values from the policy.
                values, dist = self.model(states)
                new_log_probs = dist.log_prob(actions)
                entropy = dist.entropy()
                # Calculate the ratio and clip it, so learning doesn't move
                # the new policy far from the old one.
                ratio = (new_log_probs - old_log_probs).exp()
                clip = torch.clamp(ratio, 1 - epsilon_clip, 1 + epsilon_clip)
                clipped_surrogate = torch.min(ratio * advantages,
                                              clip * advantages)
                # Compute the losses.
                actor_loss = -torch.mean(clipped_surrogate) - \
                    beta * entropy.mean()
                critic_loss = torch.mean(torch.square(returns - values))
                losses = critic_loss * critic_discount + actor_loss
                # Take the optimizer step.
                self.optimizer.zero_grad()
                losses.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(),
                                         gradient_clip)
                self.optimizer.step()
        # Reset the collected trajectories.
        for agent in self.agents:
            agent.reset()
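# MultiAgent.learn() above wraps its tensors in a Batcher and feeds that to a
# DataLoader. A minimal Dataset that would satisfy this usage is sketched
# below; the project's real Batcher may differ.
from torch.utils.data import Dataset

class Batcher(Dataset):
    def __init__(self, *tensors):
        # All tensors are expected to share the same first (sample) dimension.
        self.tensors = tensors

    def __len__(self):
        return self.tensors[0].size(0)

    def __getitem__(self, idx):
        return tuple(t[idx] for t in self.tensors)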
def central_agent(net_params_queue, exp_queues, config):
    torch.set_num_threads(1)
    # log training info
    logging.basicConfig(filename=config['log_dir'] +
                        '/Central_agent_training.log',
                        filemode='w', level=logging.INFO)
    assert len(net_params_queue) == config['num_agents']
    assert len(exp_queues) == config['num_agents']
    net = ActorCritic(True, config)
    # Since the original Pensieve does not use the critic in workers, push
    # actor_net_params into net_params_queue only, and save the parameters of
    # both networks separately.
    if config['load_model']:
        actor_net_params = torch.load(config['model_dir'] +
                                      '/actor_300k1_80.pt')
        critic_net_params = torch.load(config['model_dir'] +
                                       '/critic_300k1_80.pt')
        net.ActorNetwork.load_state_dict(actor_net_params)
        net.CriticNetwork.load_state_dict(critic_net_params)
    else:
        net.ActorNetwork.init_params()
        net.CriticNetwork.init_params()
    # Push the initial actor parameters to every worker.
    actor_net_params = list(net.ActorNetwork.parameters())
    for i in range(config['num_agents']):
        net_params_queue[i].put(actor_net_params)
    epoch = 0
    total_reward = 0.0
    total_batch_len = 0.0
    episode_entropy = 0.0
    ax = []
    ay = []
    plt.ion()
    while True:
        start = time.time()
        actor_net_params = list(net.ActorNetwork.parameters())
        for i in range(config['num_agents']):
            net_params_queue[i].put(actor_net_params)
        for i in range(config['num_agents']):
            s_batch, a_batch, r_batch, done, e_batch = exp_queues[i].get()
            net.getNetworkGradient(s_batch, a_batch, r_batch, done)
            total_reward += np.sum(r_batch)
            total_batch_len += len(r_batch)
            episode_entropy += np.sum(e_batch)
        net.updateNetwork()
        epoch += 1
        avg_reward = total_reward / total_batch_len
        # avg_entropy = total_entropy / total_batch_len
        logging.info('Epoch ' + str(epoch) +
                     '\nAverage reward: ' + str(avg_reward) +
                     '\nEpisode entropy: ' + str(episode_entropy))
        ax.append(epoch)
        ay.append(episode_entropy)
        plt.clf()
        plt.plot(ax, ay)
        plt.pause(0.1)
        total_reward = 0.0
        total_batch_len = 0
        episode_entropy = 0.0
        if epoch % config['save_interval'] == 0:
            print('Train Epoch ' + str(epoch) + ', Model saved.')
            print('Epoch costs ' + str(time.time() - start) + ' seconds.')
            torch.save(net.ActorNetwork.state_dict(),
                       config['model_dir'] + '/actor_300k_' +
                       str(epoch) + '.pt')
            torch.save(net.CriticNetwork.state_dict(),
                       config['model_dir'] + '/critic_300k_' +
                       str(epoch) + '.pt')
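# central_agent() and the per-environment agent() worker earlier in this
# collection communicate through one parameter queue and one experience queue
# per worker. A sketch of the wiring is below; the queue sizes and the
# run_training entry point are assumptions, not code from the source project.
import torch.multiprocessing as mp

def run_training(config):
    net_params_queues = [mp.Queue(1) for _ in range(config['num_agents'])]
    exp_queues = [mp.Queue(1) for _ in range(config['num_agents'])]
    # One coordinator process owns the networks and applies updates...
    coordinator = mp.Process(target=central_agent,
                             args=(net_params_queues, exp_queues, config))
    coordinator.start()
    # ...and one rollout worker per environment collects experience.
    workers = [mp.Process(target=agent,
                          args=(net_params_queues[i], exp_queues[i],
                                config, i))
               for i in range(config['num_agents'])]
    for w in workers:
        w.start()
    coordinator.join()
    for w in workers:
        w.join()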