def globalTrain():
    torch.manual_seed(123)
    mp = _mp.get_context('spawn')
    env, num_state, num_action = gym_env(world, stage, version, actions)  # define environment
    # env.seed(123 + idx)
    shared_model = A3C(num_state, num_action)
    shared_model.share_memory()
    # optimizer = Adam_global(shared_model.parameters(), lr=Args.lr, betas=Args.betas, eps=Args.eps, weight_decay=Args.weight_decay)
    optimizer = Adam_global(shared_model.parameters(), lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)

    processes = []
    counter = mp.Value('i', 0)
    lock = mp.Lock()

    for index in range(num_processes):
        process = mp.Process(target=train, args=(index, shared_model, optimizer, counter, lock))
        process.start()
        processes.append(process)

    process = mp.Process(target=test, args=(num_processes, shared_model))
    process.start()
    processes.append(process)

    for process in processes:
        process.join()

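# Usage sketch (not part of the original code): because globalTrain() spawns
# worker processes with the 'spawn' start method, it is expected to be called
# only from a __main__-guarded entry point, e.g.:
#
#     if __name__ == '__main__':
#         globalTrain()
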
def dummy_test(idx):
    torch.manual_seed(123 + idx)
    env, num_state, num_action = gym_env(world, stage, version, actions)
    done = True
    # load the recorded per-rollout rewards and the corresponding action sequences
    with open('record_reward_average.txt', 'rb') as fp:
        rewards = pickle.load(fp)
    with open('record_acts.txt', 'rb') as fp:
        acts = pickle.load(fp)
    max_id = rewards.index(max(rewards))
    best_acts = acts[max_id]
    print(max(rewards))
    print(max_id)
    # replay the best recorded action sequence
    for act in best_acts:
        if done:
            state = env.reset()
        state, step_reward, done, info = env.step(act)
        env.render()
    # plot the recorded rewards (note: do not overwrite `rewards` inside the loop,
    # otherwise the plot below would receive a scalar instead of the list)
    plt.plot(range(1, len(rewards) + 1), rewards)
    plt.xlabel('Episode')
    plt.ylabel('Episode Rewards Achieved')
    plt.title('Episode Rewards')
    plt.show()
    plt.close()

def test(idx, shared_model):
    torch.manual_seed(123 + idx)
    env, num_state, num_action = gym_env(world, stage, version, actions)
    model = A3C(num_state, num_action)
    # model.load_state_dict(torch.load(path.join(path.dirname(path.abspath(__file__)), 'trained_model.pth'), map_location='cpu'))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    step_counter = 0
    total_reward = 0
    acts = deque(maxlen=max_actions)
    while True:
        step_counter += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
        with torch.no_grad():
            if done:
                hx = torch.zeros((1, 512), dtype=torch.float)
                cx = torch.zeros((1, 512), dtype=torch.float)
            else:
                hx = hx.detach()
                cx = cx.detach()
            action, value, hx, cx = model(state, hx, cx)
            prob = F.softmax(action, dim=-1)
            action = prob.max(1, keepdim=True)[1].numpy()
        state, reward, done, _ = env.step(int(action))
        state = torch.from_numpy(state)
        env.render()
        acts.append(action)
        total_reward += reward
        if done:
            break


# if __name__ == "__main__":
#     torch.manual_seed(123)
#     env, num_state, num_action = gym_env(world, stage, version, actions)  # define environment
#     # env.seed(123 + idx)
#     shared_model = A3C(num_state, num_action)
#     shared_model.share_memory()
#     # optimizer = Adam_global(shared_model.parameters(), lr=Args.lr, betas=Args.betas, eps=Args.eps, weight_decay=Args.weight_decay)
#     optimizer = Adam_global(shared_model.parameters(), lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
#     train(0, shared_model, optimizer, 0, 0)

def test_global(idx):
    torch.manual_seed(123 + idx)
    env, num_state, num_action = gym_env(world, stage, version, actions)
    model = A3C(num_state, num_action)
    model.load_state_dict(
        torch.load(path.join(path.dirname(path.abspath(__file__)), 'trained_model.pth'),
                   map_location='cpu'))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    step_counter = 0
    total_reward = 0
    acts = deque(maxlen=max_actions)
    while True:
        step_counter += 1
        with torch.no_grad():
            if done:
                hx = torch.zeros((1, 512), dtype=torch.float)
                cx = torch.zeros((1, 512), dtype=torch.float)
            else:
                hx = hx.detach()
                cx = cx.detach()
            action, value, hx, cx = model(state, hx, cx)
            prob = F.softmax(action, dim=-1)
            action = prob.max(1, keepdim=True)[1].numpy()
        state, reward, done, info = env.step(int(action))
        state = torch.from_numpy(state)
        env.render()
        acts.append(action)
        total_reward += reward
        if done:
            break

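# Note: save_model() (used by train() below) and gym_env() are assumed to be
# defined elsewhere in this repo. A minimal sketch of the checkpoint helper that
# would produce the 'trained_model.pth' file loaded by test_global() might look
# like the following; the name and path are illustrative assumptions only.
def _save_model_sketch(model, filename='trained_model.pth'):
    # persist only the weights so they can be restored with load_state_dict()
    torch.save(model.state_dict(),
               path.join(path.dirname(path.abspath(__file__)), filename))
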
def train(idx, shared_model, optimizer, counter, lock):
    '''
    A3C for EACH actor-learner thread.
    Inputs:
        idx: a scalar, indicating the idx-th thread
        shared_model: the global (shared) model
        optimizer: the optimizer used for the asynchronous gradient updates
        counter: a shared scalar, the global step counter
        lock: a lock protecting the shared counter
    Returns:
        None
    '''
    # initialization
    torch.manual_seed(123 + idx)
    start = timeit.default_timer()
    env, num_state, num_action = gym_env(world, stage, version, actions)  # define environment
    env.seed(123 + idx)
    # model = A3C(num_state, num_action)
    model = shared_model
    model.train()
    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    step_counter = 0
    curr_episode = 0
    terminated = 0
    success = 0
    fail = 0
    acts = []
    record_reward = []
    record_reward_average = []
    record_acts = []
    success_acts = []

    while True:
        curr_episode += 1
        # sync with the shared model
        # model.load_state_dict(shared_model.state_dict())

        # periodically report progress and save data
        if curr_episode % 50 == 0:
            interval_timer = timeit.default_timer()
            print('Current episode:{}, terminated:{}, success:{}, fail:{}, elapsed time:{}'.format(
                curr_episode, terminated, success, fail, interval_timer - start))
            if curr_episode >= 50:
                with open('record_acts.txt', 'wb') as fp:
                    pickle.dump(record_acts, fp)
                with open('record_reward_average.txt', 'wb') as fp:
                    pickle.dump(record_reward_average, fp)
                save_model(model)

        if done:
            hx = torch.zeros((1, 512), dtype=torch.float)
            cx = torch.zeros((1, 512), dtype=torch.float)
            terminated += 1
        else:
            hx = hx.detach()
            cx = cx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # reset the accumulated losses
        action_loss = 0
        critic_loss = 0

        # roll out until terminal or the maximum number of local steps is reached
        for step in range(num_local_steps):
            step_counter += 1
            # perform action according to the current policy
            logits, value, hx, cx = model(state, hx, cx)
            prob = F.softmax(logits, dim=1)  # probability of choosing each action
            log_prob = F.log_softmax(logits, dim=1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            m = Categorical(prob)
            action = m.sample().item()  # sample an action from the categorical distribution
            acts.append(action)

            # receive reward and new state
            state, reward, done, info = env.step(action)
            with lock:
                counter.value += 1

            if done or step_counter >= num_global_step:
                step_counter = 0
                state = env.reset()
                if info['flag_get']:
                    success = success + 1
                else:
                    fail = fail + 1
            state = torch.from_numpy(state)

            values.append(value)
            log_probs.append(log_prob[0, action])
            rewards.append(reward)
            record_reward.append(reward)
            if done:
                break

        # bootstrap the critic value for the final state
        if not done:
            _, R, _, _ = model(state, hx, cx)
            R = R.detach()
        else:
            R = torch.zeros((1, 1), dtype=torch.float)

        record_acts.append(acts)
        avg_reward = sum(record_reward)  # total reward collected over this rollout
        record_reward_average.append(avg_reward)
        if info['flag_get']:
            success_acts.append(acts)
            with open('success_acts.txt', 'wb') as fp:
                pickle.dump(success_acts, fp)
        record_reward = []
        acts = []

        # gradient ascent on the actor, descent on the critic
        values.append(R)
        estimator = torch.zeros((1, 1), dtype=torch.float)
        for i in reversed(range(len(rewards))):
            R = rewards[i] + discount * R
            advantage_fc = rewards[i] + discount * values[i + 1] - values[i]
            # approximate the actor gradient using the Generalized Advantage Estimator
            estimator = discount * tau * estimator + advantage_fc
            # accumulate gradients w.r.t. the actor
            action_loss = action_loss + log_probs[i] * estimator.detach() + beta * entropies[i]
            # accumulate gradients w.r.t. the critic
            critic_loss = critic_loss + (R - values[i]) ** 2 / 2

        # perform asynchronous update
        optimizer.zero_grad()
        total_loss = critic_loss_coef * critic_loss - action_loss
        total_loss.backward()
        # clip gradients after backward(), so the computed gradients are actually clipped
        nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

        if info['flag_get']:
            with open('success_acts.txt', 'wb') as fp:
                pickle.dump(record_acts, fp)
            save_model(shared_model)

        if curr_episode == int(num_global_step / num_local_steps):
            end = timeit.default_timer()
            print('Training process {} terminated, run {} episodes,\n'
                  'with {} success and {} failure, elapsed time {}'.format(
                      idx, terminated, success, fail, end - start))
            return

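# Illustration (assumption, not called by train()): the backward loop above
# implements the n-step return and Generalized Advantage Estimation recursions
#     R_t     = r_t + discount * R_{t+1}
#     delta_t = r_t + discount * V(s_{t+1}) - V(s_t)
#     A_t     = delta_t + discount * tau * A_{t+1}
# A minimal standalone version on plain Python floats, for reference:
def _gae_sketch(rewards, values, bootstrap_value, gamma, lam):
    '''Return per-step GAE advantages. `values` holds V(s_t) for each step and
    `bootstrap_value` is V(s_T) for the state reached after the last step.'''
    values = list(values) + [bootstrap_value]
    advantages = []
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        advantages.append(gae)
    advantages.reverse()
    return advantages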