def test(rank, args, dtype):
    if args.upload:
        api_key = ''
        with open('api_key.json', 'r+') as api_file:
            api_key = json.load(api_file)['api_key']

    timestring = str(date.today()) + '_' + time.strftime(
        "%Hh-%Mm-%Ss", time.localtime(time.time()))
    run_name = args.load_name + '_' + timestring
    configure("logs/es_evaluate" + run_name, flush_secs=5)

    torch.manual_seed(args.seed)
    curr_seed = args.seed

    env = create_atari_env(args.env_name, True, run_name)
    env.seed(args.seed + rank)
    state = env.reset()

    model = EvolutionNet(state.shape[0], env.action_space).type(dtype)
    if args.load_name is not None:
        model.load_state_dict(
            pickle.load(open('models/' + args.load_name + '.p', 'rb')))
    else:
        print('A model is needed to evaluate. Use the --load-name argument to '
              'point to a saved model\'s pickled state dictionary')

    for step in range(100):
        # evaluate on the environment
        done = False
        total_reward = 0
        steps = 0
        state = env.reset()
        while not done:
            state = torch.from_numpy(state).type(dtype)
            action_probs = model(Variable(state.unsqueeze(0), volatile=True))
            action = np.argmax(action_probs.data.cpu().numpy())
            next_state, reward, done, _ = env.step(action)
            state = next_state
            total_reward += reward
            steps += 1
        print('Reward from process ' + str(rank) + ': ' + str(total_reward) +
              ' after ' + str(steps) + ' steps')
        # log the episode reward
        log_value('Reward', total_reward, step)

    env.close()
    if args.upload:
        gym.upload('monitor/' + run_name, api_key=api_key)
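# EvolutionNet is used above but not defined in this snippet. A minimal sketch
# that matches how it is called here (state vector in, one score per action
# out, with the caller taking the argmax); it assumes a RAM-style vector
# observation (as in the Asteroids-ram-v0 file below) and illustrative layer
# sizes, not the repository's actual architecture.
import torch.nn as nn
import torch.nn.functional as F


class EvolutionNet(nn.Module):
    def __init__(self, state_space, action_space):
        super(EvolutionNet, self).__init__()
        self.fc1 = nn.Linear(state_space, 64)     # hidden size is illustrative
        self.fc2 = nn.Linear(64, action_space.n)  # one output per action

    def forward(self, x):
        x = F.relu(self.fc1(x))
        # the caller only takes an argmax, so softmax vs. raw scores yields the same action
        return F.softmax(self.fc2(x))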
def test(rank, params, shared_model):
    # desynchronize the agents
    torch.manual_seed(params.seed + rank)
    # create the environment
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    # since this is test mode, we only need to evaluate the model
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    # initialize all the required parameters
    reward_sum = 0
    done = True
    # start time to measure the duration of the computations
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            # reload the last state of the model
            model.load_state_dict(shared_model.state_dict())
            # reinitialize the cell and hidden states
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            # keep the same cell and hidden states
            # while making sure they are torch Variables
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        # get the predictions of the model:
        # output of the critic, output of the actor, hidden and cell states
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        # play the action immediately because there is no need to train
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        reward_sum += reward
        if done:
            # when the game is done
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            # reinitialize everything after the game is done
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            # take a break of 60 seconds to let the other agents practice
            time.sleep(60)
        # get the new state
        state = torch.from_numpy(state)
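# ActorCritic itself is not defined in this snippet. The interface used above
# (input (x, (hx, cx)), a 256-unit LSTM state, and outputs for the critic, the
# actor logits and the new recurrent state) matches the widely used pytorch-a3c
# architecture; the sketch below follows that layout, with the exact layer
# sizes being an assumption rather than this repository's code.
import torch.nn as nn
import torch.nn.functional as F


class ActorCritic(nn.Module):
    def __init__(self, num_inputs, action_space):
        super(ActorCritic, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.lstm = nn.LSTMCell(32 * 3 * 3, 256)  # a 42x42 input ends up as a 3x3 feature map
        self.critic_linear = nn.Linear(256, 1)               # V(s)
        self.actor_linear = nn.Linear(256, action_space.n)   # action logits

    def forward(self, inputs):
        x, (hx, cx) = inputs
        x = F.elu(self.conv1(x))
        x = F.elu(self.conv2(x))
        x = F.elu(self.conv3(x))
        x = F.elu(self.conv4(x))
        hx, cx = self.lstm(x.view(-1, 32 * 3 * 3), (hx, cx))
        return self.critic_linear(hx), self.actor_linear(hx), (hx, cx)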
def test(rank, args, shared_model, dtype):
    test_ctr = 0
    torch.manual_seed(args.seed + rank)

    # set up the logger
    timestring = str(date.today()) + '_' + time.strftime(
        "%Hh-%Mm-%Ss", time.localtime(time.time()))
    run_name = args.save_name + '_' + timestring
    configure("logs/run_" + run_name, flush_secs=5)

    env = create_atari_env(args.env_name, args.evaluate, run_name)
    env.seed(args.seed + rank)
    state = env.reset()

    model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    model.eval()

    state = torch.from_numpy(state).type(dtype)
    reward_sum = 0
    max_reward = -99999999
    done = True
    start_time = time.time()
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(dtype), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(dtype), volatile=True)
        else:
            cx = Variable(cx.data.type(dtype), volatile=True)
            hx = Variable(hx.data.type(dtype), volatile=True)

        value, logit, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))

            # if not stuck or args.evaluate:
            log_value('Reward', reward_sum, test_ctr)
            log_value('Episode length', episode_length, test_ctr)

            if reward_sum >= max_reward:
                pickle.dump(shared_model.state_dict(),
                            open(args.save_name + '_max' + '.p', 'wb'))
                max_reward = reward_sum

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            test_ctr += 1

            if test_ctr % 10 == 0 and not args.evaluate:
                pickle.dump(shared_model.state_dict(),
                            open(args.save_name + '.p', 'wb'))

            if not args.evaluate:
                time.sleep(60)
            elif test_ctr == evaluation_episodes:
                # Ensure the environment is closed so we can complete the submission
                env.close()
                gym.upload('monitor/' + run_name, api_key=api_key)

        state = torch.from_numpy(state).type(dtype)
import pickle

import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F
import torch.nn.init as init
from torch.autograd import Variable

from a3c_envs import create_atari_env
from tensorboard_logger import configure, log_value

# dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
dtype = torch.FloatTensor

env = create_atari_env('Asteroids-ram-v0')
state = env.reset()


# def weights_init(m):
#     classname = m.__class__.__name__
#     if classname.find('Linear') != -1:
#         init.xavier_uniform(m.weight.data)
#         m.bias.data.fill_(0)


class DQN(nn.Module):
    def __init__(self, state_space, action_space):
        super(DQN, self).__init__()
        self.l1 = nn.Linear(state_space, 16)
        self.l2 = nn.Linear(16, 8)
        self.l3 = nn.Linear(8, action_space.n)
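    # Sketch (an assumption, not the repository's actual code): the class above
    # is cut off before its forward pass; a minimal version with ReLU
    # activations between the layers and raw Q-values as the output.
    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        return self.l3(x)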
def train(rank, args, shared_model, dtype):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    state = env.reset()

    model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    model.train()

    values = []
    log_probs = []

    state = torch.from_numpy(state).type(dtype)
    done = True
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256).type(dtype))
            hx = Variable(torch.zeros(1, 256).type(dtype))
        else:
            cx = Variable(cx.data.type(dtype))
            hx = Variable(hx.data.type(dtype))

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.cpu().numpy())
            done = done or episode_length >= args.max_episode_length

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state).type(dtype)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1).type(dtype)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1).type(dtype)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.beta * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
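# ensure_shared_grads is called above but not defined in this file. A common
# implementation (a sketch of the usual helper, not necessarily this
# repository's exact version) points the shared model's gradients at the local
# worker's gradients once per update:
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad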
        self.lr = 0.0001  # learning rate
        self.gamma = 0.99  # discount factor gamma
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'


# Main run
os.environ['OMP_NUM_THREADS'] = '1'  # 1 OpenMP thread per process
params = Params()  # get all our parameters and initialize them
torch.manual_seed(params.seed)  # set the seed
env = create_atari_env(
    params.env_name)  # get the environment, creating an optimized env using universe
shared_model = ActorCritic(
    env.observation_space.shape[0],
    env.action_space)  # model shared by every agent, stored in shared memory
shared_model.share_memory()
optimizer = a3c_custom_optim.SharedAdam(
    shared_model.parameters(),
    lr=params.lr)  # link the optimizer to the shared model so it acts on its parameters
optimizer.share_memory()  # store the optimizer in shared memory
processes = []
p = mp.Process(target=test, args=(
    params.num_processes, params,
    shared_model))  # run the function in an independent process (via torch.multiprocessing)
p.start()
processes.append(p)
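# Sketch (an assumption about how this main block typically continues): after
# the test process, one training agent is started per process with its own
# rank, and the main process then waits for all of them to finish.
for rank in range(params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()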
    default=None,
    metavar='SN',
    help='path/prefix for the filename to load shared model\'s parameters')
parser.add_argument(
    '--evaluate',
    action="store_true",
    help='whether to evaluate results and upload to gym')

if __name__ == '__main__':
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    dtype = torch.cuda.FloatTensor if torch.cuda.is_available(
    ) else torch.FloatTensor

    env = create_atari_env(args.env_name)
    state = env.reset()
    shared_model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    if args.load_name is not None:
        shared_model.load_state_dict(
            pickle.load(open(args.load_name + '.p', 'rb')))
    shared_model.share_memory()

    # train(1, args, shared_model, dtype)

    processes = []

    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model, dtype))
    p.start()
    processes.append(p)
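    # Sketch (an assumption about the usual continuation): start one training
    # worker per process, mirroring the commented-out train(...) call above,
    # then wait for every process to finish.
    for rank in range(args.num_processes):
        p = mp.Process(target=train, args=(rank, args, shared_model, dtype))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()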
def train(rank, params, shared_model, optimizer):
    # we have to desynchronize every training agent:
    # rank shifts each seed, so with n agents rank goes from 0 to n-1
    torch.manual_seed(params.seed + rank)  # desync each training agent
    # create the environment for Breakout
    env = create_atari_env(params.env_name)
    # align the seed of the environment with the agent:
    # each agent has its own copy of the environment, so we
    # associate a different seed with each agent to give it a separate env
    env.seed(params.seed + rank)
    # create the A3C model
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    # get the state of the env; the state is 1 by 42 by 42 (1 channel, black and white)
    state = env.reset()
    # convert it into a torch tensor
    state = torch.from_numpy(state)
    # done is when the game is over
    done = True
    episode_length = 0
    while True:
        # increment the episode length
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())
        if done:
            # reinitialize the hidden and cell states;
            # since the output is 256 we need 256 zeroes
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            # keep the data
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        values = []  # values of the critic
        log_probs = []
        rewards = []
        entropies = []
        # loop over the exploration steps
        for step in range(params.num_steps):
            # get the predictions of the model applied to the input
            # (the state needs to be unsqueezed into a batch of one);
            # this gives the value of the V function, the action values and the states
            value, action_values, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            # get the probabilities using softmax
            prob = F.softmax(action_values)
            # remember the entropy is minus the sum of the product of log prob times prob
            log_prob = F.log_softmax(action_values)
            entropy = -(log_prob * prob).sum(1)
            # append to entropies
            entropies.append(entropy)
            # take a random draw of the available actions
            action = prob.multinomial().data
            # keep the log probability associated with the chosen action
            log_prob = log_prob.gather(1, Variable(action))
            # append to values and log_probs
            values.append(value)
            log_probs.append(log_prob)
            # by reaching a new state, we get a reward; refer to the env code
            state, reward, done, _ = env.step(action.numpy())
            # make sure the agent is not stuck in a state:
            # limit the time by limiting max_episode_length
            done = (done or episode_length >= params.max_episode_length)
            # make sure the reward stays between -1 and +1
            reward = max(min(reward, 1), -1)
            # check if the game is done and then restart the environment
            if done:
                episode_length = 0
                state = env.reset()
            # remember that the state is an image in the form of a numpy array
            state = torch.from_numpy(state)
            # append the reward to the rewards now
            rewards.append(reward)
            if done:
                # stop exploration if done
                break
        # cumulative reward
        R = torch.zeros(1, 1)
        if not done:
            # the cumulative reward starts from the value output by the model in the last state
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data
        values.append(Variable(R))
        # calculate the losses now; remember we have 2 types of loss
        policy_loss = 0
        value_loss = 0
        # R must be a torch Variable because it is a term of the value loss, which we differentiate
        R = Variable(R)
        # initialise the GAE, generalised advantage estimation
        # (the advantage of an action in a state compared to the value of that state)
        gae = torch.zeros(1, 1)  # A(a, s) = Q(a, s) - V(s)
        # reversed so that we can move back in time
        for i in reversed(range(len(rewards))):
            R = params.gamma * R + rewards[i]
            # we will get R = r_0 + gamma * r_1 + gamma^2 * r_2 + ...
            #                 + gamma^(n-1) * r_(n-1) + gamma^nb_steps * V(last state)
            # compute the advantage of the return against the value
            advantage = R - values[i]
            # get the value loss, Q*(a*, s) = V*(s):
            # the loss generated by the predictions of the V function output by the critic
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # use GAE for the policy loss: the temporal difference of the state value
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            gae = gae * params.gamma * params.tau + TD
            # gae = sum_i (gamma*tau)^i * TD(i)
            # we can now finally calculate the policy loss:
            # the log probabilities are negative values, and we maximise the
            # probability of playing the action that maximises the advantage;
            # the purpose of the entropy term is to prevent falling too quickly
            # into a trap where one action gets probability 1 and all the others 0
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]
            # policy_loss = - sum_i log(pi_i) * gae_i - 0.01 * entropy_i
        # apply stochastic gradient descent
        optimizer.zero_grad()
        # give more importance to the policy loss since it is smaller than the value loss
        (policy_loss + 0.5 * value_loss).backward()
        # prevent the gradient from taking very large values:
        # 40 is such that the norm of the gradient stays between 0 and 40
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        # make sure the model and the shared model share the same gradients
        ensure_shared_grads(model, shared_model)
        # now optimize to reduce the losses
        optimizer.step()
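# A tiny standalone numeric illustration of the backward return/GAE recursion
# in the training loop above. The reward and value numbers are hypothetical and
# purely for illustration; gamma and tau match the Params defaults.
def _gae_sketch():
    gamma, tau = 0.99, 1.0
    rewards = [0.0, 1.0, 0.0]        # r_0, r_1, r_2
    values = [0.5, 0.6, 0.4, 0.3]    # V(s_0) .. V(s_3); values[-1] plays the role of R
    R, gae = values[-1], 0.0
    steps = []
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                            # n-step return
        td = rewards[i] + gamma * values[i + 1] - values[i]   # temporal difference
        gae = gamma * tau * gae + td                          # GAE accumulator
        steps.append((i, round(R, 4), round(gae, 4)))
    return steps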