def train(rank, args, shared_model, counter, lock, optimizer):
    torch.manual_seed(args.seed + rank)
    env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    if args.point_cloud_model:
        model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                            args.frame_dim)
    if args.cuda:
        model = model.cuda()
    model.train()

    state = env.reset()
    done = True

    # monitoring
    total_reward_for_num_steps_list = []
    episode_total_rewards_list = []
    avg_reward_for_num_steps_list = []
    total_length = 0
    episode_length = 0
    n_episode = 0
    total_reward_for_episode = 0
    all_rewards_in_episode = []

    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            total_length += 1
            if args.cuda:
                if args.point_cloud_model:
                    state = (state[0].cuda(), state[1].cuda())
                else:
                    state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state, (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)
            log_probs.append(log_prob)

            action_int = action.cpu().numpy()[0][0].item()
            state, reward, done, _ = env.step(action_int, verbose=False)
            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                total_length -= 1
                total_reward_for_episode = sum(all_rewards_in_episode)
                episode_total_rewards_list.append(total_reward_for_episode)
                all_rewards_in_episode = []
                state = env.reset()
                print('Process {} Episode {} Over with Length: {} and Reward: {: .2f}. '
                      'Total Trained Length: {}'.format(
                          rank, n_episode, episode_length,
                          total_reward_for_episode, total_length))
                sys.stdout.flush()
                episode_length = 0
                n_episode += 1

            values.append(value)
            rewards.append(reward)
            all_rewards_in_episode.append(reward)
            if done:
                break

        if args.synchronous:
            if total_reward_for_episode >= args.solved_reward:
                print("Process {} Solved with Reward {}".format(
                    rank, total_reward_for_episode))
                env.close()
                break

        # Monitoring
        total_reward_for_num_steps = sum(rewards)
        total_reward_for_num_steps_list.append(total_reward_for_num_steps)
        avg_reward_for_num_steps = total_reward_for_num_steps / len(rewards)
        avg_reward_for_num_steps_list.append(avg_reward_for_num_steps)

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        gae = torch.zeros(1, 1)
        if args.cuda:
            if args.point_cloud_model:
                state = (state[0].cuda(), state[1].cuda())
            else:
                state = state.cuda()
            R = R.cuda()
            gae = gae.cuda()
        if not done:  # bootstrap: use the critic's value of the last state as R
            value, _, _ = model((state, (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        # import pdb; pdb.set_trace()  # good place to breakpoint to see training cycle
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
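# The train loops in this file call ensure_shared_grads() to hand the local
# worker's gradients to the shared model before optimizer.step(). The helper is
# not shown in this excerpt; below is a minimal sketch following the common
# pytorch-a3c pattern (an assumption, not necessarily this repo's exact code).
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        # if gradients were already attached this step, keep them
        if shared_param.grad is not None:
            return
        # point the shared parameter's grad at the local worker's gradient
        # (if the local model lives on GPU, a .cpu() copy may be needed here)
        shared_param._grad = param.grad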
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
if args.cuda:
    print('Using', torch.cuda.get_device_name(0))
    torch.cuda.init()
torch.manual_seed(args.seed)

args.config_dict = {
    'max_episode_length': args.max_episode_length,
    'point_cloud_model': args.point_cloud_model
}
# This env instance exists only to read the params the model needs
# (action/observation space sizes, frame resolution); it is closed below.
env = AI2ThorEnv(config_dict=args.config_dict)
if args.point_cloud_model:
    shared_model = ActorCritic(env.action_space.n)
else:
    args.frame_dim = env.config['resolution'][-1]
    shared_model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                               args.frame_dim)
if args.cuda:
    shared_model = shared_model.cuda()
shared_model.share_memory()
env.close()

optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
optimizer.share_memory()
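# my_optim.SharedAdam above is an Adam variant whose optimizer state lives in
# shared memory, so every worker updates the same first/second moment
# estimates. Its definition is not in this excerpt; this is a hedged sketch of
# the widely used pytorch-a3c version, not necessarily this repo's exact code.
import math

import torch
import torch.optim as optim


class SharedAdam(optim.Adam):
    """Adam whose state tensors are allocated eagerly so they can be shared."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
        # allocate state up front so share_memory() can move it before forking
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        # standard Adam update, written out so it uses the shared state tensors
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss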
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.frame_dim)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True  # must be True initially so the model syncs on the first step
    start_time = time.time()

    # a ring buffer of recent actions, used below to detect a stuck agent
    actions = deque(maxlen=100)
    episode_length = 0
    episodes = 0

    vis = Visdom()
    assert vis.check_connection()
    vis.close()
    win = vis.line(X=[0.], Y=[0.], win='testing_Rewards',
                   opts=dict(title='testing_Rewards'))

    while True:
        episode_length += 1
        if args.atari and args.atari_render:
            env.render()
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float(), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck:
        # in test mode the greedy agent can repeat an action ad infinitum
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            print('In test. Episode over because agent repeated action {} times'
                  .format(actions.maxlen))
            done = True

        if done:
            print("In test. Time {}, num steps over all threads {}, FPS {:.0f}, "
                  "episode reward {}, episode length {}".format(
                      time.strftime("%Hh %Mm %Ss",
                                    time.gmtime(time.time() - start_time)),
                      counter.value, counter.value / (time.time() - start_time),
                      reward_sum, episode_length))
            vis.line(X=[episodes], Y=[reward_sum], win='testing_Rewards',
                     update='append')
            episodes += 1
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            # sleep so the test worker does not hog resources between episodes
            time.sleep(args.test_sleep_time)
            print('testing...')

        state = torch.from_numpy(state)
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    if args.point_cloud_model:
        model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                            args.frame_dim)
    if args.cuda:
        model = model.cuda()
    model.eval()

    state = env.reset()
    reward_sum = 0
    done = True

    save = '{}-steps{}-process{}-lr{}-entropy_coef{}'.format(
        "point" if args.point_cloud_model else "conv", args.num_steps,
        args.num_processes, args.lr, args.entropy_coef)
    save = os.path.join('logs', save)
    os.makedirs(save, exist_ok=True)
    if args.model:
        shared_model.load_state_dict(
            torch.load(os.path.join(save, "solved_ai2thor.pth")))
    else:
        logger = CSVLogger(os.path.join(save, 'test.csv'))
        fields = ['episode_reward', 'frames_rendered']
        logger.log(fields)

    start_time = time.time()
    # NB: the repeat-action "stuck agent" hack used in the other test variants
    # is disabled in this version.
    episode_length = 0

    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(deepcopy(shared_model.state_dict()))
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            if args.cuda:
                if args.point_cloud_model:
                    state = (state[0].cuda(), state[1].cuda())
                else:
                    state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state, (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            # greedy action: in test mode we take the argmax instead of sampling
            action = prob.max(1, keepdim=True)[1].cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0], verbose=False)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            print("Time {}, num steps over all threads {}, FPS {:.0f}, "
                  "episode reward {: .2f}, episode length {}".format(
                      time.strftime("%Hh %Mm %Ss",
                                    time.gmtime(time.time() - start_time)),
                      counter.value, counter.value / (time.time() - start_time),
                      reward_sum, episode_length))
            if not args.model:
                logger.log(["{: .2f}".format(reward_sum), counter.value])
            if reward_sum >= args.solved_reward:
                print("Solved Testing with Reward {}".format(reward_sum))
                torch.save(model.state_dict(),
                           os.path.join(save, "solved_ai2thor.pth"))
                env.close()
                if not args.model:  # logger only exists when not replaying a model
                    logger.close()
                break
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(args.test_sleep_time)
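# CSVLogger above is a small logging helper not included in this excerpt. A
# minimal sketch that matches how it is used here (log() writes one list as one
# row, close() closes the file) could look like this -- an assumption about the
# interface, not the repo's actual implementation:
import csv


class CSVLogger:
    def __init__(self, path):
        # overwrite any previous log at this path
        self.file = open(path, 'w', newline='')
        self.writer = csv.writer(self.file)

    def log(self, row):
        self.writer.writerow(row)
        self.file.flush()  # keep the CSV readable while training is running

    def close(self):
        self.file.close()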
def train(rank, args, shared_model, counter, lock, device, optimizer=None):
    torch.manual_seed(args.seed + rank)
    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.frame_dim)
    model = model.to(device)
    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    # monitoring
    total_reward_for_num_steps_list = []
    episode_total_rewards_list = []
    all_rewards_in_episode = []
    avg_reward_for_num_steps_list = []
    total_length = 0
    episode_length = 0
    episodes = 0

    vis = Visdom()
    assert vis.check_connection()
    vis.close()
    vis.line(X=[0.], Y=[0.], win='training_Rewards' + str(rank),
             opts=dict(title='training_Rewards' + str(rank)))

    while True:
        episodes += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            total_length += 1
            # run the forward pass on `device`, then bring the results back to
            # CPU so the rollout buffers stay device-agnostic
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float().to(device),
                 (hx.to(device), cx.to(device))))
            value = value.cpu()
            logit = logit.cpu()
            hx = hx.cpu()
            cx = cx.cpu()
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            action_int = action.numpy()[0][0].item()
            state, reward, done, _ = env.step(action_int)
            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                total_length -= 1
                total_reward_for_episode = sum(all_rewards_in_episode)
                episode_total_rewards_list.append(total_reward_for_episode)
                all_rewards_in_episode = []
                vis.line(X=[episodes], Y=[total_reward_for_episode],
                         win='training_Rewards' + str(rank), update='append')
                print('In Train. Episode Over. Total Length: {}. '
                      'Total reward for episode: {}'.format(
                          total_length, total_reward_for_episode))
                print('In Train. Step no: {}. total length: {}'.format(
                    episode_length, total_length))
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            all_rewards_in_episode.append(reward)
            if done:
                break

        # No interaction with environment below.

        # Monitoring
        total_reward_for_num_steps = sum(rewards)  # accumulated over the rollout
        total_reward_for_num_steps_list.append(total_reward_for_num_steps)
        avg_reward_for_num_steps = total_reward_for_num_steps / len(rewards)
        avg_reward_for_num_steps_list.append(avg_reward_for_num_steps)

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        if not done:  # bootstrap: use the critic's value of the last state as R
            value, _, _ = model(
                (state.unsqueeze(0).float().to(device),
                 (hx.to(device), cx.to(device))))
            R = value.detach().cpu()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        # import pdb; pdb.set_trace()  # good place to breakpoint to see training cycle
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.frame_dim)
    if args.cuda:
        model = model.cuda()
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()

    # a ring buffer of recent actions, used below to detect a stuck agent
    actions = deque(maxlen=100)
    episode_length = 0

    while True:
        episode_length += 1
        if args.atari and args.atari_render:
            env.render()
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 64)
            hx = torch.zeros(1, 64)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            if args.cuda:
                state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float(), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            # greedy action: in test mode we take the argmax instead of sampling
            action = prob.max(1, keepdim=True)[1].cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0], verbose=False)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck:
        # in test mode the greedy agent can repeat an action ad infinitum
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            print('In test. Episode over because agent repeated action {} times'
                  .format(actions.maxlen))
            done = True

        if done:
            print("Time {}, num steps over all threads {}, FPS {:.0f}, "
                  "episode reward {}, episode length {}".format(
                      time.strftime("%Hh %Mm %Ss",
                                    time.gmtime(time.time() - start_time)),
                      counter.value, counter.value / (time.time() - start_time),
                      reward_sum, episode_length))
            if reward_sum >= args.solved_reward:
                print("Solved Testing with Reward {}".format(reward_sum))
                torch.save(model.state_dict(),
                           "solved_{}.pth".format("atari" if args.atari else "ai2thor"))
                env.close()
                break
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(args.test_sleep_time)

        state = torch.from_numpy(state)
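# None of the snippets above show how the workers are launched. A typical
# wiring, following the standard pytorch-a3c layout (a sketch under that
# assumption, matching the train/test signatures of the first pair of
# functions above): the main process builds shared_model and the shared
# optimizer, then forks one test process and args.num_processes train
# processes that all share the model, a global step counter, and a lock.
import torch.multiprocessing as mp

if __name__ == '__main__':
    counter = mp.Value('i', 0)  # global step counter, incremented by trainers
    lock = mp.Lock()            # guards increments of the counter

    processes = []
    p = mp.Process(target=test, args=(args.num_processes, args,
                                      shared_model, counter))
    p.start()
    processes.append(p)
    for rank in range(args.num_processes):
        p = mp.Process(target=train, args=(rank, args, shared_model,
                                           counter, lock, optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()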