def test(args, model, env):
    torch.manual_seed(args.seed)
    # env = create_atari_env(args.env_name)
    # env = create_car_racing_env()
    env.seed(args.seed)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    # a quick hack to prevent the agent from stucking
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        # env.render()
        episode_length += 1
        # Sync with the shared model
        if done:
            # model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, model.lstm_size), volatile=True)
            hx = Variable(torch.zeros(1, model.lstm_size), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        # a quick hack to prevent the agent from stucking
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            return
            # time.sleep(60)
        state = torch.from_numpy(state)
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    # a quick hack to prevent the agent from stucking
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
        if done and counter.value > args.max_steps:
            test_final(shared_model, env, args)
            save_model(shared_model, args)
            exit()
        with torch.no_grad():
            value, logit = model(state.unsqueeze(0))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()
        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        # a quick hack to prevent the agent from stucking
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
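# Minimal launch sketch (an illustration, not taken from the snippets above): the test
# function in this style is normally spawned as one extra torch.multiprocessing process
# alongside the training workers. `train` and the argparse fields are assumed here;
# `create_atari_env` and `ActorCritic` are the names used in the snippet above.
import torch.multiprocessing as mp

if __name__ == '__main__':
    args = parser.parse_args()  # assumes a parser defining env_name, seed, num_processes, ...
    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()          # parameters live in shared memory for all workers
    counter = mp.Value('i', 0)           # global step counter read by test()

    processes = []
    p = mp.Process(target=test, args=(args.num_processes, args, shared_model, counter))
    p.start()
    processes.append(p)
    for rank in range(args.num_processes):   # training workers (train() is assumed)
        p = mp.Process(target=train, args=(rank, args, shared_model, counter))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()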
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)  # giving the test agent its own seed, distinct from the trainers
    env = create_atari_env(params.env_name, video=True)  # running an environment with a video
    env.seed(params.seed + rank)  # giving the environment its own seed
    model = ActorCritic(env.observation_space.shape[0], env.action_space)  # creating one model
    model.eval()  # putting the model in "eval" mode because it won't be trained
    state = env.reset()  # getting the input images as numpy arrays
    state = torch.from_numpy(state)  # converting them into torch tensors
    reward_sum = 0  # initializing the sum of rewards to 0
    done = True  # initializing done to True
    start_time = time.time()  # getting the starting time to measure the computation time
    actions = deque(maxlen=100)  # cf https://pymotw.com/2/collections/deque.html
    episode_length = 0  # initializing the episode length to 0
    while True:  # repeat
        episode_length += 1  # incrementing the episode length by one
        if done:  # synchronizing with the shared model (same as train.py)
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()  # the test agent does not explore, it directly plays the best action
        state, reward, done, _ = env.step(action[0, 0])  # done = done or episode_length >= params.max_episode_length
        reward_sum += reward
        if done:  # printing the results at the end of each episode
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0  # reinitializing the sum of rewards
            episode_length = 0  # reinitializing the episode length
            actions.clear()  # reinitializing the actions
            state = env.reset()  # reinitializing the environment
            time.sleep(60)  # taking a one-minute break to let the other agents practice (if the game is done)
        state = torch.from_numpy(state)  # new state and we continue
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    torch.save(shared_model.state_dict(), 't.pkl')
    env = Env(args.seed + rank)
    model = ActorCritic(1, env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    # env.visual()
    start_time = time.time()
    # a quick hack to prevent the agent from stucking
    actions = deque(maxlen=500)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
        with torch.no_grad():
            value, logit = model((state.unsqueeze(0)).type(torch.FloatTensor))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()
        print(action)
        state, reward, done = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        # a quick hack to prevent the agent from stucking
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            # env.visual()
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
def run(args):
    device = torch.device("cpu")
    env = gym.make('SpaceInvaders-v0')
    state_size = env.observation_space.shape
    action_size = env.action_space.n
    model = ActorCritic([1, 4, 84, 84], action_size).to(device)
    opt = SharedRMSprop(model.parameters(), lr=args.lr, alpha=args.alpha, eps=1e-8,
                        weight_decay=args.weight_decay, momentum=args.momentum, centered=False)
    opt_lock = mp.Lock()
    scheduler = LRScheduler(args)
    if args.load_fp:
        checkpoint = torch.load(args.load_fp)
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])
    if args.train:
        start = time.time()
        model.share_memory()
        model.train()
        step_counter, max_reward, ma_reward, ma_loss = [mp.Value('d', 0.0) for _ in range(4)]
        processes = []
        if args.num_procs == -1:
            args.num_procs = mp.cpu_count()
        for rank in range(args.num_procs):
            p = mp.Process(target=train, args=(rank, args, device, model, opt, opt_lock,
                                               scheduler, step_counter, max_reward,
                                               ma_reward, ma_loss))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        if args.verbose > 0:
            print(f"Seconds taken: {time.time() - start}")
        if args.save_fp:
            torch.save({
                'model_state_dict': model.state_dict(),
                # 'optimizer_state_dict': opt.state_dict(),
            }, args.save_fp)
    if args.test:
        model.eval()
        test(args, device, model)
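# Illustrative argparse sketch covering only the fields run() reads above; the flag
# names and defaults are assumptions, not taken from the source, and LRScheduler may
# read additional fields that are not shown here.
import argparse

parser = argparse.ArgumentParser(description='A3C on SpaceInvaders-v0')
parser.add_argument('--lr', type=float, default=1e-4)        # SharedRMSprop learning rate
parser.add_argument('--alpha', type=float, default=0.99)     # RMSprop smoothing constant
parser.add_argument('--weight-decay', type=float, default=0.0)
parser.add_argument('--momentum', type=float, default=0.0)
parser.add_argument('--num-procs', type=int, default=-1)     # -1 means "use all CPU cores"
parser.add_argument('--train', action='store_true')
parser.add_argument('--test', action='store_true')
parser.add_argument('--load-fp', type=str, default='')       # checkpoint to resume from
parser.add_argument('--save-fp', type=str, default='')       # where to save the final model
parser.add_argument('--verbose', type=int, default=1)

if __name__ == '__main__':
    run(parser.parse_args())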
def test(shared_model, render=0):
    # torch.manual_seed(rank)
    env = create_atari_env(args.rom)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    # a quick hack to prevent the agent from stucking
    actions = deque(maxlen=100)
    episode_length = 0
    cx = hx = None
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0).type(FloatTensor), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        # print logit.data.numpy()
        action = prob.max(1, keepdim=True)[1].data.cpu().numpy()
        state, reward, done, _ = env.step(action[0, 0])
        if render == 1:
            env.render()
            time.sleep(0.03)
        done = done or episode_length >= 10000
        reward_sum += reward
        # a quick hack to prevent the agent from stucking
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)  # give the test agent its own seed
    env = create_atari_env(params.env_name, video=True)  # run the environment with video recording
    env.seed(params.seed + rank)  # give the environment its own seed
    model = ActorCritic(env.observation_space.shape[0], env.action_space)  # create the model
    model.eval()  # the model will not be trained
    state = env.reset()  # get the input image as a numpy array
    state = torch.from_numpy(state)  # convert it to a torch tensor
    reward_sum = 0
    done = True
    start_time = time.time()  # starting time
    actions = deque(maxlen=100)  # https://pymotw.com/2/collections/deque.html
    episode_length = 0
    while True:
        episode_length += 1  # increment the episode length by one
        if done:  # synchronize with the shared model, as in training mode
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()  # the test agent does not explore; it plays the best action directly
        state, reward, done, _ = env.step(action[0, 0])  # done = done or episode_length >= params.max_episode_length
        reward_sum += reward
        if done:  # print the result at the end of each episode
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)  # wait one minute to let the other agents train
        state = torch.from_numpy(state)  # get the new state and continue
def test(rank, args, model_path, all_cooked_time, all_cooked_bw, all_vp_time, all_vp_unit, num):
    torch.manual_seed(args.seed + rank)
    env = Environment(args, all_cooked_time, all_cooked_bw, all_vp_time, all_vp_unit,
                      random_seed=args.seed + rank)
    model = ActorCritic()
    model.load_state_dict(torch.load(model_path))
    model.eval()
    state = env.reset()
    state_time = time.time()
    episode_length = 0
    # log = open('new-result-1/test-vp-log20000.txt', 'w')
    # log = open('results-3/log20000.txt', 'w')
    # log = open('train_norway_result-2/test_log3000.txt', 'w')
    log = open('result-1/log-' + str(num) + '.txt', 'w')
    while True:
        episode_length += 1
        state = Variable(torch.FloatTensor(state))
        # print('state', state)
        logit, value = model(state.view(-1, 11, 8))
        prob = F.softmax(logit, dim=1)
        _, action = torch.max(prob, 1)
        state, reward, done, (action, vp_quality, ad_quality, out_quality, rebuf, cv, blank_ratio,
                              reward, real_vp_bitrate, smooth) = env.step(action.data.numpy()[0])
        update = True
        if update:
            print("Time {}, action {}, ({},{},{}), bitrate {:.3f}, rebuf {:.3f}, cv {:.3f}, smooth {:.3f}, reward {:.3f}, episode {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - state_time)),
                action, vp_quality, ad_quality, out_quality, real_vp_bitrate, rebuf, cv, smooth,
                reward, episode_length))
            log.write('action: ' + str(action) + ' (' + str(vp_quality) + ',' + str(ad_quality) + ',' + str(out_quality) +
                      ') rebuf: ' + str(rebuf) + ' cv: ' + str(cv) + ' bitrate: ' + str(real_vp_bitrate) +
                      ' smooth: ' + str(smooth) + ' reward: ' + str(reward) + ' episode: ' + str(episode_length) + '\n')
            # log.write(str())
            # print('Time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - state_time))))
            # print('time: ', time.gmtime(time.time() - state_time))
            # time.sleep(0.5)
        if done:
            state = env.reset()
        if episode_length == 50000:
            log.close()
            break
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            save(model, 'brain.pkl')
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0])
        reward_sum += reward
        if done:
            f = open("Statistics.txt", 'a')
            f.write(str(reward_sum) + " " + str(episode_length) + "\n")
            f.close()
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)  # 60 second break to let the other agents keep training
        state = torch.from_numpy(state)
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)
    net = ActorCritic(num_inputs, num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))
    net.to(device)
    net.eval()
    running_score = 0
    steps = 0
    for e in range(5):
        done = False
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        while not done:
            env.render()
            steps += 1
            policy, value = net(state)
            action = get_action(policy, num_actions)
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            score += reward
            state = next_state
        print('{} episode | score: {:.2f}'.format(e, score))
class Agent:
    def __init__(self):
        self.net = ActorCritic()
        self.net.load_state_dict(torch.load('models/good.pt', map_location='cpu'))
        self.net.eval()
        torch.no_grad().__enter__()  # disable gradient tracking

    def brain(self, reversi: Reversi, who: int) -> Coordinate:
        # assert reversi.next == who
        state = torch.Tensor(getBoardState(reversi)).unsqueeze(0)
        policy = self.net(state)[1][0]
        # keep only legal positions
        for y, x in itertools.product(range(SIZE), repeat=2):
            if not reversi.good[y][x]:
                policy[y * SIZE + x] = 0.
            else:
                policy[y * SIZE + x] += 1e-8  # prevent all probabilities from being 0
        action = policy.max(dim=-1).indices.item()
        return (action // SIZE, action % SIZE)
def test(args):
    args.device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
    scorer = Scorer(args)
    if args.use_tensorboard:
        args.runs_path = os.path.join(args.output_dir, 'runs_test')
        summary_writer = SummaryWriter(args.runs_path)
    model = ActorCritic(args).to(args.device)
    model.eval()
    for epoch_id in range(args.n_epochs):
        cur_reward, used_steps, not_finish, status = run_one_epoch(args, scorer, model)
        summary_writer.add_scalar('aver_reward', cur_reward, epoch_id)
        summary_writer.add_scalar('used_steps', used_steps, epoch_id)
        if (epoch_id + 1) % args.add_image_per_epoch == 0:
            (origin_img, cropped_bbox, score_diff) = status
            # import pdb; pdb.set_trace()
            (xmin, ymin, xmax, ymax) = cropped_bbox
            cropped_img = np.ones_like(origin_img) * 255
            cropped_img[ymin:ymax, xmin:xmax, :] = origin_img[ymin:ymax, xmin:xmax, :]
            # cropped_img = transform.resize(cropped_img, (origin_img.shape[0], origin_img.shape[1]))
            [origin_img, cropped_img] = map(lambda x: x.transpose((2, 0, 1)), [origin_img, cropped_img])
            # summary_writer.add_image('origin_img {}'.format(epoch_id), origin_img, epoch_id)
            # summary_writer.add_image('cropped_img {}'.format(epoch_id), cropped_img, epoch_id)
            stacked_img = torchvision.utils.make_grid(
                torch.from_numpy(np.stack((origin_img, cropped_img))), nrow=1, padding=2)
            summary_writer.add_image('origin_cropped {}'.format(epoch_id), stacked_img)
            summary_writer.add_scalar('score_diff {}'.format(epoch_id), score_diff)
        print("epoch : {:03f}, aver_reward: {:03f}, used_steps: {:03d}, not_finish: {:d}".format(
            epoch_id, cur_reward, used_steps, int(not_finish)))
def load_checkpoint(filepath):
    # checkpoint = torch.load(filepath)
    # model = checkpoint['model']
    # model.load_state_dict(checkpoint['state_dict'])
    # for parameter in model.parameters():
    #     parameter.requires_grad = False
    # model.eval()
    #####################
    model = ActorCritic(len(state), params.output_space)
    optimizer = my_optim.SharedAdam(model.parameters(), lr=params.lr)
    checkpoint = torch.load(params.file_path_shared_model)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    model.eval()

    model_test = ActorCritic(len(state), params.output_space)
    optimizer_test = my_optim.SharedAdam(model_test.parameters(), lr=params.lr)
    checkpoint = torch.load(params.file_path_shared_model_test)
    model_test.load_state_dict(checkpoint['state_dict'])
    optimizer_test.load_state_dict(checkpoint['optimizer'])
    model_test.eval()
    ###########################
    return model
def test(rank, args, shared_model, counter, loggers, kill):
    counter, steps, max_episodes = counter
    torch.manual_seed(args.seed + rank)
    env = create_vizdoom_env(args.config_path, args.test_scenario_path)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.spaces[0].shape[0], env.action_space, args.topology)
    model.eval()
    state = env.reset()
    reward_sum = 0
    done = True
    start_time = time.time()
    # a quick hack to prevent the agent from stucking
    hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)),
              (torch.zeros(1, 256), torch.zeros(1, 256)))
    actions = deque(maxlen=100)
    episode_length = 0
    episode_counter = 0
    obs_index = 0
    obs_history = []
    pose_history = []
    goal_loc = env.goal()
    model.load_state_dict(shared_model.state_dict())
    while not kill.is_set():
        if steps.value > args.max_episode_steps:
            break
        if episode_counter > max_episodes:
            break
        try:
            episode_start_time = time.time()
            episode_length += 1
            value, logit, _, _, hidden = model((state_to_torch(state), hidden))
            prob = F.softmax(logit)
            action = prob.max(1, keepdim=True)[1].data.numpy()
            for i in range(4):
                state, reward, done, _ = env.step(action[0, 0], steps=1)
                reward_sum += reward
                if done:
                    break
                else:
                    obs_frame = (np.moveaxis(state[0], 0, -1) * 255).astype(np.uint8)
                    if isinstance(obs_history, list):
                        obs_history.append(obs_frame)
                    else:
                        obs_history[obs_index, :, :, :] = obs_frame
                    obs_index += 1
                    pose_history.append(env.pose())
            # a quick hack to prevent the agent from stucking
            # actions.append(action[0, 0])
            # if actions.count(actions[0]) == actions.maxlen:
            #     done = True
            if done:
                if isinstance(obs_history, list):
                    obs_history = np.array(obs_history)
                if loggers:
                    loggers['test_reward'](env.game.get_total_reward(), episode_counter)
                    loggers['video'](video(env.wad, env.current_map, goal_loc, obs_history, pose_history),
                                     episode_counter)
                    loggers['test_time'](time.time() - episode_start_time, episode_counter)
                print("Time {}, num episodes {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                    counter.value, counter.value / (time.time() - start_time),
                    reward_sum, episode_length))
                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()
                obs_index = 0
                pose_history = []
                goal_loc = env.goal()
                hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)),
                          (torch.zeros(1, 256), torch.zeros(1, 256)))
                time.sleep(args.eval_interval)
                model.load_state_dict(shared_model.state_dict())
                episode_counter += 1
        except Exception as err:
            kill.set()
            raise err
def test(name, backend, env_name, rank, args, shared_model, counter, docker, train_mode=True):
    torch.manual_seed(args.seed + rank)
    if backend == 'unity3d':
        if docker:
            os.chdir('/mnt/code/')
        env = create_unity3d_env(train_mode=train_mode,
                                 file_name=env_name,
                                 worker_id=rank, seed=args.seed,
                                 docker_training=docker)
    elif backend == 'gym':
        env = create_atari_env(env_name)
        env.seed(args.seed + rank)
    else:
        print(f' [!]: {backend} is not a valid backend')
        raise ValueError
    print(env.action_space)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state).float()
    reward_sum = 0
    done = True
    start_time = time.time()
    # a quick hack to prevent the agent from stucking
    history = {'num-steps': [], 'times': [], 'rewards': [], 'episode-length': []}
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1, keepdim=True)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        # a quick hack to prevent the agent from stucking
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            end = time.time() - start_time
            history['num-steps'].append(counter.value)
            history['times'].append(end)
            history['rewards'].append(reward_sum)
            history['episode-length'].append(episode_length)
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(end)),
                counter.value, counter.value / (end),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            if train_mode:
                history['weights'] = shared_model.state_dict()
                torch.save(history, f'{name}-history.t7')
            time.sleep(60)
        state = torch.from_numpy(state).float()
    env.close()
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    # a quick hack to prevent the agent from stucking
    # actions = deque(maxlen=100)
    episode_length = 0
    while True:
        env.render()
        print('here')
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        print('there')
        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        print('hi')
        prob = F.softmax(logit)
        # print(prob)
        action = prob.max(1, keepdim=True)[1].data.numpy()
        print(action)
        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        # a quick hack to prevent the agent from stucking
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True
        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            # actions.clear()  # the actions deque is commented out above; clearing it would raise NameError
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
def test(rank, args, shared_model, counter, logger):
    console_f = logger.init_console_log_file()
    torch.manual_seed(args.seed + rank)
    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    max_score = 0
    start_time = time.time()
    while True:
        if args.max_counter_num != 0 and counter.value > args.max_counter_num:
            if args.save_policy_models:
                logger.save_policy_model(shared_model, counter.value + 1)
            exit(0)
        # monitor counter value
        if counter.value % args.testing_every_counter > 1:
            continue
        counter_value = counter.value
        model.load_state_dict(shared_model.state_dict())
        if args.save_policy_models:
            if counter_value % args.save_policy_models_every <= 5:
                logger.save_policy_model(shared_model, counter_value)
        state = env.reset()
        state = torch.from_numpy(state)
        reward_sum = 0
        done = True
        # a quick hack to prevent the agent from stucking
        # actions = deque(maxlen=100)
        # actions = deque(maxlen=500)
        actions = deque(maxlen=1000)
        episode_length = 0
        episode_count = 0
        episode_rewards_sum = 0
        episode_length_sum = 0
        while True:
            episode_length += 1
            # Sync with the shared model
            with torch.no_grad():
                if done:
                    cx = Variable(torch.zeros(1, 256))
                    hx = Variable(torch.zeros(1, 256))
                else:
                    cx = Variable(cx.data)
                    hx = Variable(hx.data)
                value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit, dim=1)
            action = prob.max(1, keepdim=True)[1].data.numpy()
            state, reward, done, _ = env.step(action[0, 0])
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            # a quick hack to prevent the agent from stucking
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True
            if done:
                episode_count += 1
                episode_rewards_sum += reward_sum
                episode_length_sum += episode_length
                if episode_count == args.testing_episodes_num:
                    print("Time {}, num steps {}, FPS {:.0f}, avg episode reward {}, avg episode length {}".format(
                        time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                        counter_value, counter_value / (time.time() - start_time),
                        episode_rewards_sum / args.testing_episodes_num,
                        episode_length_sum / args.testing_episodes_num))
                    logger.write_results_log(console_f,
                                             time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                                             counter_value,
                                             counter_value / (time.time() - start_time),
                                             episode_rewards_sum / args.testing_episodes_num,
                                             episode_length_sum / args.testing_episodes_num)
                    if args.save_max and (episode_rewards_sum / args.testing_episodes_num) >= max_score:
                        max_score = episode_rewards_sum / args.testing_episodes_num
                        logger.save_policy_model(shared_model, count="max_reward")
                    break
                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()
            state = torch.from_numpy(state)
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)
    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space.n, args.lstm_size)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    # actions = deque(maxlen=100)
    episode_length = 0
    currentPath = os.getcwd()
    File = open(currentPath + '/record.txt', 'a+')
    print("\n\n\n\n------------------------------\n\n\n\n\n")
    File.write("\n\n\n\n------------------------------\n\n\n\n\n")
    File.close()
    cnt = 0
    episode_number = 0
    while True:
        env.render()
        cnt = cnt + 1
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
            cx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
        else:
            hx = Variable(hx.data, volatile=True)
            cx = Variable(cx.data, volatile=True)
        # print(state)
        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        # action = prob.max(1)[1].data.numpy()
        action = prob.multinomial().data
        # if args.env_name == 'Breakout-v3':
        #     state, reward, done, _ = env.step(1)
        #     reward_sum += reward
        # state, reward, done, _ = env.step(action[0, 0])
        state, reward, done, _ = env.step(action.numpy())
        done = done  # or episode_length >= args.max_episode_length
        if episode_length >= args.max_episode_length:
            done = True
            reward_sum -= 30
        reward_sum += reward
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True
        # if reward != 0:
        #     print(("ep %d : game finished, reward: %d " % (episode_number, reward)) + ('' if reward == -1 else ' !!!!!!!!'))
        if done:
            hour = int(time.strftime("%H", time.gmtime(time.time() - start_time)))
            _min = int(time.strftime("%M", time.gmtime(time.time() - start_time)))
            print("Time {}, episode reward {}, episode length {}".format(
                hour * 60 + _min + args.starttime, reward_sum, episode_length))
            File = open(currentPath + '/record.txt', 'a+')
            File.write("Time {}, episode reward {}, episode length {}\n".format(
                hour * 60 + _min + args.starttime, reward_sum, episode_length))
            File.close()
            reward_sum = 0
            episode_length = 0
            # actions.clear()
            state = env.reset()
            torch.save(model.state_dict(), currentPath + '/A3C.t7')
            episode_number += 1
            time.sleep(60)
        state = torch.from_numpy(state)
def test(rank, args, shared_model, gl_step_cnt):
    torch.manual_seed(args.seed + rank)
    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    local_episode_num = 0
    # a quick hack to prevent the agent from stucking
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        # a quick hack to prevent the agent from stucking
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            passed_time = time.time() - start_time
            local_episode_num += 1
            global_step_count = gl_step_cnt.get_value()
            logger.info("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(passed_time)),
                reward_sum, episode_length))
            tb.log_value('steps_second', global_step_count / passed_time, global_step_count)
            tb.log_value('reward', reward_sum, global_step_count)
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    env = gym.make(args.env_name)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[2], env.action_space)
    model.eval()
    state = env.reset()
    state = np.transpose(state, (2, 0, 1))
    state = np.ascontiguousarray(state, dtype=np.float32) / 255
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    # a quick hack to prevent the agent from stucking
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()
        with torch.no_grad():
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()
        state, reward, done, _ = env.step(action[0, 0])
        state = np.transpose(state, (2, 0, 1))
        state = np.ascontiguousarray(state, dtype=np.float32) / 255
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        # a quick hack to prevent the agent from stucking
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            state = np.transpose(state, (2, 0, 1))
            state = np.ascontiguousarray(state, dtype=np.float32) / 255
            time.sleep(60)
        state = torch.from_numpy(state)
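# The transpose-and-normalize preprocessing in the function above is repeated three
# times; a small helper like the following (a sketch, not part of the original) keeps
# the loop body shorter. It only restates the conversion already shown above.
import numpy as np
import torch

def preprocess(frame):
    """Convert an HWC uint8 gym frame to a CHW float32 tensor in [0, 1]."""
    frame = np.transpose(frame, (2, 0, 1))                       # HWC -> CHW
    frame = np.ascontiguousarray(frame, dtype=np.float32) / 255  # scale to [0, 1]
    return torch.from_numpy(frame)

# usage inside the loop above:
#   state = preprocess(env.reset())
#   ...
#   obs, reward, done, _ = env.step(action[0, 0])
#   state = preprocess(obs)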
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)
    env = WrapEnv(args.env_name)
    model = ActorCritic(4, env.num_actions, args.num_skips)
    model.eval()
    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    action_stat = [0] * (model.n_real_acts + model.n_aux_acts)
    start_time = time.time()
    episode_length = 0
    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            if not os.path.exists('model-a3c-aux'):
                os.makedirs('model-a3c-aux')
            torch.save(shared_model.state_dict(), 'model-a3c-aux/model-{}.pth'.format(args.model_name))
            print('saved model')
        value, logit = model(Variable(state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()
        action_np = action[0, 0]
        action_stat[action_np] += 1
        if action_np < model.n_real_acts:
            state_new, reward, done, info = env.step(action_np)
            if args.testing:
                print('episode', episode_length, 'normal action', action_np, 'lives', info['ale.lives'])
                env.render()
            state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            episode_length += 1
        else:
            state = state.numpy()
            for _ in range(action_np - model.n_real_acts + 2):
                state_new, rew, done, info = env.step(0)  # instead of random perform NOOP=0
                if args.testing:
                    print('episode', episode_length, 'no_op action', action_np, 'lives', info['ale.lives'])
                    # env.render()
                state = np.append(state[1:, :, :], state_new, axis=0)
                done = done or episode_length >= args.max_episode_length
                reward_sum += rew
                episode_length += 1
                if done:
                    break
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            print("actions stats real {}, aux {}".format(
                action_stat[:model.n_real_acts], action_stat[model.n_real_acts:]))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            state = np.concatenate([state] * 4, axis=0)
            action_stat = [0] * (model.n_real_acts + model.n_aux_acts)
            if not args.testing:
                time.sleep(60)
        state = torch.from_numpy(state)
def test(args, shared_model):
    action_map = _set_action_map()
    env = FixedEnvWrap()
    # time.sleep(10)
    model = ActorCritic()
    model.load_state_dict(shared_model.state_dict())
    model.eval()
    state = env.reset()
    training_time = 0
    vis = visdom.Visdom(env='final')
    line_plot = vis.line(Y=np.array([0]),
                         opts=dict(xlabel='testing count',
                                   ylabel='average reward',
                                   title='ali-v1'))
    start = time.time()
    vis_count = 0
    while True:
        video_count = 1
        reward_all_sum = 0
        reward_all = 0
        reward_all_ave = 0
        reward_gop = 0
        action = 3
        last_action = 3
        # update model before testing all trace files
        # time.sleep(5)
        print('load updated model')
        model.load_state_dict(shared_model.state_dict())
        while True:
            # get the reward for one gop
            while True:
                _, done, decision_flag = env.step_gop(action)
                if decision_flag or done:
                    reward_gop = env.get_reward_gop()
                    state = env.get_state_gop()
                    break
                else:
                    continue
            # print('testing')
            # get action from model
            last_action = action
            with torch.no_grad():
                state = torch.FloatTensor(state)
                logit, _ = model(state.view(-1, args.s_gop_info, args.s_gop_len))
                prob = F.softmax(logit, dim=1)
                _, action = torch.max(prob, 1)
                action = action.data.numpy()[0]
            bitrate, target_buffer = action_map[last_action]
            # print('bitrate: %d, target_buffer: %d, reward is %s' % (bitrate, target_buffer, reward_gop))
            if done:
                print("video count %d, reward is %.5f" % (video_count, reward_all))
                # reward_all_sum += reward_all / 100
                reward_all_sum += reward_all
                video_count += 1
                if reward_all < 0:
                    print('bad model ! just break this loop')
                    reward_all_ave = 0
                    break
                if video_count > env.traces_len * 2:
                    reward_all_ave = reward_all_sum / video_count
                    break
                action = 3
                last_action = 3
                reward_all = 0
            reward_all += reward_gop
        # update the figure of average reward of all testing files
        vis_count += 1
        reward_all_ave = max(reward_all_ave, 0)
        vis.line(Y=np.array([reward_all_ave]), X=np.array([vis_count]),
                 win=line_plot, update='append')
        path = 'ali-v1/actor.pt-' + str(vis_count)
        torch.save(model.state_dict(), path)
        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
        print("average reward of traces are: ", reward_all_ave)
        print('saved one model in epoch:', vis_count)
def test(rank, args, T, shared_model):
    torch.manual_seed(args.seed + rank)
    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
    model.eval()

    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter
            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        hx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
                        cx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        done, episode_length = False, 0
                        reward_sum = 0
                    # Optionally render validation states
                    if args.render:
                        env.render()
                    # Calculate policy
                    policy, _, _, (hx, cx) = model(Variable(state, volatile=True),
                                                   (hx.detach(), cx.detach()))  # Break graph for memory efficiency
                    # Choose action greedily
                    action = policy.max(1)[1].data[0, 0]
                    # Step
                    state, reward, done, _ = env.step(action)
                    state = state_to_tensor(state)
                    reward_sum += reward
                    done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                    episode_length += 1  # Increase episode counter
                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break
            print(('[{}] Step: {:<' + l + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
                t_start,
                sum(avg_rewards) / args.evaluation_episodes,
                sum(avg_episode_lengths) / args.evaluation_episodes))
            if args.evaluate:
                return
            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards)  # Plot rewards
            torch.save(model.state_dict(), 'model.pth')  # Save model params
            can_test = False  # Finish testing
        else:
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True
        time.sleep(0.001)  # Check if available to test every millisecond
    env.close()
def test(rank, args, T, shared_model):
    torch.manual_seed(args.seed + rank)
    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
    model.eval()

    save_dir = os.path.join('results', args.name)

    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    # stores step, reward, avg_steps and time
    results_dict = {'t': [], 'reward': [], 'avg_steps': [], 'time': []}

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter
            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        hx = torch.zeros(1, args.hidden_size)
                        cx = torch.zeros(1, args.hidden_size)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        done, episode_length = False, 0
                        reward_sum = 0
                    # Optionally render validation states
                    if args.render:
                        env.render()
                    # Calculate policy
                    with torch.no_grad():
                        policy, _, _, (hx, cx), _ = model(state, (hx, cx))
                    # Choose action greedily
                    action = policy.max(1)[1][0]
                    # Step
                    state, reward, done, _ = env.step(action.item())
                    state = state_to_tensor(state)
                    reward_sum += reward
                    done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                    episode_length += 1  # Increase episode counter
                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break
            print(('[{}] Step: {:<' + l + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
                t_start,
                sum(avg_rewards) / args.evaluation_episodes,
                sum(avg_episode_lengths) / args.evaluation_episodes))
            fields = [t_start,
                      sum(avg_rewards) / args.evaluation_episodes,
                      sum(avg_episode_lengths) / args.evaluation_episodes,
                      str(datetime.now())]
            # storing data in the dictionary
            results_dict['t'].append(t_start)
            results_dict['reward'].append(sum(avg_rewards) / args.evaluation_episodes)
            results_dict['avg_steps'].append(sum(avg_episode_lengths) / args.evaluation_episodes)
            results_dict['time'].append(str(datetime.now()))
            # Dumping the results in pickle format
            with open(os.path.join(save_dir, 'results.pck'), 'wb') as f:
                pickle.dump(results_dict, f)
            # Saving the data in csv format
            with open(os.path.join(save_dir, 'results.csv'), 'a') as f:
                writer = csv.writer(f)
                writer.writerow(fields)
            if args.evaluate:
                return
            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards, save_dir)  # Plot rewards
            torch.save(model.state_dict(), os.path.join(save_dir, 'model.pth'))  # Save model params
            # torch.save(model.state_dict(), os.path.join(save_dir, 'model_{}.pth'.format(t_start)))  # Save model params
            can_test = False  # Finish testing
        else:
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True
        time.sleep(0.001)  # Check if available to test every millisecond

    # Dumping the results in pickle format
    with open(os.path.join(save_dir, 'results.pck'), 'wb') as f:
        pickle.dump(results_dict, f)

    env.close()
def main():
    # pick the device for the network
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # build the network
    net = ActorCritic()
    net = net.to(device)
    # set up the optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=3e-4)
    # set up the environments
    envs = Envs(NUM_WORKERS, gamma=GAMMA)
    # start training
    for episode in range(EPISODES):
        # collect one episode of data from several environments
        net.eval()
        with torch.no_grad():
            states = envs.reset()
            done = False
            while not done:
                states = states.to(device)
                _, policys = net(states)
                policys = policys.cpu()  # easier to post-process on the CPU
                # zero out the probability of illegal positions
                for i in range(NUM_WORKERS):
                    if envs.reversis[i].next != 0:
                        for y, x in itertools.product(range(SIZE), repeat=2):
                            if not envs.reversis[i].good[y][x]:
                                policys[i][y * SIZE + x] = 0.
                            else:
                                policys[i][y * SIZE + x] += 1e-8  # prevent all probabilities from being 0
                actions = Categorical(probs=policys).sample()
                done, states = envs.step(actions)
        envs.setReturn()
        data = EpisodeData(envs.readHistory())
        loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
        # train the network
        net.train()
        # running metrics
        value_loss_total = 0.
        entropy_total = 0.
        for states, actions, Returns in loader:
            states, actions, Returns = states.to(device), actions.to(device), Returns.to(device)
            values, policys = net(states)
            dist = Categorical(probs=policys)
            action_log_probs = dist.log_prob(actions).view(-1, 1)
            dist_entropy = dist.entropy().mean()  # a larger entropy keeps the model exploring
            advantages = Returns.view(-1, 1) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(advantages.detach() * action_log_probs).mean()
            optimizer.zero_grad()
            (VALUE_LOSS_COEF * value_loss + action_loss - ENTROPY_LOSS_COEF * dist_entropy).backward()
            optimizer.step()
            value_loss_total += value_loss.item()
            entropy_total += dist_entropy.item()
        print('Episode: {:>10d}, Value Loss: {:g}, Entropy: {:g}'.format(
            episode, value_loss_total / len(loader), entropy_total / len(loader)), flush=True)
        if episode != 0 and episode % SAVE_INTERVAL == 0:
            if not os.path.isdir('models'):
                os.mkdir('models')
            torch.save(net.state_dict(), 'models/{}.pt'.format(episode // SAVE_INTERVAL))
def do_test(self, rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    if args.run_name is None:
        rn = None
    else:
        rn = 'runs/' + args.run_name
    writer = SummaryWriter(log_dir=rn, flush_secs=60)
    cnt = 0
    env = grid2op.make(args.env_name, test=args.for_test, reward_class=L2RPNReward)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.size(), self.action_space, args.hidden_size)
    model.eval()
    state = self.convert_obs(env.reset())
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    # a quick hack to prevent the agent from stucking
    # actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, args.hidden_size)
            hx = torch.zeros(1, args.hidden_size)
        else:
            cx = cx.detach()
            hx = hx.detach()
        with torch.no_grad():
            _, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()
        state, reward, done, _ = env.step(self.convert_act(action[0, 0]))
        state = self.convert_obs(state)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        # a quick hack to prevent the agent from stucking
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True
        if done:
            print("Time {}, num steps {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                counter.value, reward_sum, episode_length), flush=True)
            writer.add_scalar('Main/Reward', reward_sum, cnt)
            writer.add_scalar('Main/Episode Length', episode_length, cnt)
            writer.add_scalar('Stats/Global steps', counter.value, cnt)
            cnt += 1
            reward_sum = 0
            episode_length = 0
            # actions.clear()
            state = self.convert_obs(env.reset())
            time.sleep(args.test_interval)
        state = torch.from_numpy(state)
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)
    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    if not os.path.exists('models-a3c'):
        os.makedirs('models-a3c')
    path = 'models-a3c/model-{}.pth'.format(args.model_name)
    print('saving directory is', path)
    model = ActorCritic(env.action_space.n, args.num_atoms, args.gamma)
    model.eval()
    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    action_stat = [0] * model.num_outputs
    start_time = time.time()
    episode_length = 0
    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            torch.save(shared_model.state_dict(), path)
            print('saved model')
        atoms_logit, logit = model(Variable(state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()
        action_np = action[0, 0]
        action_stat[action_np] += 1
        state_new, reward, done, info = env.step(action_np)
        dead = is_dead(info)
        if args.testing:
            atoms_prob = F.softmax(atoms_logit)
            value = model.get_v(atoms_prob, batch=False)
            atoms_prob = atoms_prob.squeeze().data.numpy()
            print('episode', episode_length, 'normal action', action_np, 'lives', info['ale.lives'], 'value', value)
            env.render()
            if ep_counter % 100 == 0:
                plt.plot(model.z, atoms_prob)
                plt.title('average v is {}'.format(value))
                plt.show()
        state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        episode_length += 1
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            print("actions stats real {}".format(action_stat[:model.num_outputs]))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            env.seed(args.seed + rank + (args.num_processes + 1) * ep_counter)
            state = np.concatenate([state] * 4, axis=0)
            action_stat = [0] * model.num_outputs
            if not args.testing:
                time.sleep(60)
        state = torch.from_numpy(state)
def test(shared_model, render=0):
    env = create_atari_env(args.rom)
    if render == 1:
        env.render()
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    # a quick hack to prevent the agent from stucking
    episode_length = 0
    cx = hx = None
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, logit, (hx, cx) = model(
            (Variable(state.unsqueeze(0).type(FloatTensor), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        # print logit.data.numpy()
        action = prob.max(1, keepdim=True)[1].data.cpu().numpy()
        state, reward, done, _ = env.step(action[0, 0])
        if render:
            # env.render()
            # Spits out images in the selected path
            img = env.render('rgb_array')
            imsave('/opt/tmp/img/pac-20000/frame_{:06d}.png'.format(episode_length), img)
            """ TEST-DEMO-ONLY
            state_im = state.numpy()
            state_im.transpose()
            scipy.misc.imageio.saveim(state_im, filename-with-time-step-number)
            # ffmpeg
            END-WORKZONE
            """
        done = done or episode_length >= 10000
        reward_sum += reward
        # a quick hack to prevent the agent from stucking
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)