def main(): """Main""" env_id = 'SpaceInvaders-v0' weight_fname = '/home/matthieu/temp/test.h5' env = ProcessedEnvironnement( env_id, outdir='/home/matthieu/temp/random-agent-results', wrappers_cond=True) env.seed(0) network = ConvNet(input_shape=(84, 84, 1), nbr_action=env.action_space.n, weight_fname=weight_fname) agent = DQNAgent(action_space=env.action_space, network=network, obs_shape=(84, 84, 1), buffer_size=6, decay=0.0, epsilon=0.9) episode_count = 1 reward = 0 action_repetition_rate = 4 action = 0 for i in range(episode_count): ob = env.reset() done = True counter = 0 while True: if counter % action_repetition_rate == 0: action = agent.act(ob, reward, done) print(action) ob, reward, done, _ = env.step(action) counter += 1 if done: break # Close the env and write monitor result info to disk env.close()
        gamma=hyper_params['discount_factor'])

    eps_timesteps = hyper_params['eps_fraction'] * float(
        hyper_params['num_steps'])
    episode_rewards = [0.0]
    loss = [0.0]
    policy_actions = unpickle_object('action_map')

    state = env.reset()
    for t in range(hyper_params['num_steps']):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params['eps_start'] + fraction * (
            hyper_params['eps_end'] - hyper_params['eps_start'])
        sample = random.random()
        if sample > eps_threshold:
            action = agent.act(np.array(state))
        else:
            action = np.random.randint(0, 4)
        env_action = policy_actions[action]

        next_state, reward, done, _ = env.step(env_action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)

        if t > hyper_params[
                'learning_starts'] and t % hyper_params['learning_freq'] == 0:
def main():
    config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0,
              'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0, 'allowed-floors': 0,
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1,
                           retro=True, realtime_mode=False, config=config)
    print(env.observation_space)
    print(env.action_space)

    hyper_params = {
        "seed": 6,  # which seed to use
        "replay-buffer-size": int(5e3),  # replay buffer size
        "learning-rate": 1e-4,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": int(1e6),  # total number of steps to run the environment for
        "batch-size": 32,  # number of transitions to optimize at the same time
        "learning-starts": 5000,  # number of steps before learning starts
        "learning-freq": 1,  # number of iterations between every optimization step
        "use-double-dqn": True,  # use double deep Q-learning
        "target-update-freq": 1000,  # number of iterations between every target network update
        "eps-start": 1.0,  # e-greedy start threshold
        "eps-end": 0.01,  # e-greedy end threshold
        "eps-fraction": 0.05,  # fraction of num-steps over which epsilon is annealed
        "print-freq": 10
    }

    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])

    # assert "NoFrameskip" in hyper_params["env"], "Require environment with no frameskip"
    # env = gym.make(hyper_params["env"])
    env.seed(hyper_params["seed"])

    # env = NoopResetEnv(env, noop_max=30)
    # env = MaxAndSkipEnv(env, skip=4)
    # env = EpisodicLifeEnv(env)
    # env = FireResetEnv(env)
    # env = WarpFrame(env)
    env = PyTorchFrame(env)
    # env = ClipRewardEnv(env)
    # env = FrameStack(env, 4)

    replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=hyper_params["use-double-dqn"],
        lr=hyper_params["learning-rate"],
        batch_size=hyper_params["batch-size"],
        gamma=hyper_params["discount-factor"]
    )

    # Resume from a previously saved policy checkpoint.
    model_num = 500
    agent.policy_network.load_state_dict(
        torch.load('./Models/' + str(model_num) + '_policy.pt',
                   map_location=torch.device(device)))

    eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
    episode_rewards = [0.0]
    ep_nums = model_num

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * (
            hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()

        # Epsilon-greedy step:
        # select a random action if sample is less than or equal to eps_threshold,
        # take a step in the env,
        # add (state, action, reward, next_state, float(done)) to the replay memory,
        # and add the reward to the running episode reward.
        if sample > eps_threshold:
            action = agent.act(np.array(state))
        else:
            action = env.action_space.sample()

        next_state, reward, done, _ = env.step(action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)
            ep_nums += 1
            if ep_nums % 50 == 0:
                agent.save_models(ep_nums)
                plot(episode_rewards, ep_nums)

        if t > hyper_params["learning-starts"] and t % hyper_params["learning-freq"] == 0:
            agent.optimise_td_loss()

        if t > hyper_params["learning-starts"] and t % hyper_params["target-update-freq"] == 0:
            agent.update_target_network()

        num_episodes = len(episode_rewards)

        if done and hyper_params["print-freq"] is not None and \
                len(episode_rewards) % hyper_params["print-freq"] == 0:
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            print("********************************************************")
            print("steps: {}".format(t))
            print("episodes: {}".format(num_episodes))
            print("mean 100 episode reward: {}".format(mean_100ep_reward))
            print("% time spent exploring: {}".format(int(100 * eps_threshold)))
            print("********************************************************")

        # if done and ep_nums % 10 == 0:
        #     animate(env, agent, "anim/progress_" + str(ep_nums))
        #     state = env.reset()

    animate(env, agent, "anim/final")
    env.close()
        torch.load(args.load_checkpoint_file))

    eps_timesteps = hyper_params["eps-fraction"] * \
        float(hyper_params["num-steps"])
    episode_rewards = [0.0]

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * \
            (hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()

        if sample > eps_threshold:
            # Exploit
            action = agent.act(state)
        else:
            # Explore
            action = env.action_space.sample()

        next_state, reward, done, info = env.step(action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)

        if t > hyper_params[
                "learning-starts"] and t % hyper_params["learning-freq"] == 0:
        batch_size=hyper_params['batch-size'],
        gamma=hyper_params['discount-factor'])

    eps_timesteps = hyper_params["eps-fraction"] * float(
        hyper_params["num-steps"])
    episode_rewards = [0.0]

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * (
            hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()

        # Epsilon-greedy action selection
        if sample > eps_threshold:
            action = agent.act(np.array(state))
        else:
            action = env.action_space.sample()

        next_state, reward, done, _ = env.step(action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)

        if t > hyper_params[
                "learning-starts"] and t % hyper_params["learning-freq"] == 0:
            agent.optimise_td_loss()
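The same linear annealing arithmetic appears in each of the training loops above. Pulled out as a standalone helper, it reduces to the following sketch; the function name and default values are illustrative, not part of the original scripts.

def linear_epsilon(t, eps_start=1.0, eps_end=0.01, eps_timesteps=50000):
    """Linearly anneal epsilon from eps_start to eps_end over eps_timesteps steps."""
    fraction = min(1.0, float(t) / eps_timesteps)
    return eps_start + fraction * (eps_end - eps_start)


# Example: with the hyper-parameters above (eps-fraction = 0.05 of 1e6 steps),
# eps_timesteps = 50000, so epsilon decays from 1.0 to 0.01 over the first
# 50000 steps and then stays at 0.01 for the rest of training.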