def main():

    setup_utils.setup_and_load(use_cmd_line_args=False)

    # Make the base environments that we will train the agent on first. This makes 1 gym environment,
    # but after each epoch a different environment will be chosen; currently we only use 1 environment
    # because it works better with the DQN algorithm.
    base_env = make('standard', num_envs=1)
    base_env = CoinRunVecEnvWrapper(base_env)

    #base_env = wrappers.add_final_wrappers(base_env)
    # Make the environment that we will attempt to transfer to
    transfer_enviroment = make('standard', num_envs=1)
    transfer_enviroment = CoinRunVecEnvWrapper(transfer_enviroment)

    t = int(5e3)
    with tf.Session():
        model = make_model()

        print("-----\ntraining base model on training enviroment\n-----")
        base_statistics = run_deepq(model if model else 'cnn',
                                    base_env,
                                    total_timesteps=t,
                                    name="base")

        print('mean reward: ', np.mean(np.array(base_statistics['rewards'])))

        print("-----\ntraining transfer model on test enviroment\n-----")
        transfer_statistics = run_deepq(model if model else 'cnn',
                                        transfer_enviroment,
                                        total_timesteps=t,
                                        name="transfer")
        print('mean reward: ',
              np.mean(np.array(transfer_statistics['rewards'])))

        model = make_model()
        print("-----\ntraining non-transfer model on test enviroment\n-----")
        transfer_enviroment_base_model_statistics = run_deepq(
            model if model else 'cnn',
            transfer_enviroment,
            total_timesteps=t,
            name="transfer")
        print(
            'mean reward: ',
            np.mean(
                np.array(
                    transfer_enviroment_base_model_statistics['rewards'])))
        plot_stats(base_statistics, transfer_statistics,
                   transfer_enviroment_base_model_statistics)
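
A minimal entry-point sketch, assuming this experiment lives in a standalone script:

if __name__ == '__main__':
    main()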
Example #2
def Train():
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               set_seed=3,
                               num_levels=1,
                               use_black_white=True,
                               frame_stack=4)
    # env=make("platform",num_envs=8)
    env = make("platform", num_envs=8)
    env = CourierWrapper(env, True)
    env = MyReward(env)
    # env = VecMonitor(env)
    learning_rate = 3e-4
    clip_range = 0.2
    n_timesteps = int(1e8)
    hyperparmas = {
        'nsteps': 256,
        'noptepochs': 4,
        'nminibatches': 8,
        'lr': learning_rate,
        'cliprange': clip_range,
        'vf_coef': 0.5,
        'ent_coef': 0.01
    }

    act = ppo2.learn(
        network=MyPolicy,
        env=env,
        total_timesteps=n_timesteps,
        **hyperparmas,
        save_interval=100,
        log_interval=20,

        # value_network="copy"
    )
Example #3
def test_coinrun():
    setup_utils.setup_and_load(use_cmd_line_args=False)
    env = make('standard', num_envs=16)
    for _ in range(100):
        acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
        _obs, _rews, _dones, _infos = env.step(acts)
    env.close()
Example #4
def random_agent(num_envs=1, max_steps=100000):
    setup_utils.setup_and_load(use_cmd_line_args=False)
    env = make('standard', num_envs=num_envs)
    for step in range(max_steps):
        acts = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        _obs, rews, _dones, _infos = env.step(acts)
        print("step", step, "rews", rews)
    env.close()
Example #5
def test(config, agent=None, levels=5):
    """Test routine"""

    env = utils.Scalarize(make('standard', num_envs=1))

    if agent is None:
        print("Testing numlvl {} seed {} file: {}".format(
            conrun_config.NUM_LEVELS, conrun_config.SET_SEED,
            config.model_filename))
        agent = DQN(env.observation_space.shape, env.action_space.n)
        if config.enable_gpu and torch.cuda.is_available():
            agent = agent.cuda()
        bestmodel_file = os.path.join(config.save_dir, config.model_filename)
        load_res = torch.load(bestmodel_file, map_location="cpu")
        agent.load_state_dict(load_res["model"])
    else:
        config.render_play = False

    agent.eval()
    success = 0
    total_steps = 0
    for i in range(levels):
        state = env.reset()
        ep_reward = 0
        ep_length = 0
        while True:
            if config.render_play:
                env.render()
            state = torch.unsqueeze(torch.FloatTensor(state), 0)
            action = torch.max(agent.forward(state),
                               1)[1].data.numpy()[0]  # TODO debug this

            next_state, reward, done, info = env.step(action)

            ep_length += 1
            ep_reward += reward

            state = copy.copy(next_state)

            if done:
                print(
                    "test episode: {}, episode reward: {}, length: {}".format(
                        i, ep_reward, ep_length))
                break

        if ep_reward > 0:
            success = success + 1
        total_steps += ep_length

    print("Testing result : {} % completed. Avg. ep length : {}".format(
        success / levels * 100, total_steps / levels))
    env.close()
    if success >= (levels / 2):
        return True
    return False
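
A minimal driver sketch for this routine, assuming a config object exposing the attributes referenced above (save_dir, model_filename, enable_gpu, render_play) and a saved checkpoint on disk:

from types import SimpleNamespace

# Hypothetical config; the attribute names are taken from the references inside test().
config = SimpleNamespace(save_dir="./checkpoints",
                         model_filename="best_model.pt",
                         enable_gpu=False,
                         render_play=False)

# setup_and_load must run before make() so the CoinRun global config is initialized.
setup_utils.setup_and_load(use_cmd_line_args=False)
passed = test(config, agent=None, levels=5)
print("solved at least half of the levels" if passed else "did not solve half of the levels")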
Example #6
def multi_setup(rank, world_size, destination):

    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

    setup()
    env = make("standard", num_envs=ExpConfig.NUM_ENVS)
    env = add_final_wrappers(env)

    learn(rank, destination, env)

    cleanup()
Example #7
    def __init__(self):
        self.AE = AutoEncoder(args,
                              latent_dim=args.latent_dim).double().to(device)
        self.AE.train()
        self.counter = 0
        self.buffer = np.empty(args.buffer_capacity, dtype=transition)
        setup_utils.setup_and_load(use_cmd_line_args=False)
        self.env = make('standard', num_envs=args.num_envs)
        self.optimizer = optim.Adam(self.AE.parameters(), lr=args.lr)
        self.criterion = nn.MSELoss()
        self.step = 0
Example #8
def testing():
    setup_utils.setup_and_load()
    episodes = 10
    env = Scalarize(make('standard', num_envs=1))
    for i in range(episodes):
        env.reset()
        while True:
            env.render()
            action = np.random.randint(0, env.action_space.n)
            next_state, reward, done, info = env.step(action)
            if done or reward > 0:
                break
Example #9
def create_coinrun_env(num_levels, task_id, random_seed_list):
    # setup_utils.setup_and_load(use_cmd_line_args=False, is_high_res=True, num_levels=num_levels, set_seed=seed)
    try:
        random_seed = random_seed_list[task_id]
    except Exception:
        random_seed = 123
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=random_seed)
    env = make('standard', num_envs=1)
    return env
Example #10
def make_coinrun():
    from coinrun import setup_utils, make
    from coinrun_wrapper import CourierWrapper, MyReward
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               set_seed=3,
                               num_levels=1,
                               use_black_white=True,
                               frame_stack=4)
    # env=make("platform",num_envs=8)
    env = make("platform", num_envs=256)
    env = CourierWrapper(env, False)
    env = MyReward(env)
    return env
Example #11
def random_agent(num_envs=1, max_steps=100000):
    setup_utils.setup_and_load(use_cmd_line_args=True)
    print(Config.IS_HIGH_RES)
    env = make('standard', num_envs=num_envs)
    env.render()
    viewer = rendering.SimpleImageViewer()
    for step in range(max_steps):
        acts = np.array(
            [env.action_space.sample() for _ in range(env.num_envs)])
        _obs, rews, _dones, _infos = env.step(acts)
        print("step", step, "rews", rews)
        env.render()
    env.close()
Example #12
def random_agent(num_envs=1, max_steps=100000):
    # Random environment:
    # setup_utils.setup_and_load(use_cmd_line_args=False)
    # Just test in level 1 with the config: --run-id myrun --num-levels 1
    setup_utils.setup_and_load()
    env = make('standard', num_envs=num_envs)
    imgNum = 0
    for step in range(100000):
        env.render()
        #acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])

        foo = [1, 3]
        acts = np.array([random.choice(foo)])
        # CoinRun action mapping (index -> (x, y) velocity):
        #   0: none        ( 0,  0)
        #   1: right       (+1,  0)
        #   2: left        (-1,  0)
        #   3: jump        ( 0, +1)
        #   4: right-jump  (+1, +1)
        #   5: left-jump   (-1, +1)
        #   6: down        ( 0, -1)  (step down from a crate)

        print("python input action: ", acts)
        print("\n env.step(acts): \n")
        _obs, rews, _dones, _infos = env.step(acts)
        #todo:return distance (change _obs to distance) then condition

        img_input = img.imgbuffer_process(_obs, (256, 256))

        if step % 50 == 0:
            #turn gray
            #todo:make coinrunMOXCS consume gray img
            #plt.imsave('%i.jpg' % (imgNum), img_input.mean(axis=2), cmap = "gray")
            # plt.imsave('%i.jpg' % (imgNum), img_input)
            #plt.imshow(img_input.mean(axis=2), cmap="gray")
            imgNum = imgNum + 1
            print("imgNum:%i" % (imgNum))

        print("step", step, "rews", rews)
    env.close()
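
The commented-out lines above hint at saving a grayscale version of each sampled frame; a minimal sketch of that step, assuming matplotlib is available and img_input is an H x W x 3 array (the helper name is illustrative):

import matplotlib.pyplot as plt

def save_gray_frame(img_input, img_num):
    # Average the RGB channels into one grayscale channel, then write it to disk.
    gray = img_input.mean(axis=2)
    plt.imsave('%i.jpg' % img_num, gray, cmap="gray")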
Example #13
    def run(seed):
        setup(
            rand_seed=seed,
            num_envs=1,
            high_difficulty=False,
            num_levels=0,
            use_data_augmentation=False,
        )
        env = make("standard", num_envs=1)
        obs = env.reset()
        episode_rew = 0
        done = False
        while not done:
            actions, _, _ = model.get_all_values(obs)
            actions = actions.numpy()
            next_obs, rew, done, _ = env.step(actions)
            obs = next_obs
            done = done.any() if isinstance(done, np.ndarray) else done
            episode_rew += rew

        return episode_rew
Example #14
File: coinrun_env.py  Project: yyht/rl
  def __init__(self, hparams):
    # only support 1 environment currently
    super().__init__(hparams)
    try:
      from coinrun import setup_utils, make
      setup_utils.setup_and_load(use_cmd_line_args=False)

      self._env = make('standard', num_envs=1)
    except ImportError as e:
      print(e)
      print("please check README for CoinRun installation instruction")
      exit()
    self.seed(1234)
    self._observation_space = self._env.observation_space
    self._action_space = self._env.action_space
    self._hparams.num_states = self._observation_space.shape[0]
    self._hparams.num_actions = self._action_space.n
    self._hparams.state_shape = list(self._observation_space.shape)
    self._hparams.action_space_type = self._action_space.__class__.__name__
    self._hparams.pixel_input = True
    if self._hparams.reward_augmentation is not None:
      self._reward_augmentation = get_reward_augmentation(
          self._hparams.reward_augmentation)
Example #15
def train(num_episodes=NUM_EPISODES,
          load_filename=None,
          save_filename=None,
          eval_interval=EVAL_INTERVAL,
          replay_capacity=REPLAY_CAPACITY,
          bootstrap_threshold=BOOTSTRAP,
          epsilon=EPSILON,
          eval_epsilon=EVAL_EPSILON,
          gamma=GAMMA,
          batch_size=BATCH_SIZE,
          num_levels=NUM_LEVELS,
          seed=SEED):
    # Set up the environment
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=seed)
    env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get screen size so that we can initialize layers correctly based on the shape returned from OpenAI Gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape
    print("screen size: ", screen_height, screen_width)

    # Are we resuming from an existing model?
    policy_net = None
    if load_filename is not None and os.path.isfile(load_filename):
        print("Loading model...")
        policy_net = torch.load(load_filename)
        policy_net = policy_net.to(DEVICE)
        print("Done loading.")
    else:
        print("Making new model.")
        policy_net = DQN(screen_height, screen_width,
                         env.NUM_ACTIONS).to(DEVICE)
    # Make a copy of the policy network for evaluation purposes
    eval_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    eval_net.load_state_dict(policy_net.state_dict())
    eval_net.eval()

    # Instantiate the optimizer
    optimizer = None
    if len(list(policy_net.parameters())) > 0:
        optimizer = initializeOptimizer(policy_net.parameters())

    # Instantiate the replay memory
    replay_memory = ReplayMemory(replay_capacity)

    steps_done = 0  # How many steps have been run
    eval_window = []  # Keep the last 5 episode durations
    best_window = float('inf')  # The best average window duration to date

    ### Do training until episodes complete or until ^C is pressed
    try:
        print("training...")
        i_episode = 0  # The episode number

        # Stop when we reach max episodes
        while i_episode < num_episodes:
            print("episode:", i_episode, "epsilon:", epsilon)
            max_reward = 0  # The best reward we've seen this episode
            done = False  # Has the game ended (timed out or got the coin)
            episode_steps = 0  # Number of steps performed in this episode
            # Initialize the environment and state
            env.reset()

            # Current screen. There is no last screen because we get velocity on the screen itself.
            state = get_screen(env)

            # Do forever until the loop breaks
            while not done:
                # Select and perform an action
                action, epsilon = select_action(state, policy_net,
                                                env.NUM_ACTIONS, epsilon,
                                                steps_done,
                                                bootstrap_threshold)
                steps_done = steps_done + 1
                episode_steps = episode_steps + 1

                # for debugging
                if RENDER_SCREEN and not IN_PYNB:
                    env.render()

                # Run the action in the environment
                if action is not None:
                    _, reward, done, _ = env.step(np.array([action.item()]))

                    # Record if this was the best reward we've seen so far
                    max_reward = max(reward, max_reward)

                    # Turn the reward into a tensor
                    reward = torch.tensor([reward], device=DEVICE)

                    # Observe new state
                    current_screen = get_screen(env)

                    # Did the game end?
                    if not done:
                        next_state = current_screen
                    else:
                        next_state = None

                    # Store the transition in memory
                    replay_memory.push(state, action, next_state, reward)

                    # Move to the next state
                    state = next_state

                    # If we are past bootstrapping we should perform one step of the optimization
                    if steps_done > bootstrap_threshold:
                        optimize_model(policy_net, replay_memory, optimizer,
                                       batch_size, gamma)
                else:
                    # Do nothing if select_action() is not implemented and returning None
                    env.step(np.array([0]))

                # If we are done, print some statistics
                if done:
                    print("duration:", episode_steps)
                    print("max reward:", max_reward)
                    print("total steps:", steps_done)

            # Should we evaluate?
            if steps_done > bootstrap_threshold and i_episode > 0 and i_episode % eval_interval == 0:
                test_average_duration = 0  # Track the average eval duration
                test_average_max_reward = 0  # Track the average max reward
                # copy all the weights into the evaluation network
                eval_net.load_state_dict(policy_net.state_dict())
                # Evaluate 10 times
                for _ in range(10):
                    # Call the evaluation function
                    test_duration, test_max_reward = evaluate(
                        eval_net, eval_epsilon, env)
                    test_average_duration = test_average_duration + test_duration
                    test_average_max_reward = test_average_max_reward + test_max_reward
                test_average_duration = test_average_duration / 10
                test_average_max_reward = test_average_max_reward / 10
                print("Average duration:", test_average_duration)
                print("Average max reward:", test_average_max_reward)
                # Append to the evaluation window
                if len(eval_window) < 5:
                    eval_window.append(test_average_duration)
                else:
                    eval_window = eval_window[1:] + [test_average_duration]
                # Compute window average
                window_average = sum(eval_window) / len(eval_window)
                print("evaluation window:", eval_window, "window average:",
                      window_average)
                # If this is the best window average we've seen, save the model
                if len(eval_window) >= 5 and window_average < best_window:
                    best_window = window_average
                    if save_filename is not None:
                        print("Saving model...")
                        torch.save(policy_net, save_filename)
                        print("Done saving.")
            # Only increment episode number if we are done with bootstrapping
            if steps_done > bootstrap_threshold:
                i_episode = i_episode + 1
        print('Training complete')
    except KeyboardInterrupt:
        print("Training interrupted")
    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    env.close()
    return policy_net
Example #16
def play(destination, model):
    model.network.pi.trainable = False
    model.network.value_fc.trainable = False
    tf.random.set_seed(984_373)
    destination = Path(destination).resolve() / "play"
    sequence_folder = destination / "sequence"
    images_folder = destination / "image"
    images_explain_folder = destination / "explain"
    mkdir(sequence_folder)
    mkdir(images_folder)
    mkdir(images_explain_folder)

    metadata = Metadata(
        game_name="Coin run [OpenAI]",
        action_names=[
            "none",
            "right",
            "left",
            "jump",
            "right-jump",
            "left-jump",
            "down",
        ],
        sequence_folder="sequence",
        images_folder="image",
        explain_folder="explain",
    )

    with open(str(destination / "metadata.json"), "w") as outfile:
        json.dump(metadata.as_json(), outfile)

    env = make("standard", num_envs=1)

    obs = env.reset()
    timestep = 0
    episode_rew = 0
    done = False
    layers_to_visit = model.get_first_last_conv_layers()
    while not done:
        obs_hires = env.render(mode="rgb_array")
        actions, state_value, pi_raw = model.get_all_values(obs)
        actions = actions.numpy()
        state_value = state_value.numpy()
        pi_raw = pi_raw.numpy()
        gram_cam_images = grad_cam_heatmap(
            model.network, obs, int(np.argmax(pi_raw)), layers_to_visit
        )

        next_obs, rew, done, _ = env.step(actions)
        obs = next_obs

        done = done.any() if isinstance(done, np.ndarray) else done
        episode_rew += rew

        step = Step(
            timestep=timestep,
            imagename=f"{timestep:05d}",
            reward=float(rew),
            done=int(done),
            actions=list(map(int, actions)),
            state_value=float(state_value[0]),
            pi_raw=list(map(float, pi_raw[0])),
        )

        cv2.imwrite(
            f"{str(images_folder/step.imagename)}.jpg",
            cv2.cvtColor(obs_hires, cv2.COLOR_RGB2BGR),
        )

        for layers_position, gram_cam_image in zip(["first", "last"], gram_cam_images):
            filepath = str(
                images_explain_folder / f"{step.imagename}_{layers_position}.jpg"
            )
            cv2.imwrite(filepath, cv2.cvtColor(gram_cam_image, cv2.COLOR_RGB2BGR))

        with open(str(sequence_folder / f"{timestep:05d}.json"), "w") as outfile:
            json.dump(step.as_json(), outfile)

        logger.info(f"Save step: {timestep}, Reward {rew}")
        timestep += 1

    env.close()
Example #17
import numpy as np
from coinrun import setup_utils, make

config_args = setup_utils.setup_and_load(use_cmd_line_args=False)
env = make('standard', num_envs=4)
for _ in range(1000):
    env.render()
    acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
    _obs, _rews, _dones, _infos = env.step(acts)
env.close()
Example #18
def train(num_episodes=NUM_EPISODES,
          load_filename=None,
          save_filename=None,
          eval_interval=EVAL_INTERVAL,
          replay_capacity=REPLAY_CAPACITY,
          bootstrap_threshold=BOOTSTRAP,
          epsilon=EPSILON,
          eval_epsilon=EVAL_EPSILON,
          gamma=GAMMA,
          batch_size=BATCH_SIZE,
          target_update=TARGET_UPDATE,
          random_seed=RANDOM_SEED,
          num_levels=NUM_LEVELS,
          seed=SEED):
    # Set the random seed
    if random_seed is not None:
        random.seed(random_seed)
        torch.manual_seed(random_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(random_seed)
    # Set up the environment
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=num_levels,
                               set_seed=seed)
    env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get screen size so that we can initialize layers correctly based on the shape returned from OpenAI Gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape
    print("screen size: ", screen_height, screen_width)

    # Are we resuming from an existing model?
    policy_net = None
    if load_filename is not None and os.path.isfile(
            os.path.join(MODEL_PATH, load_filename)):
        print("Loading model...")
        policy_net = load_model(load_filename)
        policy_net = policy_net.to(DEVICE)
        print("Done loading.")
    else:
        print("Making new model.")
        policy_net = DQN(screen_height, screen_width,
                         env.NUM_ACTIONS).to(DEVICE)
    # Make a copy of the policy network for evaluation purposes
    eval_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    eval_net.load_state_dict(policy_net.state_dict())
    eval_net.eval()
    # Target network is a snapshot of the policy network that lags behind (for stability)
    target_net = DQN(screen_height, screen_width, env.NUM_ACTIONS).to(DEVICE)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Instantiate the optimizer
    optimizer = None
    if len(list(policy_net.parameters())) > 0:
        optimizer = initializeOptimizer(policy_net.parameters())

    # Instantiate the replay memory
    replay_memory = ReplayMemory(replay_capacity)

    steps_done = 0  # How many steps have been run
    best_eval = float('inf')  # The best model evaluation to date

    # Do training until episodes complete
    print("training...")
    i_episode = 0  # The episode number

    # Stop when we reach max episodes
    while i_episode < num_episodes:
        print("episode:", i_episode, "epsilon:", epsilon)
        max_reward = 0  # The best reward we've seen this episode
        done = False  # Has the game ended (timed out or got the coin)
        episode_steps = 0  # Number of steps performed in this episode
        # Initialize the environment and state
        env.reset()

        # Current screen. There is no last screen because we get velocity on the screen itself.
        state = get_screen(env)

        # Do forever until the loop breaks
        while not done:
            # Select and perform an action
            action, epsilon = select_action(state, policy_net, env.NUM_ACTIONS,
                                            epsilon, steps_done,
                                            bootstrap_threshold)
            steps_done = steps_done + 1
            episode_steps = episode_steps + 1

            # for debugging
            if RENDER_SCREEN and not IN_PYNB:
                env.render()

            # Run the action in the environment
            if action is not None:
                _, reward, done, _ = env.step(np.array([action.item()]))

                # Record if this was the best reward we've seen so far
                max_reward = max(reward, max_reward)

                # Turn the reward into a tensor
                reward = torch.tensor([reward], device=DEVICE)

                # Observe new state
                current_screen = get_screen(env)

                # Did the game end?
                if not done:
                    next_state = current_screen
                else:
                    next_state = None

                # Store the transition in memory
                replay_memory.push(state, action, next_state, reward)

                # Move to the next state
                state = next_state

                # If we are past bootstrapping we should perform one step of the optimization
                if steps_done > bootstrap_threshold:
                    optimize_model(
                        policy_net,
                        target_net if target_update > 0 else policy_net,
                        replay_memory, optimizer, batch_size, gamma)
            else:
                # Do nothing if select_action() is not implemented and returning None
                env.step(np.array([0]))

            # If we are done, print some statistics
            if done:
                print("duration:", episode_steps)
                print("max reward:", max_reward)
                status, _ = episode_status(episode_steps, max_reward)
                print("result:", status)
                print("total steps:", steps_done, '\n')

            # Should we update the target network?
            if target_update > 0 and i_episode % target_update == 0:
                target_net.load_state_dict(policy_net.state_dict())

        # Should we evaluate?
        if steps_done > bootstrap_threshold and i_episode > 0 and i_episode % eval_interval == 0:
            test_average_duration = 0  # Track the average eval duration
            test_average_max_reward = 0  # Track the average max reward
            # copy all the weights into the evaluation network
            eval_net.load_state_dict(policy_net.state_dict())
            # Evaluate EVAL_COUNT times
            for _ in range(EVAL_COUNT):
                # Call the evaluation function
                test_duration, test_max_reward = evaluate(
                    eval_net, eval_epsilon, env)
                status, score = episode_status(test_duration, test_max_reward)
                test_duration = score  # Set test_duration to score to factor in death-penalty
                test_average_duration = test_average_duration + test_duration
                test_average_max_reward = test_average_max_reward + test_max_reward
            test_average_duration = test_average_duration / EVAL_COUNT
            test_average_max_reward = test_average_max_reward / EVAL_COUNT
            print("Average duration:", test_average_duration)
            print("Average max reward:", test_average_max_reward)
            # If this is the best window average we've seen, save the model
            if test_average_duration < best_eval:
                best_eval = test_average_duration
                if save_filename is not None:
                    save_model(policy_net, save_filename, i_episode)
            print(' ')
        # Only increment episode number if we are done with bootstrapping
        if steps_done > bootstrap_threshold:
            i_episode = i_episode + 1
    print('Training complete')
    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    env.close()
    return policy_net
Example #19
def evaluate(policy_net, epsilon=EVAL_EPSILON, env=None, test_seed=SEED):
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               is_high_res=True,
                               num_levels=NUM_LEVELS,
                               set_seed=test_seed)

    # Make an environment if we don't already have one
    if env is None:
        env = make('standard', num_envs=1)
    if RENDER_SCREEN and not IN_PYNB:
        env.render()

    # Reset the environment
    env.reset()

    # Get screen size so that we can initialize layers correctly based on the
    # shape returned from OpenAI Gym.
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape

    # Get the network ready for evaluation (turns off some things like dropout if used)
    policy_net.eval()

    # Current screen. There is no last screen
    state = get_screen(env)

    steps_done = 0  # Number of steps executed
    max_reward = 0  # Max reward seen
    done = False  # Is the game over?

    print("Evaluating...")
    while not done:
        # Select and perform an action
        action, _ = select_action(state,
                                  policy_net,
                                  env.NUM_ACTIONS,
                                  epsilon,
                                  steps_done=0,
                                  bootstrap_threshold=0)
        steps_done = steps_done + 1

        if RENDER_SCREEN and not IN_PYNB:
            env.render()

        # Execute the action
        if action is not None:
            _, reward, done, _ = env.step(np.array([action.item()]))

            # Is this the best reward we've seen?
            max_reward = max(reward, max_reward)

            # Observe new state
            state = get_screen(env)
        else:
            # Do nothing if select_action() is not implemented and returning None
            env.step(np.array([0]))

    print("duration:", steps_done)
    print("max reward:", max_reward)
    status, _ = episode_status(steps_done, max_reward)
    print("result:", status, '\n')
    if RENDER_SCREEN and not IN_PYNB:
        env.render()
    return steps_done, max_reward
Example #20
def create_env(
    num_envs,
    *,
    env_kind="procgen",
    epsilon_greedy=0.0,
    reward_scale=1.0,
    frame_stack=1,
    use_sticky_actions=0,
    coinrun_old_extra_actions=0,
    **kwargs,
):
    if env_kind == "procgen":
        env_kwargs = {k: v for k, v in kwargs.items() if v is not None}
        env_name = env_kwargs.pop("env_name")

        if env_name == "coinrun_old":
            import coinrun
            from coinrun.config import Config

            Config.initialize_args(use_cmd_line_args=False, **env_kwargs)
            global coinrun_initialized
            if not coinrun_initialized:
                coinrun.init_args_and_threads()
                coinrun_initialized = True
            venv = coinrun.make("standard", num_envs)
            if coinrun_old_extra_actions > 0:
                venv = VecExtraActions(
                    venv, extra_actions=coinrun_old_extra_actions, default_action=0
                )

        else:
            from procgen import ProcgenGym3Env
            import gym3

            env_kwargs = {
                k: v for k, v in env_kwargs.items() if k in PROCGEN_KWARG_KEYS
            }
            env = ProcgenGym3Env(num_envs, env_name=env_name, **env_kwargs)
            env = gym3.ExtractDictObWrapper(env, "rgb")
            venv = gym3.ToBaselinesVecEnv(env)

    elif env_kind == "atari":
        game_version = "v0" if use_sticky_actions == 1 else "v4"

        def make_atari_env(lower_env_id, num_env):
            env_id = ATARI_ENV_DICT[lower_env_id] + f"NoFrameskip-{game_version}"

            def make_atari_env_fn():
                env = make_atari(env_id)
                env = wrap_deepmind(env, frame_stack=False, clip_rewards=False)

                return env

            return SubprocVecEnv([make_atari_env_fn for i in range(num_env)])

        lower_env_id = kwargs["env_id"]

        venv = make_atari_env(lower_env_id, num_envs)

    else:
        raise ValueError(f"Unsupported env_kind: {env_kind}")

    if frame_stack > 1:
        venv = VecFrameStack(venv=venv, nstack=frame_stack)

    if reward_scale != 1:
        venv = VecRewardScale(venv, reward_scale)

    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    if epsilon_greedy > 0:
        venv = EpsilonGreedy(venv, epsilon_greedy)

    venv = VecShallowCopy(venv)

    return venv
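
A usage sketch for the procgen branch of create_env, assuming the public procgen environment name "coinrun" (keyword arguments other than env_name are filtered against PROCGEN_KWARG_KEYS):

venv = create_env(
    8,                      # number of parallel environments
    env_kind="procgen",
    env_name="coinrun",
    frame_stack=4,          # wraps the venv in VecFrameStack
    reward_scale=1.0,       # leave rewards unscaled
    epsilon_greedy=0.0,     # skip the EpsilonGreedy wrapper
)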
Example #21
#!/usr/bin/env python
from __future__ import print_function

import sys, gym, time
from coinrun import setup_utils, make
from course_learn.wrappers import CourierWrapper
import numpy as np

setup_utils.setup_and_load(
    use_cmd_line_args=False,
    paint_vel_info=1,
    is_high_res=True,
    set_seed=3,
    num_levels=1,
)
env = CourierWrapper(make("platform", num_envs=1, default_zoom=5.0), True)
# env = make("maze", num_envs=1, default_zoom=5.0)

if not hasattr(env.action_space, 'n'):
    raise Exception('Keyboard agent only supports discrete action spaces')
ACTIONS = env.action_space.n
SKIP_CONTROL = 0  # Use previous control decision SKIP_CONTROL times, that's how you
# can test what skip is still usable.

human_agent_action = 0
human_wants_restart = False
human_sets_pause = False


def key_press(key, mod):
    global human_agent_action, human_wants_restart, human_sets_pause
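
For context, SKIP_CONTROL is conventionally consumed in the play loop roughly as below (a sketch following Gym's keyboard-agent pattern; the rollout helper and variable names are illustrative):

def rollout(env):
    global human_agent_action, human_wants_restart
    human_wants_restart = False
    obs = env.reset()
    skip = 0
    done = False
    while not done and not human_wants_restart:
        if not skip:
            action = human_agent_action     # reuse the latest keyboard action
            skip = SKIP_CONTROL             # then repeat it SKIP_CONTROL times
        else:
            skip -= 1
        obs, rew, done, info = env.step(np.array([action]))
        env.render()
        done = done.any() if isinstance(done, np.ndarray) else done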
Example #22
def create_coinrun_env(num_levels, random_seed):
    # setup_utils.setup_and_load(use_cmd_line_args=False, is_high_res=True, num_levels=num_levels, set_seed=seed)
    setup_utils.setup_and_load(use_cmd_line_args=False, is_high_res=True, num_levels=num_levels, set_seed=random_seed)
    env = make('standard', num_envs=1)
    return env
Example #23
import numpy as np
from coinrun import setup_utils, make

setup_utils.setup_and_load()
env = make(env_id='standard', num_envs=1)
for _ in range(100):
    acts = np.array([env.action_space.sample() for _ in range(env.num_envs)])
    _obs, _rews, _dones, _infos = env.step(acts)
env.close()
print(_infos)