Example #1
    def _init():
        if args.original_gym:
            env = gym.make("CarRacing-v0")
        else:
            env = gym.make("CarRacingSoftFS{}-v0".format(args.frame_skip))

        env = wrappers.VaeCarWrapper(env, silent=silent)
        if not args.recodex:
            env = wrappers.TerminateEarlyWrapper(env)

        if args.discrete_actions:
            env = wrappers.CarDiscretizatinoWrapper(env,
                                                    args.action_map == "large")

        env = wrappers.EvaluationWrapper(env,
                                         args.seed,
                                         evaluate_for=args.evaluate_for,
                                         render_every=args.render_every,
                                         report_each=1,
                                         logname=args.logdir + "/" +
                                         get_params_str(seed))

        env = wrappers.RewardWrapper(env,
                                     green_penalty=args.green_penalty,
                                     args=args,
                                     silent=silent)
        return env
Example #2
    def _init():
        env = gym.make("CarRacingSoftFS{}-v0".format(args.frame_skip))
        env = wrappers.VaeCarWrapper(env, silent=True)
        globals()['vae'] = env.vae
        globals()['vae_wrapper'] = env
        env = wrappers.CarDiscretizatinoWrapper(env, args.action_map == "large")

        env = wrappers.EvaluationWrapper(env, args.seed, evaluate_for=args.evaluate_for, 
                                         report_each=1, logname="/dev/null")
        return env
Example #3
    def _init():

        env = gym.make("CarRacingSoftFS{}-v0".format(1))

        env = wrappers.VaeCarWrapper(env, silent=silent)
        env = wrappers.TerminateEarlyWrapper(env)
        env = wrappers.CarDiscretizatinoWrapper(env)
        env = wrappers.EvaluationWrapper(env,
                                         np.random.randint(0, 100000),
                                         evaluate_for=1,
                                         report_each=1,
                                         logname="/dev/null")
        return env
Example #4
def create_env(args, report_each=100, **kwargs):
    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("Taxi-v3"), seed=args.seed, report_each=report_each, **kwargs)

    # Extract a deterministic MDP into three NumPy arrays
    # - R[state][action] is the reward
    # - D[state][action] is the True/False value indicating end of episode
    # - N[state][action] is the next state
    R, D, N = [
        np.array([[env.P[s][a][0][i] for a in range(env.action_space.n)] for s in range(env.observation_space.n)]) for i in [2,3,1]
    ]

    return env, R, D, N
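
The comment above describes R, D, N as a compact deterministic MDP extracted from env.P. A minimal sketch of how such arrays could be consumed, e.g. for a value-iteration sweep; the helper name and gamma are assumptions, not part of the original file:

import numpy as np

def value_iteration(R, D, N, gamma=0.99, sweeps=1000):
    # V[s] holds the state value; terminal transitions (D == True) do not bootstrap
    V = np.zeros(R.shape[0])
    for _ in range(sweeps):
        V = np.max(R + gamma * (1 - D) * V[N], axis=1)
    return V

# Example usage with the arrays returned by create_env:
#   env, R, D, N = create_env(args)
#   V = value_iteration(R, D, N)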
Example #5
    def _init():
        if args.original_gym:
            env = gym.make("CarRacing-v0") 
        else:
            env = gym.make("CarRacingSoftFS{}-v0".format(args.frame_skip))

        env = my_wrappers.VaeCarWrapper(env, silent=silent)
        if not args.recodex:
            env = my_wrappers.TerminateEarlyWrapper(env)

        if args.discrete_actions:
            env = my_wrappers.CarDiscretizatinoWrapper(env)

        env = wrappers.EvaluationWrapper(env, args.seed, evaluate_for=evaluate_for,
                                         report_each=1)

        # env = my_wrappers.RewardWrapper(env, green_penalty=args.green_penalty, speed_limit=args.speed_limit, speed_limit_end=args.speed_limit_end * args.total_timesteps, silent=silent)
        return env
Example #6
            # TODO(paac): Train network using current states, chosen actions and estimated returns
            network.train(states, actions, returns)

            states = next_states

        # Periodic evaluation
        total_reward = []
        for _ in range(args.evaluate_for):
            total_reward.append(evaluate_episode())
        print(
            f'Mean {args.evaluate_for} episodes return {np.mean(total_reward)}'
        )
        if np.mean(total_reward) > 90:
            training = False

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles), args.seed)

    main(env, args)
Example #7
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Compute `action` using epsilon-greedy policy.
            action = None

            next_state, reward, done, _ = env.step(action)

            state = next_state

        # TODO: Compute returns from the received rewards
        # and update Q and C.

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose greedy action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteCartPoleWrapper(gym.make("CartPole-v1")), args.seed)

    main(env, args)
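
The epsilon-greedy TODO in this template is left unfilled; a minimal sketch of one common way to implement it, assuming a tabular Q array indexed by the discretized state and a numpy.random.RandomState generator (both names are assumptions, not from the original file):

import numpy as np

def epsilon_greedy(Q, state, epsilon, generator):
    # With probability epsilon explore uniformly, otherwise act greedily w.r.t. Q
    if generator.uniform() < epsilon:
        return generator.randint(Q.shape[1])
    return int(np.argmax(Q[state]))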
Example #8
def main(env, args):
    global model
    # Fix random seeds and number of threads
    np.random.seed(args.seed)

    if args.recodex:
        models = []
        for path in args.load_from:
            models.append(SAC.load(path))

        while True:
            state, done = env.reset(start_evaluation=True), False
            ret = 0
            while not done:
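                # Ensemble the loaded SAC policies: sum their deterministic actions and scale by 1/sqrt(len(models))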
                action = np.sum(np.array(
                    list(
                        map(lambda m: m.predict(state, deterministic=True)[0],
                            models))),
                                axis=0) / len(models)**0.5
                # print(action)

                # action, _states = model.predict(state, deterministic=True)
                # action, _states = model.predict(state)

                ## TODO delete before submitting
                if not args.no_render:
                    env.render()

                state, reward, done, _ = env.step(action)
                ret += reward

            print("Episode return:", ret)

    else:

        tensorboard_log_dir = None if args.tensorboard_log_dir is None else os.path.join(
            args.tensorboard_log_dir, get_exp_name())

        model = SAC("MlpPolicy",
                    env,
                    learning_rate=lr_schedule,
                    buffer_size=args.buffer_size,
                    learning_starts=args.learning_starts,
                    n_episodes_rollout=args.train_episodes,
                    batch_size=args.batch_size,
                    tau=args.tau,
                    gamma=args.gamma,
                    train_freq=args.train_freq,
                    gradient_steps=args.gradient_steps,
                    ent_coef="auto"
                    if args.ent_coef == "auto" else float(args.ent_coef),
                    use_sde=False,
                    policy_kwargs=dict(log_std_init=-3,
                                       net_arch=args.net_arch,
                                       use_expln=True),
                    tensorboard_log=tensorboard_log_dir,
                    rew_skip_thres=args.rew_skip_thres,
                    seed=args.seed)

        model.verbose = 2

        callbacks = [
            CheckpointCallback(20000,
                               "checkpoints",
                               name_prefix=get_exp_name()),
            EvalCallback(
                gym.make(getEnvName()),
                callback_on_new_best=SaveBestModelCallback(
                    save_path="best/" + get_exp_name() + "_best_model.zip"),
                eval_freq=20000,
                n_eval_episodes=5,
                deterministic=True),
            EpisodeCallback(env, model)
        ]

        print(args.log_interval)
        model.learn(args.timesteps,
                    log_interval=args.log_interval,
                    callback=callbacks)

        # Final evaluation
        env = wrappers.EvaluationWrapper(gym.make(getEnvName()),
                                         evaluate_for=200,
                                         seed=args.seed)

        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                action, _states = model.predict(state, deterministic=True)
                state, reward, done, _ = env.step(action)

        model.save(get_exp_name())
Example #9
File: paac.py  Project: uhlajs/npfl122
    training = True
    while training:
        # Training
        for _ in range(args.evaluate_each):
            # TODO: Choose actions using network.predict_actions
            actions = None

            # TODO: Perform steps in the vectorized environment

            # TODO: Compute estimates of returns by one-step bootstrapping

            # TODO: Train network using current states, chosen actions and estimated returns

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make(args.env), args.seed)

    main(env, args)
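
The TODOs in this template outline one PAAC iteration: predict actions, step the vectorized environment, compute one-step bootstrapped return estimates, and train. A minimal sketch of the return estimation only, assuming next_values comes from a value head such as network.predict_values(next_states) (that method name is an assumption):

import numpy as np

def one_step_returns(rewards, dones, next_values, gamma):
    # G = r + gamma * V(s') for non-terminal transitions, G = r when the episode ended
    rewards = np.asarray(rewards, np.float32)
    not_done = 1.0 - np.asarray(dones, np.float32)
    return rewards + gamma * not_done * np.asarray(next_values, np.float32)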
Example #10
File: walker.py  Project: uhlajs/npfl122
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    if args.recodex:
        # TODO: Perform evaluation of a trained model.

        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # TODO: Choose an action
                action = None

                state, reward, done, _ = env.step(action)

    else:
        # TODO: Perform training

        pass


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("BipedalWalker-v3"), args.seed)

    main(env, args)
Example #11
    try:
        # Final evaluation
        returns = []
        while True:
            state, done = env.reset(start_evaluation=True), False

            r = 0
            while not done:
                action = np.argmax(W[state].sum(axis=0))
                state, reward, done, _ = env.step(action)
                r += reward
            returns.append(r)
    except KeyboardInterrupt:
        if not args.recodex:
            np.save(f"{sum(returns)}_{args.tiles}_W_matrix.npy", W)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment

    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0"),
                                            tiles=args.tiles),
        args.seed,
        logname=
        f"{args.logdir}/alpha={args.alpha},alpha_dec={args.alpha_dec},epsilon={args.epsilon},epsilon_final={args.epsilon_final},epsilon_final_at={args.epsilon_final_at},episodes={args.episodes},tiles={args.tiles},gamma={args.gamma},seed={args.seed}"
    )
    main(env, args)
Example #12
                                         evaluate_for=200,
                                         seed=args.seed)

        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                action, _states = model.predict(state, deterministic=True)
                state, reward, done, _ = env.step(action)

        model.save(get_exp_name())


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    if not args.recodex:
        # env = TimeLimit(
        env = wrappers.EvaluationWrapper(gym.make(getEnvName()),
                                         evaluate_for=10,
                                         seed=args.seed)
        # max_episode_steps=1600)

        if args.frame_skip > 1:
            env = wrappers.FrameSkipWrapper(env, args.frame_skip)
    else:
        env = wrappers.EvaluationWrapper(gym.make(getEnvName()),
                                         seed=args.seed)

    main(env, args)
Example #13
#        # TODO: Perform a training episode
#        state, done = env.reset(), False
#        while not done:
#            if args.render_each and env.episode and env.episode % args.render_each == 0:
#                env.render()
#
#            state, reward, done, _ = env.step(action)

# Final evaluation
    q = q1 + q2
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            if args.render_each and env.episode and env.episode % args.render_each == 0:
                env.render()
            action = np.argmax(q[state])
            state, reward, done, _ = env.step(action)

if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteLunarLanderWrapper(gym.make("LunarLander-v2")),
        seed=args.seed,
        logname=
        f"double=True,alpha={args.alpha},alpha_decay_exp={args.alpha_decay_exp},epsilon={args.epsilon},start_epsilon={args.start_epsilon},gamma={args.gamma},n={args.n},epsilon_decay={args.epsilon_decay},alpha_decay={args.alpha_decay},expert_every={args.expert_training_every},episodes={args.episodes},seed={args.seed},init_random_actions={args.init_random_actions}",
        evaluate_for=args.evaluate_for)

    main(env, args)
Example #14
    training = True
    while training:
        # To generate expert trajectory, you can use
        state, trajectory = env.expert_trajectory()

        # TODO: Perform a training episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode and env.episode % args.render_each == 0:
                env.render()

            state, reward, done, _ = env.step(action)

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(wrappers.DiscreteLunarLanderWrapper(gym.make("LunarLander-v2")), args.seed)

    main(env, args)
Example #15
File: q_learning.py  Project: kubic71/mff
            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates
            q[state, action] = q[state, action] + (alpha_schedule(
                args, e) if args.decrease_alpha else args.alpha) * (
                    reward + args.gamma * np.max(q[next_state]) -
                    q[state, action])
            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = np.argmax(q[state])
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0")),
        args.seed,
        logname=
        f"alpha={args.alpha},epsilon={args.epsilon},gamma={args.gamma},init_bias={args.init_bias},de={args.decrease_epsilon},da={args.decrease_alpha},seed={args.seed}",
        evaluate_for=100)

    main(env, args)
Example #16
File: q_network.py  Project: kubic71/mff
            if step % args.target_update_freq == 0:
                target.copy_weights_from(network)

            state = next_state

        if args.epsilon_final_at:
            epsilon = np.interp(env.episode + 1,
                                [0, args.epsilon_final_at * args.episodes],
                                [args.epsilon, args.epsilon_final])

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            if args.render_each == 1:
                env.render()
            action = np.argmax(
                network.predict(np.array([state], np.float32))[0])
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("CartPole-v1"),
                                     args.seed,
                                     logname="logs/" + get_log_name())

    main(env, args)
Example #17
    training = True
    while training:
        # Generate required number of episodes
        for _ in range(args.evaluate_each // args.batch_size):
            episodes = []
            for _ in range(args.batch_size):
                episodes.append(env.expert_episode())

            # Train the network
            network.train(episodes)

        # TODO: Maybe evaluate the current performance, using
        # `evaluate_episode()` method returning the achieved return,
        # and setting `training=False` when the performance is high enough.

    # Final evaluation
    while True:
        evaluate_episode(True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(memory_game_environment.make(args.cards),
                                     args.seed,
                                     evaluate_for=args.evaluate_for,
                                     report_each=args.evaluate_for)

    main(env, args)
Example #18
File: walker.py  Project: kubic71/mff
                               name_prefix=get_exp_name()),
            EvalCallback(gym.make("BipedalWalker-v3"),
                         callback_on_new_best=SaveBestModelCallback(),
                         eval_freq=10000,
                         n_eval_episodes=5,
                         deterministic=True)
            # EpisodeCallback(env)
        ]

        print(args.log_interval)
        model.learn(args.timesteps,
                    log_interval=args.log_interval,
                    callback=callbacks)



if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    if not args.recodex:
        # env = TimeLimit(
        env = wrappers.EvaluationWrapper(
            gym.make("BipedalWalkerHardcore-v3"), evaluate_for=100, seed=args.seed)
                        # max_episode_steps=1600)
    else:
        env = wrappers.EvaluationWrapper(gym.make("BipedalWalkerHardcore-v3"),
                                         seed=args.seed)

    main(env, args)
Example #19
                Transition(state, action, reward, done, next_state))

            # TODO: If the replay_buffer is large enough, perform a training batch
            # from `args.batch_size` uniformly randomly chosen transitions.
            #
            # After you choose `states` and suitable targets, you can train the network as
            #   network.train(states, ...)

            state = next_state

        if args.epsilon_final_at:
            epsilon = np.interp(env.episode + 1, [0, args.epsilon_final_at],
                                [args.epsilon, args.epsilon_final])

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("CartPole-v1"), args.seed)

    main(env, args)
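
The replay-buffer TODO above describes uniform sampling of args.batch_size transitions followed by a single training step. A minimal sketch, assuming Transition is a namedtuple with fields state, action, reward, done, next_state, and that network.predict(states) / network.train(states, q_values) follow the interface hinted at in the template (these interfaces are assumptions):

import random
import numpy as np

def train_on_batch(replay_buffer, network, batch_size, gamma):
    # Uniformly sample transitions and build one-step Q-learning targets
    batch = random.sample(replay_buffer, batch_size)
    states = np.array([t.state for t in batch], np.float32)
    actions = np.array([t.action for t in batch])
    rewards = np.array([t.reward for t in batch], np.float32)
    dones = np.array([t.done for t in batch], np.float32)
    next_states = np.array([t.next_state for t in batch], np.float32)

    targets = network.predict(states)
    next_q = np.max(network.predict(next_states), axis=1)
    targets[np.arange(batch_size), actions] = rewards + gamma * (1 - dones) * next_q
    network.train(states, targets)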
Example #20
def main(args):
    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("Taxi-v3"),
                                     seed=args.seed,
                                     report_each=100)

    # Fix random seed and create a generator
    generator = np.random.RandomState(args.seed)

    Q = np.zeros((env.observation_space.n, env.action_space.n))

    for _ in range(args.episodes):
        next_state, done = env.reset(), False

        # Generate episode and update Q using the given TD method
        next_action = np.argmax(Q[next_state]) if generator.uniform(
        ) >= args.epsilon else env.action_space.sample()
        next_action_prob = args.epsilon / env.action_space.n + (
            1 - args.epsilon) * (next_action == np.argmax(Q[next_state]))
        while not done:
            action, action_prob, state = next_action, next_action_prob, next_state
            next_state, reward, done, _ = env.step(action)
            if not done:
                next_action = np.argmax(Q[next_state]) if generator.uniform(
                ) >= args.epsilon else env.action_space.sample()
                next_action_prob = args.epsilon / env.action_space.n + (
                    1 - args.epsilon) * (next_action == np.argmax(
                        Q[next_state]))

            target_policy = np.eye(env.action_space.n)[np.argmax(Q, axis=1)]
            if not args.off_policy:
                target_policy = (
                    1 - args.epsilon
                ) * target_policy + args.epsilon / env.action_space.n * np.ones_like(
                    target_policy)

            # TODO: Perform the update to the state-action value function `Q`, using
            # a TD update with the following parameters:
            # - `args.n`: use `args.n`-step method
            # - `args.off_policy`:
            #    - if False, the epsilon-greedy behaviour policy is also the target policy
            #    - if True, the target policy is the greedy policy
            #      - for SARSA (with any `args.n`) and expected SARSA (with `args.n` > 1),
            #        importance sampling must be used
            # - `args.mode`: this argument can have the following values:
            #   - "sarsa": regular SARSA algorithm
            #   - "expected_sarsa": expected SARSA algorithm
            #   - "tree_backup": tree backup algorithm
            #
            # Perform the updates as soon as you can -- whenever you have all the information
            # to update `Q[state, action]`, do it. For each `action` use its corresponding
            # `action_prob` at the time of taking the `action` as the behaviour policy action
            # probability, and current `target_policy` as the target policy (everywhere
            # in the update).
            #
            # Do not forget that when `done` is True, bootstrapping on the
            # `next_state` is not used.
            #
            # Also note that when the episode ends and `args.n` > 1, there will
            # be several state-action pairs that also need to be updated. Perform
            # the updates in the order in which you encountered the state-action
            # pairs and during these updates, use the `target_policy` computed
            # above (do not modify it during these post-episode updates).

    return Q
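
The comment block above specifies the full family of n-step updates. As a minimal illustration of the simplest case only, the on-policy one-step expected SARSA update (args.n == 1, args.mode == "expected_sarsa", args.off_policy == False) could look as follows; the helper name and the use of args.alpha are assumptions, not the assignment's reference solution:

import numpy as np

def expected_sarsa_update(Q, target_policy, state, action, reward, next_state, done, alpha, gamma):
    # Bootstrap with the expectation of Q over the target policy, except at episode end
    bootstrap = 0.0 if done else np.dot(target_policy[next_state], Q[next_state])
    target = reward + gamma * bootstrap
    Q[state, action] += alpha * (target - Q[state, action])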
Example #21
                # network._baseline_model.optimizer.lr = warmup_lr_schedule(ep)
                # network._model.optimizer.lr = warmup_lr_schedule(ep)

                network.train(batch_states, batch_actions, batch_returns)

                print(steps)
                steps += 1

                network._model.save("checkpoint")

        except KeyboardInterrupt:
            pass

    # if args.recodex:
    # network._model.load_weights ...

    print("Evaluation!")

    # Final evaluation
    while True:
        state, done = env.reset(True), False
        while not done:
            action = np.argmax(network.predict([state])[0])
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)
    env = wrappers.EvaluationWrapper(gym.make("CartPolePixels-v0"), args.seed)
    main(env, args)
Example #22
import tensorflow as tf
import random
from vae.vae import CVAE
# from env import make_env
from utils import PARSER
args = PARSER.parse_args(['--config_path', 'configs/carracing.config'])

import pygame
pygame.init()
screen = pygame.display.set_mode((600, 300))

frame_skip = 3
seed = 2
env = wrappers.EvaluationWrapper(wrappers.VaeCarWrapper(
    gym.make("CarRacingSoftFS{}-v0".format(frame_skip))),
                                 seed,
                                 evaluate_for=15,
                                 report_each=1)

DATA_DIR = "export"
model_path_name = "models/tf_vae".format(args.exp_name, args.env_name)
vae = CVAE(args)
vae.set_weights(
    tf.keras.models.load_model(model_path_name, compile=False).get_weights())

filelist = os.listdir(DATA_DIR)
obs = np.load(os.path.join(DATA_DIR, random.choice(filelist)))["obs"]
obs = obs.astype(np.float32) / 255.0


def resize(img, factor):
Example #23
                    if i % 100 == 0:
                        network.update_target_weights()

                    if i % 100 == 0:
                        network.save()

                state = next_state

            epsilon = np.exp(
                np.interp(env.episode + 1, [0, 5000],
                          [np.log(0.25), np.log(0.01)]))

    elif args.evolution:
        es = train(load_from='saved_model.pkl')
        np.save('best_params', es.best.get()[0])
        best_params = es.best.get()[0]
        play(best_params, render=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("CartPolePixels-v0"),
                                     args.seed,
                                     report_each=10,
                                     evaluate_for=15)

    main(env, args)
Example #24
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Perform an action.
            action = None

            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates

            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0")),
        args.seed)

    main(env, args)