Example #1
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=StochasticMaxStochasticDeltaDeletionPRB(500000,
                                                                  0.5,
                                                                  0.4,
                                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
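StochasticMaxStochasticDeltaDeletionPRB is a project-specific buffer that does not ship with anyrl; judging by the call site, its positional arguments appear to mirror anyrl's PrioritizedReplayBuffer (capacity, priority exponent alpha, importance-sampling exponent beta). A minimal sketch of that assumed baseline call, for comparison:

from anyrl.rollouts import PrioritizedReplayBuffer

# Assumed equivalent of the call above: 500k-transition capacity, alpha=0.5,
# beta=0.4, and a small epsilon so no transition ever receives zero priority.
replay_buffer = PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1)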
Example #2
def test():
    env = envWrapper(SubprocVecEnv(sonic.make_envs(num=1)))
    model = learn(env, 20, 0, 1e6)
    total_reward = 0.0
    test_env = sonic.make_env()
    for i in range(1):
        obs = env.reset()
        while True:
            action_index, _, _, _ = model.eval_and_sample(torch.tensor(obs, dtype=torch.float).to(device)) # need to unsqueeze eval output
            obs, reward, done = env.step(action_index)
            total_reward += np.sum(reward)
            if done.any():
                break
        print("{} testgames done".format(i + 1))
    total_reward_rand = 0
    for i in range(1):
        obs = env.reset()
        while True:
            obs, reward, done = env.step([env.env.action_space.sample() for i in range(env.num_envs)])
            total_reward_rand += np.sum(reward)
            if done.any():
                break
        print("{} testgames done".format(i + 1))
    print("total_reward: {}".format(total_reward))
    print("total_reward_rand: {}".format(total_reward_rand))
Example #3
def main():
    """Run DQN until the environment throws an exception."""
    # env = make(game='SonicAndKnuckles3-Genesis', state='AngelIslandZone.Act1')
    # env = SonicDiscretizer(env)
    # env = WarpFrame(env)
    # env = AllowBacktracking(env)

    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 4)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000, # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.7, 0.6, epsilon=0.2),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=16384,
                  batch_size=64,
                  min_buffer_size=20000)
Example #4
def test():
    env = envWrapper(make_env()())
    model = learn(env, 3000, 3e4, 2e-4)
    total_reward = 0
    for i in range(30):
        obs = env.reset()
        while True:
            action_index, _, _, _ = model.evaluate(torch.unsqueeze(torch.tensor(obs, dtype=torch.float).to(device), 0))
            obs, reward, done = env.step(action_index)
            print(action_index)
            total_reward += reward
            if done:
                break
        print("{} testgames done".format(i))
    total_reward_rand = 0
    for i in range(30):
        obs = env.reset()
        while True:
            obs, reward, done = env.step(env.env.action_space.sample())
            total_reward_rand += reward
            if done:
                break
        print("{} testgames done".format(i))
    print("total_reward: {}".format(total_reward))
    print("total_reward_rand: {}".format(total_reward_rand))
Example #5
def main():
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-421,
                                  max_val=421))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=2000000,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0.5,
                                                        0.4,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=64,
                  batch_size=32,
                  min_buffer_size=25000)
Example #6
def main():
    env = make_env()
    max_screen_x = 0

    model = DQN.load(saved_model_file_path)

    obs = env.reset()

    fps = 60
    frames_per_timestep = 4
    speed_up_factor = 1.5
    wait_time = frames_per_timestep / fps / speed_up_factor
    while True:
        t1 = time.time()

        action, _states = model.predict(obs)

        t2 = time.time()
        
        t3 = wait_time - (t2 - t1)

        if t3 > 0:
            time.sleep(t3)
        
        obs, rewards, done, info = env.step(action)

        if info['screen_x'] > max_screen_x:
            max_screen_x = info['screen_x']
            logger.info("Max screen x: " + str(max_screen_x))
        if done:
            env.reset()
        else:
            env.render()
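For reference, with fps = 60, frames_per_timestep = 4 and speed_up_factor = 1.5, the loop above budgets wait_time = 4 / 60 / 1.5 ≈ 0.044 s per step and sleeps only for whatever remains of that budget after model.predict returns, so rendering stays close to 1.5x real time.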
Example #7
def main():
    training_episodes = 1000
    #env = make(game='SonicAndKnuckles3-Genesis', state='AngelIslandZone.Act1')
    env = make_env(stack=False, scale_rew=False)
    obs = env.reset()
    espio = DDQN()
    score = 0

    # training
    for e in range(0, training_episodes):
        action = env.action_space.sample()
        obs_prime, rew, done, info = env.step(action)
        score_prime = info['score']
        delta_score = score_prime - score

        D = (obs_prime.flatten(), action, rew, delta_score, obs.flatten(), done)
        espio.experience_replay = np.append(espio.experience_replay, [D], axis = 0)
        env.render()
        espio.train()
        obs = obs_prime
        espio.train()
        score = score_prime # reset score to score_prime
        if done:
            obs = env.reset()
            score = 0

    obs = env.reset() # reset before testing begins

    # finished training
    while True:
        obs, rew, done, info = env.step(env.action_space.sample())
        env.render()
        if done:
            obs = env.reset()
Example #8
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        """
        Create a TF Op that optimizes the objective.
        Args:
          learning_rate: the Adam learning rate.
          epsilon: the Adam epsilon.
        """
        optimize = dqn.optimize(learning_rate=6.25e-5, epsilon=1.5e-4)

        sess.run(tf.global_variables_initializer())
        """
        Run an automated training loop.
        This is meant to provide a convenient way to run a
        standard training loop without any modifications.
        You may get more flexibility by writing your own
        training loop.
        Args:
          num_steps: the number of timesteps to run.
          player: the Player for gathering experience.
          replay_buffer: the ReplayBuffer for experience.
          optimize_op: a TF Op to optimize the model.
          train_interval: timesteps per training step.
          target_interval: number of timesteps between
            target network updates.
          batch_size: the size of experience mini-batches.
          min_buffer_size: minimum replay buffer size
            before training is performed.
          tf_schedules: a sequence of TFSchedules that are
            updated with the number of steps taken.
          handle_ep: called with information about every
            completed episode.
          timeout: if set, this is a number of seconds
            after which the training loop should exit.
        """
        dqn.train(
            num_steps=1000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
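The docstring above also lists handle_ep and tf_schedules, which this particular call leaves at their defaults. A minimal sketch of a per-episode callback, following the two-argument signature used in the later examples here (the name _log_ep is illustrative):

def _log_ep(steps, rew):
    # Called by dqn.train after each completed episode with the episode's
    # length in timesteps and its total reward.
    print('episode finished: %d steps, reward %.2f' % (steps, rew))

# wired in as: dqn.train(..., handle_ep=_log_ep)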
Example #9
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)

        # Epsilon decay schedule; it is passed to dqn.train via tf_schedules below.
        eps_decay_sched = LinearTFSchedule(50000, 1.0, 0.01)

        # Alternative exploration players:
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, 0.1)), 3)
        #player = NStepPlayer(BatchedPlayer(env, EpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)
        #player = NStepPlayer(BatchedPlayer(env, SonicEpsGreedyQNetwork(dqn.online_net, TFScheduleValue(sess, eps_decay_sched))), 3)

        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        def _handle_ep(steps, rew, env_rewards):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)
            if total_steps % 10 == 0:
                print('%d episodes, %d steps: mean of last 100 episodes=%f' %
                      (len(reward_hist), total_steps,
                       sum(reward_hist[-100:]) / len(reward_hist[-100:])))

        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            tf_schedules=[eps_decay_sched],
            handle_ep=_handle_ep,
            restore_path='./pretrained_model',
            save_interval=None,
        )
Example #10
def main():
    print('connecting to remote environment')
    env = make_env(stack=False)
    print('starting episode')
    env.reset()

    while True:
        obs, rew, done, info = env.step(env.action_space.sample())
        print(rew, done, info)
        env.render()
        if done:
            print('episode complete')
            obs = env.reset()
Example #11
def main():
    env = make_env()
    max_screen_x = 0

    model = DQN.load(saved_model_file_path)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)

        if info['screen_x'] > max_screen_x:
            max_screen_x = info['screen_x']
            logger.info("Max screen x: " + str(max_screen_x))
        if done:
            env.reset()
        else:
            env.render()
Example #12
def main():
    print('connecting to remote environment')
    env = make_env(stack=False)
    print('starting episode')
    env.reset()
    episode_step = 0
    episode_reward = 0

    while True:
        episode_step += 1
        #action = env.action_space.sample()
        # HilltopZone.Act1
        if episode_step < 52:
            action = 1
        elif episode_step < 63:
            action = 0
        elif episode_step < 85:
            action = episode_step % 2
        elif episode_step < 95:
            action = 1
        elif episode_step < 155:
            action = 1
        elif episode_step < 160:
            action = 5
        else:
            if episode_step % 2 == 0:
                action = 1
            else:
                action = 5
        obs, rew, done, info = env.step(action)
        episode_reward += rew
        print(action)
        print(rew, done, info)
        print(episode_reward)
        env.render()
        if done:
            print('episode complete')
            obs = env.reset()
            episode_step = 0
            episode_reward = 0
Example #13
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(sess, "/root/compo/model.ckpt")
        #print('model restored')
        replay_buffer = pickle.load(
            gzip.open('/root/compo/replay_buffer.p.gz', 'rb'))
        replay_buffer.alpha = 0.2
        replay_buffer.beta = 0.4
        replay_buffer.capacity = 100000

        restore_ppo2_weights(sess)

        dqn.train(
            num_steps=2000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=replay_buffer,  # was: PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1)
            optimize_op=optimize,
            train_interval=4,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000)
Example #14
File: ppo.py  Project: rlalpha/rl-trial
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from sonic_util import make_env
from agent import Agent
from collections import deque
import numpy as np

level_name='LabyrinthZone.Act1'
env = make_env(level_name=level_name, \
                stack=False, scale_rew=True)
env.seed(714)

state_space = list(env.observation_space.shape)
action_space = env.action_space.n
print('State shape: ', state_space)
print('Number of actions: ', action_space)

BATCH_SIZE = 3000
EXPERIENCE_REPLAY = False
BUFFER_SIZE = int(9100)

agent = Agent(state_space, action_space, level_name=level_name, \
    param={
        'EXPERIENCE_REPLAY': EXPERIENCE_REPLAY,
        'BUFFER_SIZE': BUFFER_SIZE,
        'BATCH_SIZE': BATCH_SIZE
    })

def add_noise(state):
    row,col,ch= state.shape
Example #15
def train(
    train_id,
    game,
    level,
    num_processes,
    num_timesteps,
    algo_name,
    policy_name,
    is_joint,
    model_save_path,
    logs_path,
    hyper_opt,
    load_model_path=None,
    train_counter=0,  # To be set (incrementally) when running multiple trainings
    short_life=False,
    backtracking=False,
):
    global global_logs_path, best_mean_reward, n_steps

    print("\n\nStarting training with args:\n")
    print(log_fun_args(locals()))
    print("\n")

    global_logs_path = logs_path
    best_mean_reward, n_steps = -np.inf, 0
    envs = []
    if is_joint:
        envs = [
            make_env(
                game=game,
                level=level,
                rank=i,
                log_dir=logs_path,
                seed=train_counter * 100,
                short_life=short_life,
                backtracking=backtracking,
            ) for i, (game, level) in enumerate(small_train_set)
        ]
    else:
        envs = [
            make_env(
                game=game,
                level=level,
                rank=i,
                log_dir=logs_path,
                seed=train_counter * 100,
                short_life=short_life,
                backtracking=backtracking,
            ) for i in range(num_processes)
        ]

    if num_processes == 1:
        env = VecFrameStack(DummyVecEnv(envs), 4)
    else:
        env = VecFrameStack(SubprocVecEnv(envs), 4)

    print("\n\n")

    algo = None
    if algo_name == "ppo2":
        algo = PPO2
    elif algo_name == "a2c":
        algo = A2C

    policy = None
    nminibatches = 4
    if policy_name == "cnn":
        policy = CnnPolicy
    elif policy_name == "cnnlstm":
        if is_joint:
            nminibatches = 5
        policy = CnnLstmPolicy

    model = None
    if load_model_path:
        print("Loading a model...")
        model = algo.load(load_model_path, env=env, tensorboard_log=logs_path)
    else:
        print("Creating a new model...")
        if algo_name == "ppo2":
            if hyper_opt:
                model = algo(
                    policy,
                    env,
                    verbose=1,
                    tensorboard_log=logs_path,
                    n_steps=4096,
                    nminibatches=8,
                    learning_rate=2e-4,
                    ent_coef=0.01,
                )
            else:
                model = PPO2(
                    policy,
                    env,
                    nminibatches=nminibatches,
                    verbose=1,
                    tensorboard_log=logs_path,
                )
        elif algo_name == "a2c":
            model = A2C(policy, env, verbose=1, tensorboard_log=logs_path)

    print(f"Starting training for {num_timesteps} timesteps")
    model.learn(total_timesteps=num_timesteps,
                callback=callback,
                log_interval=1)
    print("Training finished!")

    if model_save_path:
        model.save(model_save_path)
        print("Model saved in:\t", model_save_path)

    timestep_values, score_values = ts2xy(load_results(logs_path), "timesteps")
    score_values = score_values * 100

    plot_path = os.path.join(logs_path, f"{level}.png")
    print("Saving the plot in: " + plot_path)
    save_plot(timestep_values, score_values, title=level, save_path=plot_path)

    env.close()
Example #16
def build_env(level_name):
    env = make_env(stack=False, scale_rew=True, level_name=level_name)
    return env
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--restore',
                        '-restore',
                        action='store_true',
                        help='restore from checkpoint file')
    parser.add_argument('--record',
                        '-record',
                        action='store_true',
                        help='record bk2 movies')
    args = parser.parse_args()
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(
        make_env(stack=False, scale_rew=False, record=args.record))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)

    checkpoint_dir = os.path.join(os.getcwd(), 'results')
    results_dir = os.path.join(os.getcwd(), 'results',
                               time.strftime("%d-%m-%Y_%H-%M-%S"))
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    summary_writer = tf.summary.FileWriter(results_dir)

    # TODO
    # env = wrappers.Monitor(env, results_dir, force=True)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))

        saver = tf.train.Saver()
        if args.restore:
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
            if latest_checkpoint:
                print("Loading model checkpoint {} ...\n".format(
                    latest_checkpoint))
                saver.restore(sess, latest_checkpoint)
            else:
                print("Checkpoint not found")

        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        reward_hist = []
        total_steps = 0

        # runs with every completed episode
        def _handle_ep(steps, rew):
            nonlocal total_steps
            total_steps += steps
            reward_hist.append(rew)

            summary_reward = tf.Summary()
            summary_reward.value.add(tag='global/reward', simple_value=rew)
            summary_writer.add_summary(summary_reward, global_step=total_steps)

            print('save model')
            saver.save(sess=sess,
                       save_path=checkpoint_dir + '/model',
                       global_step=total_steps)

            if len(reward_hist) == REWARD_HISTORY:
                print('%d steps: mean=%f' %
                      (total_steps, sum(reward_hist) / len(reward_hist)))
                summary_meanreward = tf.Summary()
                summary_meanreward.value.add(tag='global/mean_reward',
                                             simple_value=sum(reward_hist) /
                                             len(reward_hist))
                summary_writer.add_summary(summary_meanreward,
                                           global_step=total_steps)
                reward_hist.clear()

        dqn.train(
            num_steps=7000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=8192,
            batch_size=32,
            min_buffer_size=20000,
            handle_ep=_handle_ep)
Example #18
def main():
    """Run DQN until the environment throws an exception."""

    print('creating env')

    env = AllowBacktracking(make_env(stack=False, scale_rew=False))

    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)

    config = tf.ConfigProto()

    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    print('starting tf session')

    with tf.Session(config=config) as sess:

        print('creating agent')

        online_net, target_net = rainbow_models(sess,
                                                env.action_space.n,
                                                gym_space_vectorizer(
                                                    env.observation_space),
                                                min_val=-200,
                                                max_val=200)

        dqn = DQN(online_net, target_net)

        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)

        optimize = dqn.optimize(learning_rate=1e-4)

        saver = tf.train.Saver()

        sess.run(tf.global_variables_initializer())

        train_steps = 5000

        print('training steps:', train_steps)

        for j in range(1):

            print(j)

            start = time.time()

            dqn.train(
                num_steps=train_steps,  # Make sure an exception arrives before we stop.
                player=player,
                replay_buffer=PrioritizedReplayBuffer(500000,
                                                      0.5,
                                                      0.4,
                                                      epsilon=0.1),
                optimize_op=optimize,
                train_interval=1,
                target_interval=8192,
                batch_size=32,
                min_buffer_size=10000)

            end = time.time()

            print(end - start)

        print('done training')

        print('save nn')

        save_path = saver.save(sess, "saved_models/rainbow5.ckpt")
        print("Model saved in path: %s" % save_path)

        tvars = tf.trainable_variables()
        tvars_vals = sess.run(tvars)

        #for var, val in zip(tvars, tvars_vals):
        #    print(var.name, val[0])

        #print(tvars_vals[0][-5:])

        #print('stepping')

        #obs = env.reset()

        #online_net.step(obs, obs)
        '''
Example #19
    model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=callback)
    model.save(saved_model_name)

    obs = env.reset()


if __name__ == '__main__':
    # Setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger(__name__)

    env = make_env()
    model = None

    print(DECORATOR)
    if len(argv) == 1:
        if isfile(saved_model_name):
            logging.info("Loading model from file: " + saved_model_name)
            model = DQN.load(saved_model_name,
                             env=env,
                             verbose=0,
                             tensorboard_log=TENSORBOARD_LOG_DIR,
                             buffer_size=REPLAY_BUFFER_SIZE)
        else:
            logging.info("Creating model from scratch...")
            model = DQN(CnnPolicy,
                        env,
Example #20
File: model.py  Project: rlalpha/rl-trial
# Hyper Parameters
OUTPUT_GRAPH = True
MAX_EPISODE = 10000
DISPLAY_REWARD_THRESHOLD = 3000  # renders environment if total episode reward is greater than this threshold
MAX_EP_STEPS = 4500  # maximum time step in one episode
RENDER = False  # rendering wastes time
GAMMA = 0.99  # reward discount in TD error
LR_A = 1e-9  # learning rate for actor
LR_C = 1e-6  # learning rate for critic
BUFFER_SIZE = 5000
BATCH_SIZE = 32
UPDATE_EVERY = 100
# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)
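GAMMA above is the discount in the usual TD error, delta_t = r_t + GAMMA * V(s_{t+1}) - V(s_t); the separate LR_A and LR_C values suggest the standard actor-critic setup in which the critic minimizes the squared delta_t and the actor is updated in the direction delta_t indicates.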

env = make_env(stack=False, scale_rew=True)
# env.seed(1)  # reproducible
# env = env.unwrapped

state_space = list(env.observation_space.shape)
action_space = env.action_space.n

print('State shape: ', state_space)
print('Number of actions: ', [1, action_space])


def reshape_state(s):
    s = s[np.newaxis, :]
    return s
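reshape_state simply prepends a batch axis so a single observation can be passed through a batched network. A quick usage sketch, assuming make_env yields the usual warped 84x84x1 observation (an assumption, since the wrapper code is not shown here):

s = env.reset()
print(s.shape)                 # e.g. (84, 84, 1) under that assumption
print(reshape_state(s).shape)  # (1, 84, 84, 1)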

Example #21
from anyrl.algos import DQN
from anyrl.envs import BatchedGymEnv
from anyrl.envs.wrappers import BatchedFrameStack
from anyrl.models import rainbow_models
from anyrl.rollouts import BatchedPlayer, PrioritizedReplayBuffer, NStepPlayer
from anyrl.spaces import gym_space_vectorizer, StackedBoxSpace

import gym_remote.exceptions as gre

from sonic_util import AllowBacktracking, make_env
import numpy as np
import tensorflow as tf  # needed below for tf.ConfigProto and tf.Session

print('creating env')
#z = StackedBoxSpace(np.zeros((84,84,1)), 4)

env = AllowBacktracking(make_env(stack=False, scale_rew=False))

env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)

#print(env.action_space.n)
#StackedBox(84,84,1)

config = tf.ConfigProto()

config.gpu_options.allow_growth = True

print('starting tf session')

with tf.Session(config=config) as sess:

    print('creating agent')
Example #22
File: replay.py  Project: rlalpha/rl-trial
def build_env():
    #env, multi_action = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1'), True
    env, multi_action = make_env(stack=False, scale_rew=False), False
    return env, multi_action
Example #23
    max_steps = 500
    store_model = False
    RECORD_DIR = False
    BATCH_SIZE = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"setting device to '{device}'")

# close current environment if there is one (e.g. on failure to complete last time)
try:
    env.close()
except NameError:
    pass

# create the environment
# Loading the level
env = make_env(GAME_NAME, LEVEL, save_game=RECORD_DIR)

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
init_screen = get_screen()
_, screen_depth, screen_height, screen_width = init_screen.shape
print(
    f"discovered input image ({screen_depth},{screen_height},{screen_width})")

# Get number of actions from gym action space
n_actions = env.action_space.n

policy_net = FDQN(screen_depth, screen_height, screen_width,
                  n_actions).to(device)
target_net = FDQN(screen_depth, screen_height, screen_width,
Example #24
# import retro
from retro_contest.local import make
from sonic_util import make_env
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from tool import preprocess


# Import environment and get env infor
# env = retro.make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1', record=False)
# env, multi_action = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1'), True
env, multi_action = make_env(stack=False, scale_rew=False), False

env.seed(1)
state_space = list(env.observation_space.shape)
action_space = env.action_space.n
print('State shape: ', state_space)
print('Number of actions: ', [1, action_space])

BUFFER_SIZE = int(5e3)  # replay buffer size
BATCH_SIZE = 16         # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # for soft update of target parameters
LR = 1e-6               # learning rate 
UPDATE_EVERY = 500      # how often to update the network
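TAU parameterizes a soft target-network update, theta_target <- TAU * theta_local + (1 - TAU) * theta_target, which the dqn_agent.Agent imported below presumably applies each time it learns (every UPDATE_EVERY environment steps); this is an assumption based on the comments, since the Agent code itself is not shown.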

from dqn_agent import Agent
agent = Agent(state_size=state_space, action_size=action_space, 
Example #25
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2, A2C
from sonic_util import make_env
from gym.wrappers import Monitor


env = DummyVecEnv([lambda: make_env(level_name='LabyrinthZone.Act1', \
                stack=False, scale_rew=True)])

modelname = 'sonicppo'
# PPO2.load is a classmethod that returns a new model, so assign its result;
# calling model.load(...) on an instance would silently discard the loaded weights.
model = PPO2.load("./checkpoint" + modelname, env=env)

obs = env.reset()
done = False
reward = 0

while not done:
    actions, _ = model.predict(obs)
    obs, rew, done, info = env.step(actions)
    reward += rew
    env.render()
env.close()
Example #26
File: ppo2.py  Project: rlalpha/rl-trial
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2, A2C
from sonic_util import make_env
from gym.wrappers import Monitor


env = DummyVecEnv([lambda: make_env(level_name='GreenHillZone.Act1', \
                stack=False, scale_rew=True)])

modelname = 'sonicppo'
model = PPO2(CnnPolicy, env, n_steps=3500, verbose=1)
model.learn(total_timesteps=1000000)
model.save("./checkpoint" + modelname)