Example #1
def run(game, state, params_dir):
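    """Build a PPO2 model, restore weights from a checkpoint, and collect one rollout on a single local env."""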
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    env = make_local_env(game=game, state=state, stack=True, scale_rew=True)

    load_path = 'params_3/checkpoints/00151'

    def env_fn():
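        # DummyVecEnv expects a list of zero-argument env constructors, so wrap the already-built env in a closure.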
        return env

    with tf.Session(config=config):
        model = ppo2.Model(policy=policies.CnnPolicy,
                           ob_space=env.observation_space,
                           ac_space=env.action_space,
                           nbatch_act=1,
                           nsteps=4500,
                           nbatch_train=4500 // 4,
                           ent_coef=0.01,
                           vf_coef=0.5,
                           max_grad_norm=0.5)

        print(env.observation_space)
        print(env.action_space)
        model.load(load_path)
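        # Collect one rollout of nsteps transitions with the restored weights.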
        runner = ppo2.Runner(env=DummyVecEnv([env_fn]), model=model, nsteps=4500, gamma=0.99, lam=0.95)
        runner.run()
Example #2
def get_exp(self, envIdx):
    level = self.levels[envIdx]
    env = make_local_env(level[0], level[1], True, True)

    def env_fn():
        return env

    # self.model.load_ram(params)
    runner = ppo2.Runner(env=DummyVecEnv([env_fn]),
                         num_envs=1,
                         model=self.model,
                         nsteps=steps_per_ep,
                         gamma=gamma,
                         lam=lam,
                         lr=lr,
                         cliprange=cliprange,
                         noptepochs=noptepochs,
                         nbatch_train=nbatch_train)
    exp = runner.run()
    env.close()
    # tf.reset_default_graph()
    # del runner
    # gc.collect()
    pid = os.getpid()
    py = psutil.Process(pid)
    memUse = py.memory_info()[0] / 2.**30
    print('memory use: %.6f GB from worker %d after model' % (memUse, self.id))
    return exp
Example #3
def main():
    """Run DQN until the environment throws an exception."""
    env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1')
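    # AllowBacktracking rewards only new forward progress, so the agent is not penalized for moving backwards.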
    env = AllowBacktracking(make_local_env(env, stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
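        # Report experience as 3-step transitions, matching Rainbow's multi-step targets.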
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())
        dqn.train(num_steps=num_steps, # Make sure an exception arrives before we stop.
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)

        print(tf.trainable_variables())
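        # Save the weights twice: through the TF saver utility and as raw arrays via joblib.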
        save_path = '/home/noob/retro-noob/rainbow/params/params'
        utils.save_state(save_path+'_tf_saver')

        with tf.variable_scope('model'):
            params = tf.trainable_variables()

        ps = sess.run(params)
        joblib.dump(ps, save_path + '_joblib')
Example #4
def main(game,
         state,
         timesteps=5000,
         save_interval=1,
         last_dir=None,
         params_folder=None):
    """Run PPO until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    env = make(game=game, state=state)
    env = make_local_env(env, stack=True, scale_rew=True)

    logger.configure(params_folder, format_strs=['stdout'])

    def env_fn():
        return env

    load_path = None
    if last_dir:
        list_of_params = glob.glob(last_dir + '/checkpoints/*')
        load_path = max(list_of_params, key=os.path.getctime)
        print('Restoring params from ', load_path)

    with tf.Session(config=config):
        # Take more timesteps than we need to be sure that
        # we stop due to an exception.
        ppo2.learn(policy=policies.CnnPolicy,
                   env=DummyVecEnv([env_fn]),
                   nsteps=4096,
                   nminibatches=8,
                   lam=0.95,
                   gamma=0.99,
                   noptepochs=3,
                   log_interval=1,
                   ent_coef=0.01,
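                   # lr and cliprange accept schedules (callables of training progress); these lambdas keep them constant.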
                   lr=lambda _: 2e-4,
                   cliprange=lambda _: 0.1,
                   total_timesteps=timesteps,
                   save_interval=save_interval,
                   load_path=load_path)
Example #5
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    print("prank:", rank, "os.pid:", os.getpid())
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = AllowBacktracking(
        make_local_env(env_conf['game'],
                       env_conf['level'],
                       stack=False,
                       scale_rew=False))
    print("Got a local env; obs space:", env.observation_space)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    print("player.state.shape:", player.state.shape)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
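    # A3C worker loop: sync from the shared model, roll out up to num_steps steps, then push gradients back to it.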
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            # if player.info['ale.lives'] == 0 or player.max_length:
            #    player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
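        # Walk the rollout backwards, accumulating discounted returns, the value loss, and GAE for the policy loss.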
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 100.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
Example #6
def test(args, shared_model, env_conf):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    print("test proc:")
    env = AllowBacktracking(make_local_env(env_conf['game'], env_conf['level'], stack=False, scale_rew=False))
    print("test got env:", env.observation_space)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(
        player.env.observation_space.shape[0], player.env.action_space)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
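    # When `flag` is set, the test agent reloads the latest shared weights before starting the next episode.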
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward

        """
        if player.done and player.info['ale.lives'] > 0 and not player.max_length:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        """
        if player.done or player.max_length:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}".
                format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
Example #7
#!/usr/bin/env python

from sonic_util import make_local_env
from argparse import ArgumentParser
import os
import torch
import json

parser = ArgumentParser(prog="test_sonic", description="Test a trained model on the Sonic retro gym env")
parser.add_argument("--model-path", default="./trained_models/ppo/Sonic-Genesis-mixed-Train_mean1500_max6k.pt",
                    help="Path to the pytorch agent model file", metavar="MODELPATH")
parser.add_argument("--env-config", default="sonic_config.json",
                    help="Path to the env config json file", metavar="ENVCONFIGFILE")
args = parser.parse_args()

if os.path.exists(args.model_path):
    agent_policy, obs = torch.load(args.model_path)

env_confs = json.load(open(args.env_config, 'r'))
test_env_conf = env_confs['Test']
test_envs = [v for _, v in test_env_conf.items()]
print("test_envs:", test_envs)

# Step 1: Test the agent against 1 env
# Step 2: Test the agent against all the test env
test_env = test_envs[0]
env = make_local_env(game=test_env['game'], state=test_env['level'])
obs = env.reset()
env.render('human')
Example #8
saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir, args.env),
                         map_location=lambda storage, loc: storage)

log = {}
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))

d_args = vars(args)
for k in d_args.keys():
    log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

env = make_local_env(env_conf["game"],
                     env_conf["level"],
                     stack=False,
                     scale_rew=False)
num_tests = 0
reward_total_sum = 0
player = Agent(None, env, args, None)
player.model = A3Clstm(player.env.observation_space.shape[0],
                       player.env.action_space)
player.gpu_id = gpu_id
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()
if args.new_gym_eval:
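    # Record evaluation episodes with the Gym Monitor wrapper.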
    player.env = gym.wrappers.Monitor(player.env,
                                      "{}_monitor".format(args.env),
                                      force=True)
Example #9
parser.add_argument('--num-stack', type=int, default=4,
                    help='number of frames to stack (default: 4)')
parser.add_argument('--model-path', default='./trained_models/ppo/Sonic-GHZA1.pt',
                    help='Path to the agent Policy to be loaded (default: ./trained_models/ppo/Sonic-GHZA1.pt)')
parser.add_argument('--add-timestep', action='store_true', default=False,
                    help='add timestep to observations')
parser.add_argument('--num-episodes', type=int, default=100,
                    help="Number of episodes to test/run the agent for")
parser.add_argument('--log-dir', type=str, default='logs',
                    help='Log directory to store the tensorboard summary files')

#summary_file_path_prefix =
writer = SummaryWriter()

args = parser.parse_args()

env = make_local_env(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1',
                     stack=False, scale_rew=False)

actor_critic, ob_rms, saved_rew = torch.load(args.model_path)
print("Loaded Policy that got a mean reward of:", saved_rew)

render_func = env.render

obs_shape = env.observation_space.shape
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
current_obs = torch.zeros(1, *obs_shape)
states = torch.zeros(1, actor_critic.state_size)
masks = torch.zeros(1, 1)


def update_current_obs(obs):
    shape_dim0 = env.observation_space.shape[0]
Example #10
gamma = 0.99
lam = 0.95
cliprange = 0.2
vf_coef = 0.5
max_grad_norm = 0.5  # max global norm for gradient clipping
nbatch_act = 1  # number of envs
nminibatches = 4
lr = 2e-4  # learning rate
noptepochs = 4
nbatch_train = horizon // nminibatches  # size of each training minibatch

config = tf.ConfigProto()
config.gpu_options.allow_growth = True

env = make_local_env(game='SonicTheHedgehog-Genesis',
                     state='GreenHillZone.Act1',
                     stack=True,
                     scale_rew=True)
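# Only the observation/action spaces are needed below, so the env can be closed immediately.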
env.close()

train_data = pd.read_csv('../sonic-train.csv')
levels = []
for index, level in train_data.iterrows():
    levels.append((level.game, level.state))

with tf.Session(config=config):
    model = ppo2.Model(policy=policies.CnnPolicy,
                       ob_space=env.observation_space,
                       ac_space=env.action_space,
                       nbatch_act=nbatch_act,
                       nsteps=steps_per_ep,
                       nbatch_train=nbatch_train,