Example #1
def main():
    set_global_seeds(2021)
    args = parse_args()

    train_data = pd.read_pickle(args.train_file)
    valid_data = pd.read_pickle(args.valid_file)
    test_data = pd.read_pickle(args.test_file)

    word2vec = Word2Vec.load(args.emb_file).wv
    vocab_size = word2vec.vectors.shape[0]
    args.embed_size = word2vec.vectors.shape[1]
    embeddings = np.zeros((vocab_size + 1, args.embed_size), dtype="float32")
    embeddings[:vocab_size] = word2vec.vectors

    if not args.use_pretrain:
        embeddings = None
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    log_file = os.path.join(args.log_dir, '%d.log' % time.time())

    model = CodeModel(args,
                      vocab_size + 1,
                      label_size=104,
                      log_file=log_file,
                      pretrain_emb=embeddings)
    if not args.only_test:
        model.train(train_data, valid_data, test_data)

    model_path = os.path.join(args.model_dir, 'best.model')
    model.load_model(model_path)
    test_acc = model.evaluate(test_data)
    print("Using model %s, Test Acc: %.4f" % (model_path, test_acc))
Example #2
def train(args, n_actors, batch_queue, prios_queue, param_queue):
    env = wrapper.make_atari(args.env)
    env = wrapper.wrap_atari_dqn(env, args)
    utils.set_global_seeds(args.seed, use_torch=True)

    model = DuelingDQN(env, args).to(args.device)
    # model.load_state_dict(torch.load('model_30h.pth'))
    tgt_model = DuelingDQN(env, args).to(args.device)
    tgt_model.load_state_dict(model.state_dict())

    writer = SummaryWriter(comment="-{}-learner".format(args.env))
    optimizer = torch.optim.Adam(model.parameters(), args.lr)
    # optimizer = torch.optim.RMSprop(model.parameters(), args.lr, alpha=0.95, eps=1.5e-7, centered=True)

    check_connection(n_actors)

    param_queue.put(model.state_dict())
    learn_idx = 0
    ts = time.time()
    tb_dict = {
        k: []
        for k in ['loss', 'grad_norm', 'max_q', 'mean_q', 'min_q']
    }
    while True:
        *batch, idxes = batch_queue.get()
        loss, prios, q_values = utils.compute_loss(model, tgt_model, batch,
                                                   args.n_steps, args.gamma)
        grad_norm = utils.update_parameters(loss, model, optimizer,
                                            args.max_norm)
        prios_queue.put((idxes, prios))
        batch, idxes, prios = None, None, None
        learn_idx += 1

        tb_dict["loss"].append(float(loss))
        tb_dict["grad_norm"].append(float(grad_norm))
        tb_dict["max_q"].append(float(torch.max(q_values)))
        tb_dict["mean_q"].append(float(torch.mean(q_values)))
        tb_dict["min_q"].append(float(torch.min(q_values)))

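        # Target update: Polyak-average the online weights into the target when soft updates are
        # enabled; otherwise hard-copy them every target_update_interval steps.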
        if args.soft_target_update:
            tau = args.tau
            for p_tgt, p in zip(tgt_model.parameters(), model.parameters()):
                p_tgt.data *= 1 - tau
                p_tgt.data += tau * p.data
        elif learn_idx % args.target_update_interval == 0:
            print("Updating Target Network..")
            tgt_model.load_state_dict(model.state_dict())
        if learn_idx % args.save_interval == 0:
            print("Saving Model..")
            torch.save(model.state_dict(), "model.pth")
        if learn_idx % args.publish_param_interval == 0:
            param_queue.put(model.state_dict())
        if learn_idx % args.tb_interval == 0:
            bps = args.tb_interval / (time.time() - ts)
            print("Step: {:8} / BPS: {:.2f}".format(learn_idx, bps))
            writer.add_scalar("learner/BPS", bps, learn_idx)
            for k, v in tb_dict.items():
                writer.add_scalar(f'learner/{k}', np.mean(v), learn_idx)
                v.clear()
            ts = time.time()
Example #3
def main():
    loadpath = "./data/yelp_short_s10.p"
    x = cPickle.load(open(loadpath, "rb"))
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    wordtoix, ixtoword = x[6], x[7]

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    set_global_seeds(opt.seed)
    opt.n_words = len(ixtoword)
    sys.stdout = open(opt.log_path + '.log.txt', 'w')

    print(datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    if opt.part_data:
        # np.random.seed(123)
        train_ind = np.random.choice(
            len(train_lab),
            int(len(train_lab) * opt.train_percent / 100),
            replace=False)
        train = [train[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]

    run_model(opt, train, val, test, train_lab, val_lab, test_lab, wordtoix,
              ixtoword)
Example #4
async def send_batch_worker(buffer, exe, event, lock, batch_size, beta, actor_num, actor_ips):
    """
    coroutine to send training batches to learner
    """
    seed = int(str(time.time())[-4:])
    utils.set_global_seeds(seed, use_torch=False)
    loop = asyncio.get_event_loop()
    ctx = Context.instance()
    socket = ctx.socket(zmq.DEALER)
    socket.connect("ipc:///tmp/5103.ipc")

    actors_sockets = []
    for i in range(actor_num):
        # One DEALER socket per actor, kept separate from the learner-facing socket above.
        actor_ctx = zmq.Context()
        actor_socket = actor_ctx.socket(zmq.DEALER)
        actor_socket.connect('tcp://{}:51004'.format(actor_ips[i]))
        actors_sockets.append(actor_socket)

    await event.wait()
    while True:
        identity, _ = await socket.recv_multipart(copy=False)
        # TODO: Is there a better way to keep the lock but still make sampling faster?
        async with lock:
            batch = await loop.run_in_executor(exe, sample_batch, buffer, batch_size, beta, actors_sockets)
        await socket.send_multipart([identity, batch], copy=False)
        batch = None
    return True
Example #5
async def main():
    """
    main event loop
    """
    args = argparser()
    utils.set_global_seeds(args.seed, use_torch=False)

    procs = [
        Process(target=recv_batch_device),
        Process(target=recv_prios_device),
        Process(target=send_batch_device),
    ]
    for p in procs:
        p.start()

    buffer = CustomPrioritizedReplayBuffer(args.replay_buffer_size, args.alpha)
    exe = ThreadPoolExecutor()
    event = asyncio.Event()
    lock = asyncio.Lock()

    # TODO: How to decide the proper number of asyncio workers?
    workers = []
    for _ in range(args.n_recv_batch_worker):
        w = recv_batch_worker(buffer, exe, event, lock, args.threshold_size)
        workers.append(w)
    for _ in range(args.n_recv_prios_worker):
        w = recv_prios_worker(buffer, exe, event, lock)
        workers.append(w)
    for _ in range(args.n_send_batch_worker):
        w = send_batch_worker(buffer, exe, event, lock, args.batch_size, args.beta)
        workers.append(w)

    await asyncio.gather(*workers)
    return True
Example #6
def main():
    args = argparser()

    args.clip_rewards = False
    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    seed = args.seed + 1122
    utils.set_global_seeds(seed, use_torch=True)
    env.seed(seed)

    model = DuelingDQN(env)
    model.load_state_dict(torch.load('model.pth', map_location='cpu'))

    episode_reward, episode_length = 0, 0
    state = env.reset()
    while True:
        if args.render:
            env.render()
        action, _ = model.act(torch.FloatTensor(np.array(state)), 0.)
        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            print("Episode Length / Reward: {} / {}".format(
                episode_length, episode_reward))
            episode_reward = 0
            episode_length = 0
Example #7
def main():
    args = parse_args()
    set_global_seeds(666)
    config = read_config(args.config, "TRAIN")
    config_main = read_config(args.config, "MAIN")
    pprint(config)
    factory = Factory(config['train_params'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    callbacks = create_callbacks(config['train_params']['name'], config['dumps'])
    trainer = Runner(stages=config['stages'], factory=factory, callbacks=callbacks, device=device)

    aug_train = AUGMENTATIONS_TRAIN_CROP if config['train_params']['type'] == 'crop' else AUGMENTATIONS_TRAIN
    aug_test = AUGMENTATIONS_TEST_CROP if config['train_params']['type'] == 'crop' else AUGMENTATIONS_TEST

    train_dataset = SegmentationDataset(data_folder=config_main['path_to_data'], transforms=aug_train, phase='train',
                                        activation=config_main['activation'], fold=config['fold'],
                                        empty_mask_params=config['data_params']['empty_mask_increase'])

    val_dataset = SegmentationDataset(data_folder=config_main['path_to_data'], transforms=aug_test, phase='val',
                                      fold=config['fold'], activation=config_main['activation'])

    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=16, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False, num_workers=16)

    os.makedirs(os.path.join(config['dumps']['path'], config['dumps']['weights'], config['train_params']['name']), exist_ok=True)
    shutil.copy(args.config, os.path.join(config['dumps']['path'], config['dumps']['weights'], config['train_params']['name'], args.config.split('/')[-1]))
    trainer.fit(train_loader, val_loader)
Example #8
def exploration(args, actor_id, param_queue):
    writer = SummaryWriter(comment="-{}-eval".format(args.env))

    args.clip_rewards = False
    args.episode_life = False
    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    seed = args.seed + actor_id
    utils.set_global_seeds(seed, use_torch=True)
    env.seed(seed)

    model = DuelingDQN(env, args)

    param = param_queue.get(block=True)
    model.load_state_dict(param)
    param = None
    print("Received First Parameter!")

    episode_reward, episode_length, episode_idx = 0, 0, 0
    state = env.reset()
    tb_dict = {k: [] for k in ['episode_reward', 'episode_length']}
    while True:
        action, _ = model.act(torch.FloatTensor(np.array(state)), 0.)
        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done or episode_length == args.max_episode_length:
            state = env.reset()
            tb_dict["episode_reward"].append(episode_reward)
            tb_dict["episode_length"].append(episode_length)
            episode_reward = 0
            episode_length = 0
            episode_idx += 1
            param = param_queue.get()
            model.load_state_dict(param)
            print(f"{datetime.now()} Updated Parameter..")

            if (episode_idx *
                    args.num_envs_per_worker) % args.tb_interval == 0:
                writer.add_scalar('evaluator/episode_reward_mean',
                                  np.mean(tb_dict['episode_reward']),
                                  episode_idx)
                writer.add_scalar('evaluator/episode_reward_max',
                                  np.max(tb_dict['episode_reward']),
                                  episode_idx)
                writer.add_scalar('evaluator/episode_reward_min',
                                  np.min(tb_dict['episode_reward']),
                                  episode_idx)
                writer.add_scalar('evaluator/episode_reward_std',
                                  np.std(tb_dict['episode_reward']),
                                  episode_idx)
                writer.add_scalar('evaluator/episode_length_mean',
                                  np.mean(tb_dict['episode_length']),
                                  episode_idx)
                tb_dict['episode_reward'].clear()
                tb_dict['episode_length'].clear()
Example #9
def exploration(args, actor_id, n_actors, param_queue, send_queue,
                req_param_queue):
    writer = SummaryWriter(comment="-{}-actor{}".format(args.env, actor_id))

    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    seed = args.seed + actor_id
    utils.set_global_seeds(seed, use_torch=True)
    env.seed(seed)

    model = DuelingDQN(env)
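    # Per-actor exploration rate as in Ape-X: eps_i = eps_base ** (1 + i / (N - 1) * eps_alpha);
    # with eps_base < 1, actor 0 keeps the largest epsilon and the last actor acts almost greedily.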
    epsilon = args.eps_base**(1 + actor_id / (n_actors - 1) * args.eps_alpha)
    storage = BatchStorage(args.n_steps, args.gamma)
    req_param_queue.put(True)
    param = param_queue.get(block=True)
    model.load_state_dict(param)
    param = None
    print("Received First Parameter!")

    episode_reward, episode_length, episode_idx, actor_idx = 0, 0, 0, 0
    state = env.reset()
    while True:
        action, q_values = model.act(torch.FloatTensor(np.array(state)),
                                     epsilon)
        next_state, reward, done, _ = env.step(action)
        com_state = zlib.compress(np.array(state).tobytes())
        storage.add(com_state, reward, action, done, q_values)

        state = next_state
        episode_reward += reward
        episode_length += 1
        actor_idx += 1

        if done or episode_length == args.max_episode_length:
            state = env.reset()
            writer.add_scalar("actor/episode_reward", episode_reward,
                              episode_idx)
            writer.add_scalar("actor/episode_length", episode_length,
                              episode_idx)
            episode_reward = 0
            episode_length = 0
            episode_idx += 1

        if actor_idx % args.update_interval == 0:
            try:
                req_param_queue.put(True)
                param = param_queue.get(block=True)
                model.load_state_dict(param)
                print("Updated Parameter..")
            except queue.Empty:
                pass

        if len(storage) == args.send_interval:
            batch, prios = storage.make_batch()
            send_queue.put((batch, prios))
            batch, prios = None, None
            storage.reset()
Example #10
    def set_random_seed(self, seed):
        if seed is None:
            return

        set_global_seeds(seed)
        if self.env is not None:
            self.env.seed(seed)
            self.env.action_space.np_random.seed(seed)
        self.action_space.seed(seed)
Example #11
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)


    args = parser.parse_args()
    logger.configure(dir=logger.get_dir(), format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()

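    # Derive a distinct seed for each MPI rank from the base seed so workers get independent RNG streams.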
    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus = args.dynamics_bonus
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
        num_timesteps=args.num_timesteps, hps=hps)
Example #12
def train(args, n_actors, batch_queue, prios_queue, param_queue):
    env = RunTagEnv(width=5,
                    height=5,
                    number_of_subordinates=1,
                    max_steps=1000)
    #env = wrapper.make_atari(args.env)
    #env = wrapper.wrap_atari_dqn(env, args)
    utils.set_global_seeds(args.seed, use_torch=True)

    model = DuelingDQN(env).to(args.device)
    tgt_model = DuelingDQN(env).to(args.device)
    tgt_model.load_state_dict(model.state_dict())

    writer = SummaryWriter(comment="-{}-learner".format(args.env))
    # optimizer = torch.optim.Adam(model.parameters(), args.lr)
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    args.lr,
                                    alpha=0.95,
                                    eps=1.5e-7,
                                    centered=True)

    check_connection(n_actors)

    param_queue.put(model.state_dict())
    learn_idx = 0
    ts = time.time()
    while True:
        *batch, idxes = batch_queue.get()
        loss, prios = utils.compute_loss(model, tgt_model, batch, args.n_steps,
                                         args.gamma)
        grad_norm = utils.update_parameters(loss, model, optimizer,
                                            args.max_norm)
        print('Updated parameters!')
        prios_queue.put((idxes, prios))
        batch, idxes, prios = None, None, None
        learn_idx += 1

        writer.add_scalar("learner/loss", loss, learn_idx)
        writer.add_scalar("learner/grad_norm", grad_norm, learn_idx)

        if learn_idx % args.target_update_interval == 0:
            print("Updating Target Network..")
            tgt_model.load_state_dict(model.state_dict())
        if learn_idx % args.save_interval == 0:
            print("Saving Model..")
            torch.save(model.state_dict(), "model.pth")
        if learn_idx % args.publish_param_interval == 0:
            param_queue.put(model.state_dict())
        if learn_idx % args.bps_interval == 0:
            bps = args.bps_interval / (time.time() - ts)
            print("Step: {:8} / BPS: {:.2f}".format(learn_idx, bps))
            writer.add_scalar("learner/BPS", bps, learn_idx)
            ts = time.time()
Example #13
def main():
    learner_ip = get_environ()
    args = argparser()

    writer = SummaryWriter(comment="-{}-eval".format(args.env))

    ctx = zmq.Context()
    param_socket = ctx.socket(zmq.SUB)
    param_socket.setsockopt(zmq.SUBSCRIBE, b'')
    param_socket.setsockopt(zmq.CONFLATE, 1)
    param_socket.connect('tcp://{}:52001'.format(learner_ip))

    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    seed = args.seed + 1122
    utils.set_global_seeds(seed, use_torch=True)
    env.seed(seed)

    model = DuelingDQN(env)

    data = param_socket.recv(copy=False)
    param = pickle.loads(data)
    model.load_state_dict(param)
    print("Loaded first parameter from learner")

    episode_reward, episode_length, episode_idx = 0, 0, 0
    state = env.reset()
    while True:
        if args.render:
            env.render()
        action, _ = model.act(torch.FloatTensor(np.array(state)), 0.01)
        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            writer.add_scalar("eval/episode_reward", episode_reward,
                              episode_idx)
            writer.add_scalar("eval/episode_length", episode_length,
                              episode_idx)
            episode_reward = 0
            episode_length = 0
            episode_idx += 1

            if episode_idx % args.eval_update_interval == 0:
                data = param_socket.recv(copy=False)
                param = pickle.loads(data)
                model.load_state_dict(param)
Example #14
def make_ple_envs(env_id, num_env, seed, start_index=0, *args, **kwargs):
    """
    Create a monitored SubprocVecEnv for PLE.
    """
    def make_env(rank): # pylint: disable=C0111
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank, *args, **kwargs) # TODO should be after the monitor command!
            env = Monitor(env, None, **kwargs)
            # env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), **kwargs)
            return env
        return _thunk
    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)], env_id)
Example #15
    def train(self):
        utils.set_global_seeds(self.seed, use_torch=True)

        learn_idx = 0
        while True:
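            # beta: importance-sampling exponent for prioritized replay, scheduled by learn step.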
            beta = self.beta_by_frame(learn_idx)
            states, actions, rewards, next_states, dones, weights, idxes = self.buffer.sample(
                self.batch_size, beta)
            states = torch.FloatTensor(states).to(self.device)
            actions = torch.LongTensor(actions).to(self.device)
            rewards = torch.FloatTensor(rewards).to(self.device)
            next_states = torch.FloatTensor(next_states).to(self.device)
            dones = torch.FloatTensor(dones).to(self.device)
            weights = torch.FloatTensor(weights).to(self.device)
            batch = (states, actions, rewards, next_states, dones, weights)

            loss, prios = utils.compute_loss(self.model, self.tgt_model, batch,
                                             self.n_step, self.gamma)

            self.scheduler.step()
            grad_norm = utils.update_parameters(loss, self.model,
                                                self.optimizer, self.max_norm)

            self.buffer.update_priorities(idxes, prios)

            batch, idxes, prios = None, None, None
            learn_idx += 1

            self.writer.add_scalar("learner/loss", loss, learn_idx)
            self.writer.add_scalar("learner/grad_norm", grad_norm, learn_idx)

            if learn_idx % self.target_update_interval == 0:
                print("Updating Target Network..")
                self.tgt_model.load_state_dict(self.model.state_dict())
            if learn_idx % self.save_interval == 0:
                print("Saving Model..")
                torch.save(self.model.state_dict(),
                           "model{}.pth".format(learn_idx))
            if learn_idx % self.publish_param_interval == 0:
                self.batch_recorder.set_worker_weights(
                    copy.deepcopy(self.model))
            if learn_idx >= self.max_step:
                torch.save(self.model.state_dict(),
                           "model{}.pth".format(learn_idx))
                self.batch_recorder.cleanup()
                break
Example #16
async def send_batch_worker(buffer, exe, event, lock, batch_size, beta):
    """
    coroutine to send training batches to learner
    """
    seed = int(str(time.time())[-4:])
    utils.set_global_seeds(seed, use_torch=False)
    loop = asyncio.get_event_loop()
    ctx = Context.instance()
    socket = ctx.socket(zmq.DEALER)
    socket.connect("ipc:///tmp/5103.ipc")
    await event.wait()
    while True:
        identity, _ = await socket.recv_multipart(copy=False)
        # TODO: Is there a better way to keep the lock but still make sampling faster?
        async with lock:
            batch = await loop.run_in_executor(exe, sample_batch, buffer, batch_size, beta)
        print('Replay: Sending batch...')
        await socket.send_multipart([identity, batch], copy=False)
        batch = None
    return True
Example #17
def main():
    args = get_args()
    utils.set_global_seeds(args.seed)

    env = make_atari_env(args.env, args.seed)
    benchmark_env = make_atari_env(args.env, args.seed+1)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4)

    n_timesteps = 10000000
    learning_starts = 50000
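    # Epsilon schedule: hold at 1.0 until learning starts, then anneal to 0.1 over the next 1e6 steps
    # (assuming PiecewiseSchedule interpolates linearly between the given endpoints).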
    exploration_schedule = utils.PiecewiseSchedule(
                               [(0, 1.0), (learning_starts, 1.0), (learning_starts + 1e6, 0.1)],
                               outside_value=0.1,
                           )

    replay_memory = NStepReplayMemory(
                        size=1000000,
                        history_len=args.history_len,
                        discount=0.99,
                        nsteps=args.nsteps,
                    )

    q_func = AtariRecurrentConvNet() if args.recurrent else AtariConvNet()

    dqn.learn(
        env,
        benchmark_env,
        q_func,
        replay_memory,
        optimizer=optimizer,
        exploration=exploration_schedule,
        max_timesteps=n_timesteps,
        batch_size=32,
        learning_starts=learning_starts,
        learning_freq=4,
        target_update_freq=10000,
        grad_clip=40.,
        log_every_n_steps=50000,
    )
    env.close()
Example #18
def main():
    loadpath = "./data/yelp_short_s10.p"
    x = cPickle.load(open(loadpath, "rb"))
    train, val, test = x[0], x[1], x[2]
    train_lab, val_lab, test_lab = x[3], x[4], x[5]
    wordtoix, ixtoword = x[6], x[7]

    train_lab = np.array(train_lab, dtype='float32')
    val_lab = np.array(val_lab, dtype='float32')
    test_lab = np.array(test_lab, dtype='float32')

    opt = Options()
    set_global_seeds(opt.seed)
    opt.n_words = len(ixtoword)
    sys.stdout = open(opt.log_path + '.log.txt', 'w')

    print(datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    run_model(opt, train, val, test, test_lab, wordtoix, ixtoword)
Example #19
def exploration_eval(args, actor_id, param_queue):
    writer = SummaryWriter(comment="-{}-eval".format(args.env))

    args.clip_rewards = False
    env = make_atari(args.env)
    env = wrap_atari_dqn(env, args)

    seed = args.seed + actor_id
    utils.set_global_seeds(seed, use_torch=True)
    env.seed(seed)

    model = DuelingDQN(env)

    param = param_queue.get(block=True)
    model.load_state_dict(param)
    param = None
    print("Received First Parameter!")

    episode_reward, episode_length, episode_idx = 0, 0, 0
    state = env.reset()
    while True:
        action, _ = model.act(torch.FloatTensor(np.array(state)), 0.)
        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done or episode_length == args.max_episode_length:
            state = env.reset()
            writer.add_scalar("evaluator/episode_reward", episode_reward,
                              episode_idx)
            writer.add_scalar("evaluator/episode_length", episode_length,
                              episode_idx)
            episode_reward = 0
            episode_length = 0
            episode_idx += 1
            param = param_queue.get()
            model.load_state_dict(param)
            print("Updated Parameter..")
Example #20
def main():
    seed = 0
    utils.set_global_seeds(seed)

    name = 'CartPole-v0'
    env = make_continuouscontrol_env(name, seed)
    benchmark_env = make_continuouscontrol_env(name, seed + 1)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

    n_timesteps = 500000
    learning_starts = 50000
    exploration_schedule = utils.PiecewiseSchedule(
        [(0, 1.0), (learning_starts, 1.0), (learning_starts + 3e5, 0.1)],
        outside_value=0.1,
    )

    replay_memory = NStepReplayMemory(
        size=500000,
        history_len=1,
        discount=0.99,
        nsteps=1,
    )

    dqn.learn(
        env,
        benchmark_env,
        CartPoleNet(),
        replay_memory,
        optimizer=optimizer,
        exploration=exploration_schedule,
        max_timesteps=n_timesteps,
        batch_size=32,
        learning_starts=learning_starts,
        learning_freq=4,
        target_update_freq=10000,
        log_every_n_steps=10000,
    )
    env.close()
Example #21
def start_experiment(**args):
    # create environment
    # coinrun environment is already vectorized
    env, test_env = make_env_all_params(args=args)

    # set random seeds for reproducibility
    utils.set_global_seeds(seed=args['seed'])

    # create tf.session
    tf_sess = utils.setup_tensorflow_session()

    if args['server_type'] == 'local':
        logger_context = logger.scoped_configure(dir=args['log_dir'],
                                                 format_strs=['stdout', 'csv'])
    else:
        logger_context = logger.scoped_configure(dir=args['log_dir'],
                                                 format_strs=['csv'])

    with logger_context, tf_sess:
        print("logging directory: {}".format(args['log_dir']))

        # create trainer
        trainer = Trainer(env=env, test_env=test_env, args=args)

        if args['evaluation'] == 1:
            # load_path is changed to model_path
            print('run.py, def start_experiment, evaluating model: {}'.format(
                args['load_path']))
            trainer.eval()

        # this is for visualizing the loss landscape
        elif args['visualize'] == 1:
            print('running visualization...')
            trainer.visualize()
        else:
            print('run.py, def start_experiment, training begins...')
            trainer.train()
Example #22
def learn(policy,
          env,
          test_env,
          seed,
          total_timesteps,
          log_interval,
          test_interval,
          show_interval,
          logdir,
          lr,
          max_grad_norm,
          units_per_hlayer,
          activ_fcn,
          gamma=0.99,
          vf_coef=0.5,
          ent_coef=0.01,
          batch_size=5,
          early_stop=False,
          keep_model=2,
          save_model=True,
          restore_model=False,
          save_traj=False):
    logger = logging.getLogger(__name__)
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  lr=lr,
                  max_grad_norm=max_grad_norm,
                  activ_fcn=activ_fcn,
                  units_per_hlayer=units_per_hlayer,
                  log_interval=log_interval,
                  logdir=logdir,
                  nenvs=nenvs,
                  batch_size=batch_size,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  keep_model=keep_model
                  # total_timesteps=total_timesteps,
                  )

    sum_write = model.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(
            logdir, ('lr' + str(lr) + '_tracking_results.csv'))
    else:
        rew_results_path = None

    i_sample, i_train = 0, 0
    return_threshold = -0.05
    horizon = 100
    avg_rm = deque(maxlen=30)

    runner = Runner(env,
                    model,
                    nsteps=batch_size,
                    gamma=gamma,
                    horizon=horizon,
                    show_interval=show_interval,
                    summary_writer=sum_write)

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load the pre-trained model and set network parameters
                model.load(os.path.join(logdir, el[:-5]))
                # Reset global step parameter.
                model.sess.run(model.global_step.assign(0))

    logger.info('Start Training')
    breaked = False
    nbatch = nenvs * batch_size
    tstart = time.time()
    max_returns = deque([50],
                        maxlen=7)  # returns of the 7 best training episodes
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, actions, values, reward_window, raw_rewards = runner.run()
        if rew_results_path is not None:
            rew_traj.append(raw_rewards)
        policy_loss, value_loss, policy_entropy, ap = model.train(
            obs, states, rewards, actions, values)
        if test_interval > 0 and i_train > 0 and (update % test_interval == 0):
            ep_return = model.test_run(
                test_env, n_eps=10, n_pipes=2000
            )  # TODO: test whether results.csv is saved properly
            with open(result_path, "a") as csvfile:
                writer = csv.writer(csvfile)
                ep_return[0:0] = [i_sample, i_train]
                writer.writerow(ep_return)

        # Log the performance during training at every update step.
        # Save the current model if the average reward of the last
        # 100 time steps is above the return threshold
        if ('ContFlappyBird' in env.env_id):
            saved = False
            for i, rw in enumerate(reward_window):
                rm = sum(rw) / horizon
                if sum_write is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(
                        tag='envs/environment%s/isample_return' % i,
                        simple_value=rm)
                    sum_write.add_summary(s_summary, i_sample)

                    t_summary = tf.Summary()
                    t_summary.value.add(
                        tag='envs/environment%s/itrain_return' % i,
                        simple_value=rm)
                    sum_write.add_summary(t_summary, i_train)
                    sum_write.flush()
                # logger.info(rm)
                if save_model and not saved and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' %
                                return_threshold)
                    model.save('inter_model')
                    saved = True
                avg_rm.append(rm)

        if early_stop:
            if (i_sample > 500000) and (
                    i_sample <= 500000 + nbatch
            ):  # TODO: how to determine the early-stopping criterion automatically rather than heuristically? Maybe the BOHB algorithm?
                if (sum(avg_rm) / 30) <= -0.88:
                    print('breaked')
                    breaked = True
                    break
        i_sample += nbatch
        i_train += 1

    if save_model:
        model.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' %
                i_sample)
    logger.info(
        'Total number of finished episodes during training: sum(%s) = %s' %
        (runner.ep_idx, sum(runner.ep_idx)))
    logger.info('Total number of parameter updates during training: %s' %
                i_train)
    logger.info('*******************************************************\n')

    return breaked
Example #23
def main():
    parser = arg_parser()
    parser.add_argument('--env',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--max_episode_steps', type=int, default=4500)

    parser.add_argument('--num-timesteps', type=int, default=int(1e8))
    parser.add_argument('--num_env', type=int, default=128)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu',
                        type=int,
                        default=0)
    parser.add_argument('--update_ob_stats_from_random_agent',
                        type=int,
                        default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update',
                        type=float,
                        default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy',
                        type=str,
                        default='cnn',
                        choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=0.)
    parser.add_argument('--beta', type=float, default=1e-3)
    parser.add_argument('--exploration_type', type=str, default='bottleneck')
    parser.add_argument('--noise_type',
                        type=str,
                        default='none',
                        choices=['none', 'box'])
    parser.add_argument('--noise_p', type=float, default=0.1)
    parser.add_argument('--use_sched', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default='none')

    args = parser.parse_args()
    if args.policy == 'rnn':
        args.gamma_ext = 0.999
    else:
        args.gamma_ext = 0.99

    logger_dir = './results/' + args.env.replace("NoFrameskip-v4", "")
    logger_dir += datetime.datetime.now().strftime("-%m-%d-%H-%M-%S")
    logger.configure(dir=logger_dir,
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])

    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'),
                  'w') as f:
            f.write(args.tag)

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        exploration_type=args.exploration_type,
        beta=args.beta,
        noise_type=args.noise_type,
        noise_p=args.noise_p,
        use_sched=args.use_sched,
        exp_name=args.exp_name,
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps)
Example #24
def learn(policy,
          env,
          test_env,
          seed,
          total_timesteps,
          log_interval,
          test_interval,
          show_interval,
          logdir,
          lr,
          max_grad_norm,
          units_per_hlayer,
          activ_fcn,
          gamma=0.99,
          vf_coef=0.5,
          ent_coef=0.01,
          nsteps=5,
          lam=0.95,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          early_stop=False,
          keep_model=2,
          save_model=True,
          restore_model=False,
          save_traj=False):

    total_timesteps = int(total_timesteps)

    logger = logging.getLogger(__name__)
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches  # TODO number of samples per minibatch in an optimization episode

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nenvs=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               activ_fcn=activ_fcn,
                               units_per_hlayer=units_per_hlayer,
                               log_interval=log_interval,
                               logdir=logdir,
                               keep_model=keep_model,
                               lr=lr,
                               cliprange=cliprange)
    model = make_model()

    sum_write = model.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(
            logdir, ('lr' + str(lr) + '_tracking_results.csv'))
    else:
        rew_results_path = None

    i_sample, i_train = 0, 0
    return_threshold = -2.
    horizon = 100
    avg_rm = deque(maxlen=30)

    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    horizon=horizon,
                    show_interval=show_interval,
                    summary_writer=sum_write)

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load the pre-trained model and set network parameters
                model.load(os.path.join(logdir, el[:-5]))
                # Reset global step parameter.
                model.sess.run(model.global_step.assign(0))

    logger.info('Start Training')
    breaked = False

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0  # nbatch should be a multiple of nminibatches
        obs, returns, masks, actions, values, neglogpacs, states, reward_window, rewards = \
            runner.run()  #pylint: disable=E0632  # returns are estimates of the discounted reward

        if rew_results_path is not None:
            rew_traj.append(rewards)

        nbatch_train = nbatch // nminibatches  # number of samples per minibatch
        tstart = time.time()
        # frac = 1.0 - (update - 1.0) / nupdates  # converges to 0
        # lrnow = lr(frac)  #
        # cliprangenow = cliprange(frac)  # cliprange converges to 0

        # Update step
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)  #
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    # mblossvals.append(model.train(lrnow, cliprangenow, *slices))
                    mblossvals.append(model.train(*slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches  # minibatch contains batch data from several envs.
            envinds = np.arange(nenvs, dtype=np.int32)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = np.array(
                        envinds[start:end]
                    )  # TODO int() does not work here. ensure that indices are integers beforehand
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    if nenvs == 1:
                        mbstates = states[:]
                    else:
                        if type(states) == tuple or type(
                                states
                        ) == tf.contrib.rnn.LSTMStateTuple:  # LSTM state
                            mbstates = [el[mbenvinds] for el in states]
                        else:  # GRU state
                            mbstates = states[mbenvinds]
                    # mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))
                    mblossvals.append(model.train(*slices, mbstates))

        if test_interval > 0 and i_train > 0 and (update % test_interval == 0):
            ep_return = model.test_run(
                test_env, n_eps=10, n_pipes=2000
            )  # TODO: test whether results.csv is saved properly
            with open(result_path, "a") as csvfile:
                writer = csv.writer(csvfile)
                ep_return[0:0] = [i_sample, i_train]
                writer.writerow(ep_return)

        if ('ContFlappyBird' in env.env_id):
            saved = False
            for i, rw in enumerate(reward_window):
                rm = sum(rw) / horizon
                if sum_write is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(
                        tag='envs/environment%s/isample_return' % i,
                        simple_value=rm)
                    sum_write.add_summary(s_summary, i_sample)

                    t_summary = tf.Summary()
                    t_summary.value.add(
                        tag='envs/environment%s/itrain_return' % i,
                        simple_value=rm)
                    sum_write.add_summary(t_summary, i_train)
                    sum_write.flush()
                if save_model and not saved and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' %
                                return_threshold)
                    model.save('inter_model')
                    saved = True
                avg_rm.append(rm)

        if early_stop:
            if (i_sample > 500000) and (
                    i_sample <= 500000 + nbatch
            ):  # TODO: how to determine the early-stopping criterion automatically rather than heuristically? Maybe the BOHB algorithm?
                if (sum(avg_rm) / 30) <= -0.88:
                    print('breaked')
                    breaked = True
                    break
        i_sample += nbatch
        i_train += 1

    if save_model:
        model.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' %
                i_sample)
    logger.info(
        'Total number of finished episodes during training: sum(%s) = %s' %
        (runner.ep_idx, sum(runner.ep_idx)))
    logger.info('Total number of parameter updates during training: %s' %
                i_train)
    logger.info('*******************************************************\n')

    return breaked
Example #25
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument(
        "--num-timesteps",
        type=int,
        default=int(1e12),
    )
    parser.add_argument(
        "--num_env",
        type=int,
        default=32,
    )
    parser.add_argument(
        "--use_news",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--gamma",
        type=float,
        default=0.99,
    )
    parser.add_argument(
        "--gamma_ext",
        type=float,
        default=0.999,
    )
    parser.add_argument(
        "--lam",
        type=float,
        default=0.95,
    )
    parser.add_argument(
        "--update_ob_stats_every_step",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--update_ob_stats_independently_per_gpu",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--update_ob_stats_from_random_agent",
        type=int,
        default=1,
    )
    parser.add_argument(
        "--proportion_of_exp_used_for_predictor_update",
        type=float,
        default=1.0,
    )
    parser.add_argument(
        "--tag",
        type=str,
        default="",
    )
    parser.add_argument(
        "--policy",
        type=str,
        default="cnn",
        choices=["cnn", "rnn", "ffnn"],
    )
    parser.add_argument(
        "--int_coeff",
        type=float,
        default=1.0,
    )
    parser.add_argument(
        "--ext_coeff",
        type=float,
        default=2.0,
    )
    parser.add_argument(
        "--dynamics_bonus",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--meta_rl",
        type=lambda x: True if x.lower() in {'true', 't'} else False,
        default=False,
    )

    args = parser.parse_args()
    logger.configure(
        dir=logger.get_dir(),
        format_strs=["stdout", "log", "csv"]
        if MPI.COMM_WORLD.Get_rank() == 0 else [],
    )
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), "experiment_tag.txt"),
                  "w") as f:
            f.write(args.tag)

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
        meta_rl=args.meta_rl,
    )

    tf_util.make_session(make_default=True)
    train(
        env_id=args.env,
        num_env=args.num_env,
        seed=seed,
        num_timesteps=args.num_timesteps,
        hps=hps,
    )
Example #26
def train(variant):
    set_global_seeds(variant['seed'])

    if variant['mode'] == 'local':
        import colored_traceback.always
    '''
    Set-up folder and files
    '''
    snapshot_dir = logger.get_snapshot_dir()
    working_dir = config.PROJECT_PATH
    param_path = os.path.join(working_dir, 'params/params.json')
    # copyfile(param_path, os.path.join(snapshot_dir,'params.json'))

    try:
        '''
        Save parameters
        '''
        if 'params' in variant:
            logger.log('Load params from variant.')
            params = variant['params']
        else:
            logger.log('Load params from file.')
            with open(param_path, 'r') as f:
                params = json.load(f)

        # Save to snapshot dir
        new_param_path = os.path.join(snapshot_dir, 'params.json')
        with open(new_param_path, 'w') as f:
            json.dump(params,
                      f,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))

        # TODO: can use variant to modify here.
        dynamics_opt_params = params['dynamics_opt_params']
        dynamics_opt_params['stop_critereon'] = stop_critereon(
            threshold=dynamics_opt_params['stop_critereon']['threshold'],
            offset=dynamics_opt_params['stop_critereon']['offset'])
        dynamics_opt_params = Dynamics_opt_params(**dynamics_opt_params)

        policy_opt_params = params['policy_opt_params']
        policy_opt_params['stop_critereon'] = stop_critereon(
            threshold=policy_opt_params['stop_critereon']['threshold'],
            offset=policy_opt_params['stop_critereon']['offset'],
            percent_models_threshold=policy_opt_params['stop_critereon']
            ['percent_models_threshold'])
        policy_opt_params = Policy_opt_params(**policy_opt_params)

        rollout_params = params['rollout_params']
        rollout_params['monitorpath'] = os.path.join(snapshot_dir, 'videos')
        rollout_params = Rollout_params(**rollout_params)

        assert params['rollout_params']['max_timestep'] == \
               params['policy_opt_params']['oracle_maxtimestep'] == \
               params['policy_opt_params']['T']
        '''
        Policy model
        '''
        def build_policy_from_rllab(scope_name='training_policy'):
            '''
            Return both rllab policy and policy model function.
            '''
            sess = tf.get_default_session()

            ### Initialize training_policy to copy from policy
            from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
            output_nonlinearity = eval(params['policy']['output_nonlinearity'])

            training_policy = GaussianMLPPolicy(
                name=scope_name,
                env_spec=env.spec,
                hidden_sizes=params['policy']['hidden_layers'],
                init_std=policy_opt_params.trpo['init_std'],
                output_nonlinearity=output_nonlinearity)
            training_policy_vars = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope='training_policy')
            sess.run([tf.variables_initializer(training_policy_vars)])

            ### Compute policy model function using the same weights.
            training_layers = training_policy._mean_network.layers

            def policy_model(x, stochastic=0.0, collect_summary=False):
                assert (training_layers[0].shape[1] == x.shape[1])
                h = x
                for i, layer in enumerate(training_layers[1:]):
                    w = layer.W
                    b = layer.b
                    pre_h = tf.matmul(h, w) + b
                    h = layer.nonlinearity(pre_h, name='policy_out')
                    if collect_summary:
                        with tf.name_scope(scope_name + '/observation'):
                            variable_summaries(x)
                        with tf.name_scope(scope_name + '/layer%d' % i):
                            with tf.name_scope('weights'):
                                variable_summaries(w)
                            with tf.name_scope('biases'):
                                variable_summaries(b)
                            with tf.name_scope('Wx_plus_b'):
                                tf.summary.histogram('pre_activations', pre_h)
                            tf.summary.histogram('activations', h)
                std = training_policy._l_std_param.param
                h += stochastic * tf.random_normal(
                    shape=(tf.shape(x)[0], n_actions)) * tf.exp(std)
                return h

            return training_policy, policy_model

        '''
        Dynamics model
        '''

        def get_value(key, params_dict):
            # Equivalent to params_dict.get(key, False): returns the stored value
            # if the key is present, otherwise False. The parameter is renamed to
            # avoid shadowing the built-in dict.
            return params_dict.get(key, False)

        def prepare_input(xgu, xgu_norm, scope_name, variable_name,
                          collect_summary, prediction_type):
            name_scope = '%s/%s' % (scope_name, variable_name)
            assert n_states > 1 and n_actions > 1 \
                   and xgu.shape[1] == n_states + n_actions + n_goals
            xu = tf.concat([xgu[:, :n_states], xgu[:, n_states + n_goals:]],
                           axis=1)
            xu_norm = tf.concat(
                [xgu_norm[:, :n_states], xgu_norm[:, n_states + n_goals:]],
                axis=1)
            # Collect data summaries
            if collect_summary:
                with tf.name_scope(name_scope + '/inputs'):
                    with tf.name_scope('states'):
                        data_summaries(xgu[:, :n_states])
                    with tf.name_scope('goals'):
                        data_summaries(xgu[:, n_states:n_states + n_goals])
                    with tf.name_scope('actions'):
                        data_summaries(xgu[:, n_states + n_goals:])
            # Ignore xy in the current state.
            if get_value('ignore_xy_input', params['dynamics_model']):
                n_inputs = n_states + n_actions - 2
                nn_input = xu_norm[:, 2:]
            elif get_value('ignore_x_input', params['dynamics_model']):
                n_inputs = n_states + n_actions - 1
                nn_input = xu_norm[:, 1:]
            else:
                n_inputs = n_states + n_actions
                nn_input = xu_norm
            hidden_layers = list(params['dynamics_model']['hidden_layers'])
            nonlinearity = [
                eval(_x) for _x in params['dynamics_model']['nonlinearity']
            ]
            assert (len(nonlinearity) == len(hidden_layers))
            # Verify if the input type is valid.
            if prediction_type == 'state_change' or \
                            prediction_type == 'state_change_goal':
                n_outputs = n_states
            else:
                assert prediction_type == 'second_derivative' or \
                       prediction_type == 'second_derivative_goal'
                n_outputs = int(n_states / 2)
            nonlinearity.append(tf.identity)
            hidden_layers.append(n_outputs)
            return xu, nn_input, n_inputs, n_outputs, \
                   nonlinearity, hidden_layers

        def build_ff_neural_net(nn_input,
                                n_inputs,
                                hidden_layers,
                                nonlinearity,
                                scope_name,
                                variable_name,
                                collect_summary,
                                logit_weights=None,
                                initializer=layers.xavier_initializer()):
            assert len(hidden_layers) == len(nonlinearity)
            name_scope = '%s/%s' % (scope_name, variable_name)
            h = nn_input
            n_hiddens = n_inputs
            n_hiddens_next = hidden_layers[0]
            for i in range(len(hidden_layers)):
                w = get_scope_variable(scope_name,
                                       "%s/layer%d/weights" %
                                       (variable_name, i),
                                       shape=(n_hiddens, n_hiddens_next),
                                       initializer=initializer)
                b = get_scope_variable(scope_name,
                                       "%s/layer%d/biases" %
                                       (variable_name, i),
                                       shape=(n_hiddens_next,),
                                       initializer=initializer)
                if collect_summary:
                    with tf.name_scope(name_scope + '/layer%d' % i):
                        with tf.name_scope('weights'):
                            variable_summaries(w)
                        with tf.name_scope('biases'):
                            variable_summaries(b)
                        with tf.name_scope('Wx_plus_b'):
                            pre_h = tf.matmul(h, w) + b
                            tf.summary.histogram('pre_activations', pre_h)
                        h = nonlinearity[i](pre_h, name='activation')
                        tf.summary.histogram('activations', h)
                else:
                    pre_h = tf.matmul(h, w) + b
                    h = nonlinearity[i](pre_h, name='activation')
                n_hiddens = hidden_layers[i]
                if i + 1 < len(hidden_layers):
                    n_hiddens_next = hidden_layers[i + 1]
                if logit_weights is not None and i == len(hidden_layers) - 2:
                    h *= logit_weights
            return h

        def build_dynamics_model(n_states,
                                 n_actions,
                                 n_goals,
                                 dt=None,
                                 input_rms=None,
                                 diff_rms=None):
            prediction_type = params['dynamics_model']['prediction_type']

            def dynamics_model(xgu,
                               scope_name,
                               variable_name,
                               collect_summary=False):
                '''
                :param xu: contains states, goals, actions
                :param scope_name:
                :param variable_name:
                :param dt:
                :return:
                '''
                xu, nn_input, n_inputs, n_outputs, nonlinearity, hidden_layers = \
                    prepare_input(xgu,
                                  (xgu - input_rms.mean)/input_rms.std,
                                  scope_name,
                                  variable_name,
                                  collect_summary,
                                  prediction_type)

                if "use_logit_weights" in params["dynamics_model"] and params[
                        "dynamics_model"]["use_logit_weights"]:
                    logit_weights = build_ff_neural_net(
                        nn_input, n_inputs, hidden_layers[:-1],
                        nonlinearity[:-2] + [tf.nn.sigmoid], scope_name,
                        variable_name + '_sig', collect_summary)
                else:
                    logit_weights = None
                nn_output = build_ff_neural_net(nn_input,
                                                n_inputs,
                                                hidden_layers,
                                                nonlinearity,
                                                scope_name,
                                                variable_name,
                                                collect_summary,
                                                logit_weights=logit_weights)

                # predict the delta instead (x_next-x_current)
                if 'state_change' in prediction_type:
                    next_state = tf.add(
                        diff_rms.mean[:n_states] +
                        diff_rms.std[:n_outputs] * nn_output, xu[:, :n_states])
                else:
                    assert 'second_derivative' in prediction_type
                    # We train 'out' to match state_dot_dot
                    # Currently only works for swimmer.
                    qpos = xu[:, :n_outputs] + dt * xu[:, n_outputs:n_states]
                    qvel = xu[:, n_outputs:n_states] + dt * nn_output
                    next_state = tf.concat([qpos, qvel], axis=1)
                if '_goal' in prediction_type:
                    assert n_goals > 1
                    g = xgu[:, n_states:n_states + n_goals]
                    next_state = tf.concat([next_state, g], axis=1)
                return tf.identity(next_state,
                                   name='%s/%s/dynamics_out' %
                                   (scope_name, variable_name))

            return dynamics_model

        def get_regularizer_loss(scope_name, variable_name):
            if params['dynamics_model']['regularization']['method'] in [
                    None, ''
            ]:
                return tf.constant(0.0, dtype=tf.float32)
            constant = params['dynamics_model']['regularization']['constant']
            regularizer = eval(
                params['dynamics_model']['regularization']['method'])
            hidden_layers = params['dynamics_model']['hidden_layers']
            reg_loss = 0.0
            for i in range(len(hidden_layers) + 1):
                w = get_scope_variable(
                    scope_name, "%s/layer%d/weights" % (variable_name, i))
                b = get_scope_variable(
                    scope_name, "%s/layer%d/biases" % (variable_name, i))
                reg_loss += regularizer(w) + regularizer(b)
            return constant * reg_loss

        '''
        Main
        '''
        # with get_session() as sess:
        if variant['mode'] == 'local':
            sess = get_session(interactive=True, mem_frac=0.1)
        else:
            sess = get_session(interactive=True,
                               mem_frac=1.0,
                               use_gpu=variant['use_gpu'])

        # data = joblib.load(os.path.join(working_dir, params['trpo_path']))
        env = get_env(variant['params']['env'])

        # policy = data['policy']
        training_policy, policy_model = build_policy_from_rllab()
        if hasattr(env._wrapped_env, '_wrapped_env'):
            inner_env = env._wrapped_env._wrapped_env
        else:
            inner_env = env._wrapped_env.env.unwrapped
        n_obs = inner_env.observation_space.shape[0]
        n_actions = inner_env.action_space.shape[0]
        cost_np = inner_env.cost_np
        cost_tf = inner_env.cost_tf
        cost_np_vec = inner_env.cost_np_vec
        if hasattr(inner_env, 'n_goals'):
            n_goals = inner_env.n_goals
            n_states = inner_env.n_states
            assert n_goals + n_states == n_obs
        else:
            n_goals = 0
            n_states = n_obs
        dt = None
        # Only necessary for second_derivative
        if hasattr(inner_env, 'model') and hasattr(inner_env, 'frame_skip'):
            dt = inner_env.model.opt.timestep * inner_env.frame_skip
        from running_mean_std import RunningMeanStd
        with tf.variable_scope('input_rms'):
            input_rms = RunningMeanStd(epsilon=0.0,
                                       shape=(n_states + n_goals + n_actions))
        with tf.variable_scope('diff_rms'):
            diff_rms = RunningMeanStd(epsilon=0.0, shape=(n_states + n_goals))
        dynamics_model = build_dynamics_model(n_states=n_states,
                                              n_actions=n_actions,
                                              n_goals=n_goals,
                                              dt=dt,
                                              input_rms=input_rms,
                                              diff_rms=diff_rms)

        kwargs = {}
        kwargs['input_rms'] = input_rms
        kwargs['diff_rms'] = diff_rms
        kwargs['mode'] = variant['mode']

        if params['algo'] == 'vpg':
            from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
            from algos.vpg import VPG
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = VPG(
                env=env,
                policy=training_policy,
                baseline=baseline,
                batch_size=policy_opt_params.vpg['batch_size'],
                max_path_length=policy_opt_params.T,
                discount=policy_opt_params.vpg['discount'],
            )
            kwargs['rllab_algo'] = algo
            if params["policy_opt_params"]["vpg"]["reset"]:
                kwargs['reset_opt'] = tf.assign(
                    training_policy._l_std_param.param,
                    np.log(params["policy_opt_params"]["vpg"]["init_std"]) *
                    np.ones(n_actions))
        elif params['algo'] == 'trpo':
            ### Write down baseline and algo
            from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
            from algos.trpo import TRPO
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = TRPO(
                env=env,
                policy=training_policy,
                baseline=baseline,
                batch_size=policy_opt_params.trpo['batch_size'],
                max_path_length=policy_opt_params.T,
                discount=policy_opt_params.trpo['discount'],
                step_size=policy_opt_params.trpo['step_size'],
            )
            kwargs['rllab_algo'] = algo
            if params["policy_opt_params"]["trpo"]["reset"]:
                kwargs['reset_opt'] = tf.assign(
                    training_policy._l_std_param.param,
                    np.log(params["policy_opt_params"]["trpo"]["init_std"]) *
                    np.ones(n_actions))
            # if "decay_rate" in params["policy_opt_params"]["trpo"]:
            #     kwargs['trpo_std_decay'] = tf.assign_sub(training_policy._l_std_param.param,
            #     np.log(params["policy_opt_params"]["trpo"]["decay_rate"])*np.ones(n_actions))
        kwargs['inner_env'] = inner_env
        kwargs['algo_name'] = params['algo']
        kwargs['logstd'] = training_policy._l_std_param.param
        # Save initial policy
        joblib.dump(training_policy,
                    os.path.join(snapshot_dir, 'params-initial.pkl'))

        train_models(env=env,
                     dynamics_model=dynamics_model,
                     dynamics_opt_params=dynamics_opt_params,
                     get_regularizer_loss=get_regularizer_loss,
                     policy_model=policy_model,
                     policy_opt_params=policy_opt_params,
                     rollout_params=rollout_params,
                     cost_np=cost_np,
                     cost_np_vec=cost_np_vec,
                     cost_tf=cost_tf,
                     snapshot_dir=snapshot_dir,
                     working_dir=working_dir,
                     n_models=params['n_models'],
                     sweep_iters=params['sweep_iters'],
                     sample_size=params['sample_size'],
                     verbose=False,
                     variant=variant,
                     saved_policy=training_policy,
                     **kwargs)  # Make sure not to reinitialize TRPO policy.

        # Save the final policy
        joblib.dump(training_policy, os.path.join(snapshot_dir, 'params.pkl'))

    except Exception as e:
        rmtree(snapshot_dir)
        import sys, traceback
        # traceback.print_exception(*sys.exc_info())
        from IPython.core.ultratb import ColorTB
        c = ColorTB()
        exc = sys.exc_info()
        print(''.join(c.structured_traceback(*exc)))
        print('Removed the experiment folder %s.' % snapshot_dir)
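
The dynamics model above does not predict the next state directly: the network receives normalized (state, action) inputs and outputs a normalized state delta, which is de-normalized and added to the current state. Below is a minimal NumPy sketch of that "state_change" scheme; the single random weight matrix and the identity normalization statistics are stand-ins for the learned MLP and the RunningMeanStd trackers used in the example.

import numpy as np

n_states, n_actions = 4, 2
rng = np.random.RandomState(0)

# Stand-ins for the input_rms / diff_rms statistics.
input_mean = np.zeros(n_states + n_actions)
input_std = np.ones(n_states + n_actions)
diff_mean = np.zeros(n_states)
diff_std = np.ones(n_states)

# Stand-in for the learned dynamics network.
W = rng.randn(n_states + n_actions, n_states) * 0.1


def predict_next_state(state, action):
    xu = np.concatenate([state, action])
    xu_norm = (xu - input_mean) / input_std       # normalize the network input
    delta_norm = np.tanh(xu_norm @ W)             # network predicts a normalized delta
    delta = diff_mean + diff_std * delta_norm     # de-normalize the predicted delta
    return state + delta                          # x_next = x_current + delta


print(predict_next_state(rng.randn(n_states), rng.randn(n_actions)))
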
Example no. 27
def q_learning(q_network,
               env,
               test_env,
               seed,
               total_timesteps,
               log_interval,
               test_interval,
               show_interval,
               logdir,
               lr,
               max_grad_norm,
               units_per_hlayer,
               activ_fcn,
               gamma=0.95,
               epsilon=0.4,
               epsilon_decay=.95,
               buffer_size=4000,
               batch_size=128,
               trace_length=32,
               tau=0.99,
               update_interval=30,
               early_stop=False,
               keep_model=2,
               save_model=True,
               restore_model=False,
               save_traj=False):
    # """
    # Q-Learning algorithm for off-policy TD control using Function Approximation.
    # Finds the optimal greedy policy while following an epsilon-greedy policy.
    # Implements the options of online learning or using experience replay and also
    # target calculation by target networks, depending on the flags. You can reuse
    # your Q-learning implementation of the last exercise.
    #
    # Args:
    #     env: PLE game
    #     approx: Action-Value function estimator
    #     num_episodes: Number of episodes to run for.
    #     max_time_per_episode: maximum number of time steps before episode is terminated
    #     discount_factor: gamma, discount factor of future rewards.
    #     epsilon: Chance to sample a random action. Float betwen 0 and 1.
    #     epsilon_decay: decay rate of epsilon parameter
    #     use_experience_replay: Indicator if experience replay should be used.
    #     batch_size: Number of samples per batch.
    #     target: Slowly updated target network to calculate the targets. Ignored if None.
    #
    # Returns:
    #     An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    # """
    logger = logging.getLogger(__name__)
    # logger.info(datetime.time)

    tf.reset_default_graph()
    set_global_seeds(seed)

    # Params
    ob_space = env.observation_space
    ac_space = env.action_space
    nd, = ob_space.shape
    n_ac = ac_space.n

    # Create learning agent and the replay buffer
    agent = DQNAgent(q_network=q_network,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     lr=lr,
                     max_grad_norm=max_grad_norm,
                     units_per_hlayer=units_per_hlayer,
                     activ_fcn=activ_fcn,
                     log_interval=log_interval,
                     logdir=logdir,
                     batch_size=batch_size,
                     trace_length=trace_length,
                     update_interval=update_interval,
                     tau=tau,
                     keep_model=keep_model)
    summary_writer = agent.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(
            logdir, ('lr' + str(lr) + '_tracking_results.csv'))
    else:
        rew_results_path = None
    replay_buffer = ReplayBuffer(buffer_size)

    # Keeps track of useful statistics
    stats = {'episode_lengths': [], 'episode_rewards': []}

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load pre trained model and set network parameters
                logger.info('load %s' % os.path.join(logdir, el[:-5]))
                agent.load(os.path.join(logdir, el[:-5]))
                # Reset global step parameter.
                agent.sess.run(agent.global_step.assign(0))

    # ------------------ TRAINING --------------------------------------------
    logger.info("Start Training")
    early_stopped = False
    i_episode, i_sample, i_train = 0, 0, 0
    ep_len, rew = 0, 0  # episode length and return (avoids shadowing built-in len)
    horizon = 100
    reward_window = deque(maxlen=horizon)
    avg_rm = deque(maxlen=30)
    nbatch = batch_size * trace_length
    return_threshold = -0.05  # 40

    # Reset env
    obs = env.reset()
    obs = normalize_obs(obs)
    done = False
    rnn_state0 = agent.step_initial_state
    if rnn_state0 is None:  # If we use a normal feed forward architecture, we sample a batch of single samples, not a batch of sequences.
        trace_length = 1

    # Set the target network to be equal to the primary network
    agent.update_target(agent.target_ops)
    while i_sample < total_timesteps:
        # Epsilon-greedy action selection. agent.step is also called in the
        # random branch to keep the recurrent state up to date.
        if np.random.rand(1) < epsilon:
            _, next_rnn_state = agent.step([obs], rnn_state0)
            action = np.random.randint(0, n_ac)  # random (exploratory) action
        else:
            AP, next_rnn_state = agent.step([obs], rnn_state0)
            action = AP[0]  # greedy action from the Q-network
        next_obs, reward, done, _ = env.step(action)
        next_obs = normalize_obs(next_obs)
        i_sample += 1
        # render only every i-th episode
        if show_interval != 0:
            if i_episode % show_interval == 0:
                env.render()

        ep_len += 1
        rew += reward
        reward_window.append(reward)

        # When episode is done, add episode information to tensorboard summary and stats
        if done:  # env.game_over():
            next_obs = list(np.zeros_like(next_obs, dtype=np.float32))

            stats['episode_lengths'].append(ep_len)
            stats['episode_rewards'].append(rew)

            if summary_writer is not None:
                summary = tf.Summary()
                summary.value.add(
                    tag='envs/ep_return',
                    simple_value=stats['episode_rewards'][i_episode])
                summary.value.add(
                    tag="envs/ep_length",
                    simple_value=stats['episode_lengths'][i_episode])
                summary_writer.add_summary(summary, i_episode)
                summary_writer.flush()

            if save_model and rew > return_threshold:
                return_threshold = rew
                logger.info('Save model at max reward %s' % return_threshold)
                agent.save('inter_model')

            i_episode += 1
            ep_len, rew = 0, 0

        # Update replay buffer
        replay_buffer.add_transition(obs, action, next_obs, reward, done)
        if save_traj:
            rew_traj.append(reward)

        # Update model parameters every 'update_interval' steps, using both recent and replayed experience.
        if replay_buffer.size() > nbatch and (i_sample % update_interval == 0):
            if (env.spec._env_name == 'ContFlappyBird'):
                rm = sum(reward_window) / horizon
                if summary_writer is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(tag='envs/isample_return',
                                        simple_value=rm)
                    summary_writer.add_summary(s_summary, i_sample)
                    summary_writer.flush()
                if save_model and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' %
                                return_threshold)
                    agent.save('inter_model')
                avg_rm.append(rm)

            if early_stop:
                if (i_sample > 60000) and (i_sample <=
                                           (60000 + update_interval)):
                    if (sum(avg_rm) / 30) <= -0.88:
                        print('Early stopping criterion met; stopping training.')
                        early_stopped = True
                        break

            agent.update_target(agent.target_ops)

            # reset rnn state (history knowledge) before every training step
            rnn_state_train = agent.train_initial_state

            # Sample training mini-batch from replay buffer
            if rnn_state_train is not None:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                                                replay_buffer.recent_and_next_batch_of_seq(batch_size, trace_length)
            else:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                                                replay_buffer.recent_and_next_batch(batch_size)

            # Calculate TD target for batch. Use "old" fixed parameters if target network is available
            # to compute targets else use "old" parameters of value function estimate.
            # mb_next_obs = np.reshape(mb_next_obs, (-1, nd))
            mb_next_q_values, _ = agent.target_model.predict(
                mb_next_obs, rnn_state_train)
            mb_best_next_action = np.argmax(mb_next_q_values, axis=1)
            mb_td_target = [
                mb_rewards[j] +
                gamma * mb_next_q_values[j][mb_best_next_action[j]]
                for j in range(nbatch)
            ]

            # Update Q value estimator parameters by optimizing between Q network and Q-learning targets
            loss = agent.train(mb_obs, mb_actions, mb_td_target,
                               rnn_state_train)
            i_train += 1

            # If test_interval > 0 the learned model is evaluated every "test_interval" gradient updates
            if test_interval > 0 and i_train > 0 and (i_train % test_interval
                                                      == 0):
                ep_return = agent.test_run(test_env, n_eps=10, n_pipes=2000)
                with open(result_path, "a") as csvfile:
                    writer = csv.writer(csvfile)
                    ep_return[0:0] = [i_sample, i_train]
                    writer.writerow(ep_return)

        if done:
            # Reset the model
            next_obs = env.reset()
            next_obs = normalize_obs(next_obs)

        epsilon *= epsilon_decay
        obs = next_obs
        rnn_state0 = next_rnn_state

    # Save final model when training is finished.
    if save_model:
        agent.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        logger.info('Save reward trajectory to %s' % rew_results_path)
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' %
                i_sample)
    logger.info('Total number of parameter updates during training: %s' %
                i_train)
    logger.info('*******************************************************\n')

    return early_stopped, i_sample
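
The update step above regresses the Q-network towards TD targets built from the target network's Q-values at the next observation. A minimal NumPy sketch of that batched target computation follows; the (1 - done) mask used here is one common convention for terminal transitions and is an assumption, since the example instead zeroes out the terminal next observation before prediction.

import numpy as np

gamma = 0.95
rewards = np.array([1.0, 0.0, -1.0])
dones = np.array([0.0, 0.0, 1.0])
# Q_target(s', a) for each transition in the batch (two actions here).
next_q_values = np.array([[0.2, 0.8],
                          [0.5, 0.1],
                          [0.3, 0.3]])

best_next_action = np.argmax(next_q_values, axis=1)  # greedy action in s'
td_targets = rewards + gamma * (1.0 - dones) * \
    next_q_values[np.arange(len(rewards)), best_next_action]
print(td_targets)  # values the Q-network is trained to match
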
Example no. 28
    print("---------------------------------------")
    print("Settings: %s" % (file_name))
    print("---------------------------------------")

    writer = SummaryWriter(comment=file_name)

    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")

    env = gym.make(args.env_name)

    # Set seeds
    env.seed(args.seed)
    set_global_seeds(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()
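
Every snippet on this page calls a set_global_seeds helper before interacting with the environment. A minimal sketch of what such a helper typically covers is shown below, assuming a PyTorch workload like the TD3/DDPG example above; the concrete helper used by each snippet may seed more or fewer libraries.

import random

import numpy as np
import torch


def set_global_seeds_sketch(seed):
    random.seed(seed)        # Python's built-in RNG
    np.random.seed(seed)     # NumPy RNG (exploration noise, replay sampling)
    torch.manual_seed(seed)  # PyTorch CPU RNG
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # all CUDA devices


set_global_seeds_sketch(0)
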
Example no. 29
def eval_model(render,
               nepisodes,
               test_steps,
               save_traj=False,
               result_file='test_results.csv',
               **params):
    logger = logging.getLogger(__name__)
    logger.info('Evaluating learning algorithm...\n')
    logger.info(params["eval_model"])

    logger.debug('\nMake Environment with seed %s' % params["seed"])
    # TODO: use a different seed for every run!
    # TODO make non-clipped env, even if agent is trained on clipped env
    ple_env = make_ple_env(params["test_env"], seed=params["seed"])

    tf.reset_default_graph()
    set_global_seeds(params["seed"])
    model_idx = []

    if save_traj:
        result_path = os.path.join(params["logdir"], result_file)
    else:
        result_path = None

    recurrent = (params["architecture"] == 'lstm'
                 or params["architecture"] == 'gru')
    if params["eval_model"] == 'final':
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(
                os.path.join(params["logdir"], '*final_model-*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('final_model')
            f_name = f[idx:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess,
                    logdir=params["logdir"],
                    f_name=f_name,
                    isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 test_steps, render, OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])
                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()

    elif params["eval_model"] == 'inter':
        # Use all stored maximum-performance (intermediate) models.
        avg_performances = []
        var_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*inter*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess,
                    logdir=params["logdir"],
                    f_name=f_name,
                    isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 test_steps, render, OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                var_performances.append(np.var(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
    elif params["eval_model"] == 'analysis':
        # Use every stored model checkpoint (all *.meta files in the log directory).
        avg_performances = []
        std_performances = []
        maximal_returns = []
        for f in glob.glob(os.path.join(params["logdir"], '*.meta')):
            logger.info('Restore model: %s' % f)
            idx = f.find('_model')
            f_name = f[idx - 5:-5]
            model_idx.append(f_name)
            # print(f_name)
            with tf.Session() as sess:
                OBS, RNN_S_IN, RNN_S_OUT, PRED_Q = restore_dqn_model(
                    sess,
                    logdir=params["logdir"],
                    f_name=f_name,
                    isrnn=recurrent)
                logger.info('Run %s evaluation episodes' % nepisodes)
                model_performance = run_episodes(sess, ple_env, nepisodes,
                                                 test_steps, render, OBS,
                                                 RNN_S_IN, RNN_S_OUT, PRED_Q,
                                                 result_path, params["seed"])

                # Add model performance metrics
                avg_performances.append(np.mean(model_performance))
                std_performances.append(np.std(model_performance))
                maximal_returns.append(np.max(model_performance))
            tf.reset_default_graph()
        return model_idx, avg_performances, std_performances

    logger.info(params["logdir"])
    logger.info('Results of the evaluation of the learning algorithm:')
    logger.info('Restored models: %s' % model_idx)
    logger.info('Average performance per model: %s' % avg_performances)
    logger.info('Performance variance per model: %s' % var_performances)
    logger.info('Maximum episode return per model: %s' % maximal_returns)
    ple_env.close()

    if avg_performances:
        return np.mean(avg_performances), np.mean(var_performances), np.mean(
            maximal_returns)
    else:
        return -3000, 3000, -3000
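
restore_dqn_model is not shown in this snippet. A rough sketch of how a TF1 checkpoint can be restored from its .meta file is given below; the tensor names 'obs:0' and 'pred_q:0' are hypothetical placeholders, not the names actually used by the example's graph.

import os

import tensorflow as tf


def restore_from_meta(sess, logdir, f_name):
    # Rebuild the saved graph and load its weights from the matching checkpoint.
    saver = tf.train.import_meta_graph(os.path.join(logdir, f_name + '.meta'))
    saver.restore(sess, os.path.join(logdir, f_name))
    graph = tf.get_default_graph()
    obs_ph = graph.get_tensor_by_name('obs:0')     # hypothetical input placeholder
    pred_q = graph.get_tensor_by_name('pred_q:0')  # hypothetical Q-value output
    return obs_ph, pred_q
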
Example no. 30
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num_timesteps', type=float, default=100e6)
    parser.add_argument('--num_env', type=int, default=128)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--gamma_div', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu',
                        type=int,
                        default=1)
    parser.add_argument('--update_ob_stats_from_random_agent',
                        type=int,
                        default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_updated',
                        type=float,
                        default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy',
                        type=str,
                        default='cnn',
                        choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--save_dir',
                        help="dir to save and log",
                        type=str,
                        default="save_dir")
    parser.add_argument('--load_path',
                        help="dir to load model",
                        type=str,
                        default=None)
    parser.add_argument('--base_load_path',
                        help="dir to load model",
                        type=str,
                        default=None)
    parser.add_argument('--r_path',
                        help="dir to load r network",
                        type=str,
                        default=None)

    parser.add_argument('--play', default=False, action='store_true')
    parser.add_argument('--only_train_r', default=False, action='store_true')
    parser.add_argument('--online_train_r', default=False, action='store_true')
    #parser.add_argument('--ec_type', type=str, default='episodic_curiosity', choices=['episodic_curiosity', 'none','oracle'])
    parser.add_argument('--rnd_type',
                        type=str,
                        default='rnd',
                        choices=['rnd', 'oracle'])
    parser.add_argument('--reset', default=False, action='store_true')
    parser.add_argument('--dynamics_sample',
                        default=False,
                        action='store_true')

    parser.add_argument('--num_agents', type=int, default=1)

    parser.add_argument('--div_type',
                        type=str,
                        default='oracle',
                        choices=['oracle', 'cls', 'rnd'])
    parser.add_argument('--load_ram', default=False, action='store_true')
    parser.add_argument('--debug', default=False, action='store_true')
    parser.add_argument('--rnd_mask_prob', type=float, default=1.)
    parser.add_argument('--rnd_mask_type',
                        type=str,
                        default='indep',
                        choices=['prog', 'indep', 'shared'])
    parser.add_argument('--indep_rnd', default=False, action='store_true')
    # Note: with default=True and action='store_true', this flag is always True.
    parser.add_argument('--indep_policy', default=True, action='store_true')
    parser.add_argument('--sd_type',
                        type=str,
                        default='oracle',
                        choices=['oracle', 'sd'])
    parser.add_argument('--from_scratch', default=False, action='store_true')

    parser.add_argument('--kl', default=False, action='store_true')

    args = parser.parse_args()

    log_path = os.path.join(args.save_dir, 'logs')
    save_path = os.path.join(args.save_dir, 'models')

    logger.configure(dir=log_path,
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'),
                  'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        gamma_div=args.gamma_div,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_updated,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
        log_interval=10,
        save_path=save_path,
        load_path=args.load_path,
        r_path=args.r_path,
        play=args.play,
        only_train_r=args.only_train_r,
        online_train_r=args.online_train_r,
        #ec_type = args.ec_type,
        rnd_type=args.rnd_type,
        reset=args.reset,
        dynamics_sample=args.dynamics_sample,
        num_agents=args.num_agents,
        div_type=args.div_type,
        load_ram=args.load_ram,
        debug=args.debug,
        rnd_mask_prob=args.rnd_mask_prob,
        rnd_mask_type=args.rnd_mask_type,
        indep_rnd=args.indep_rnd,
        indep_policy=args.indep_policy,
        sd_type=args.sd_type,
        from_scratch=args.from_scratch,
        base_load_path=args.base_load_path,
        use_kl=args.kl)

    if args.play:
        args.num_env = 1

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps)
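
The seed used above is derived from the base seed and the MPI rank, so every worker draws a different but reproducible random stream. A minimal sketch of that scheme, falling back to rank 0 when mpi4py is not available:

def worker_seed(base_seed, rank):
    # Offset the base seed by the worker rank (same formula as in main above).
    return 10000 * base_seed + rank


try:
    from mpi4py import MPI
    rank = MPI.COMM_WORLD.Get_rank()
except ImportError:
    rank = 0

seed = worker_seed(base_seed=1, rank=rank)
print(seed)  # this value would then be passed to set_global_seeds(seed)
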