Example #1
def test_make_output(_format):
    """
    test make output

    :param _format: (str) output format
    """
    writer = make_output_format(_format, LOG_DIR)
    writer.writekvs(KEY_VALUES)
    if _format == 'tensorboard':
        read_tb(LOG_DIR)
    elif _format == "csv":
        read_csv(LOG_DIR + 'progress.csv')
    elif _format == 'json':
        read_json(LOG_DIR + 'progress.json')
    writer.close()
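The test above depends on names outside the excerpt (LOG_DIR, KEY_VALUES, make_output_format, and the read_* helpers) and is normally driven by pytest parametrization. A minimal sketch of that wiring; the constant values and format list below are assumptions, not values taken from the source:

import pytest

# Assumed module-level fixtures; the real test module defines its own values.
LOG_DIR = "/tmp/openai_baselines_tests/"
KEY_VALUES = {"test": 1, "b": -3.14, "8": 9.9}

# The test is typically parametrized over the supported output formats:
@pytest.mark.parametrize("_format", ["stdout", "log", "json", "csv", "tensorboard"])
def test_make_output(_format):
    ...  # body as shown in the example above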
Example #2
def configure(dir, format_strs=None, custom_output_formats=None):
    """
    Configure the global logger: create `dir`, build the requested output formats
    (falling back to OPENAI_LOG_FORMAT or LOG_OUTPUT_FORMATS when format_strs is None),
    append any custom KVWriter instances, and install the result as Logger.CURRENT.
    """
    if not dir:
        return

    assert isinstance(dir, str)
    os.makedirs(dir, exist_ok=True)

    if format_strs is None:
        strs = os.getenv('OPENAI_LOG_FORMAT')
        format_strs = strs.split(',') if strs else LOG_OUTPUT_FORMATS
    output_formats = [make_output_format(f, dir) for f in format_strs]

    if custom_output_formats is not None:
        assert isinstance(custom_output_formats, list)
        for custom_output_format in custom_output_formats:
            assert isinstance(custom_output_format, KVWriter)
        output_formats.extend(custom_output_formats)

    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
    log('Logging to %s' % dir)
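A brief usage sketch for the configure variant above, assuming it lives in the same module as the KVWriter and Logger classes it references; DummyWriter and the paths are hypothetical:

class DummyWriter(KVWriter):
    """Hypothetical custom writer that just prints each batch of key/values."""
    def writekvs(self, kvs):
        print("custom:", kvs)
    def close(self):
        pass

# CSV output plus the custom writer; with format_strs=None, configure() would
# instead fall back to OPENAI_LOG_FORMAT or LOG_OUTPUT_FORMATS.
configure(dir="/tmp/run0", format_strs=["csv"], custom_output_formats=[DummyWriter()])
Logger.CURRENT.logkv("loss", 0.5)
Logger.CURRENT.dumpkvs()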
Example #3
input_height, input_width = (86, 80)
batch_size = 32
update_freq = 10000
learn_freq = 4
save_freq = 500000
action_space_size = env.action_space.n
NUM_STEPS = 4000000
replay_memory_size = 40000
replay_alpha = 0.6
replay_beta = 0.4
replay_epsilon = 1e-6
is_load_model = True
watch_flag = True
fps = 30  # frames shown per second when watch_flag == True

log_json_writer = logger.make_output_format("json", "logs")
log = logger.Logger("logs", [log_json_writer])

def preprocess_frame(frame):
    """Given a frame, scales it and converts to grayscale"""
    im = resize(color.rgb2gray(frame)[:176, :], (input_height, input_width), mode='constant')
    return im

"""
model arch
----------

Conv1 (8x8x32 filter) -> ReLU -> Conv2 (4x4x64 filter) -> ReLU -> Conv3 (3x3x64 filter) -> ReLU ->
FC4 (512 neurons) -> ReLU -> FC5 (9 neurons) -> ReLU ->  Output Q-value for each action
"""
def q_function_nn(obs, action_space_size, scope, reuse=False):
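The example is truncated at the function signature. Below is a sketch of a body matching the architecture described in the docstring above, assuming TensorFlow 1.x layers and the standard DQN strides (4, 2, 1); neither of those choices is given in the source:

import tensorflow as tf

def q_function_nn(obs, action_space_size, scope, reuse=False):
    """Sketch: Conv(8x8x32) -> Conv(4x4x64) -> Conv(3x3x64) -> FC(512) -> FC(action_space_size)."""
    with tf.variable_scope(scope, reuse=reuse):
        out = tf.cast(obs, tf.float32)
        out = tf.layers.conv2d(out, filters=32, kernel_size=8, strides=4, activation=tf.nn.relu)
        out = tf.layers.conv2d(out, filters=64, kernel_size=4, strides=2, activation=tf.nn.relu)
        out = tf.layers.conv2d(out, filters=64, kernel_size=3, strides=1, activation=tf.nn.relu)
        out = tf.layers.flatten(out)
        out = tf.layers.dense(out, units=512, activation=tf.nn.relu)
        return tf.layers.dense(out, units=action_space_size)  # one Q-value per action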
Example #4
    def eval(self):
        # create base_dir to save results
        env_id = (self.args['env_id'] if self.args['env_kind'] == 'mario'
                  else self.args['eval_type'])
        # base_dir =  os.path.join(self.args['log_dir'], self.args['exp_name'], env_id)
        # os.makedirs(base_dir, exist_ok=True)

        # I forgot to restore; I can't believe myself
        # load_path = self.args['load_path']

        # args['IS_HIGH_RES'] is used to signal whether to save videos
        nlevels = self.args['NUM_LEVELS']

        save_video = False

        # train progress results logger
        format_strs = ['csv']
        format_strs = filter(None, format_strs)
        dirc = os.path.join(self.args['log_dir'], 'inter')
        output_formats = [
            logger.make_output_format(f, dirc) for f in format_strs
        ]
        self.result_logger = logger.Logger(dir=dirc,
                                           output_formats=output_formats)

        if self.args['env_kind'] == 'mario':
            # do NOT FORGET to change this
            nlevels = 20

        # curr_iter = 0
        # results_list = []
        restore_iter = [25 * i for i in range(117)] + [2929]

        for r in restore_iter:
            load_path = os.path.join(self.args['load_dir'],
                                     'model-{}'.format(r))
            print(load_path)
            self.agent.load(load_path)

            save_video = False
            nlevels = 20 if self.args['env_kind'] == 'mario' else self.args[
                'NUM_LEVELS']
            results, _ = self.agent.evaluate(nlevels, save_video)
            results['iter'] = r
            for (k, v) in results.items():
                self.result_logger.logkv(k, v)
            self.result_logger.dumpkvs()
        '''    
        results['iter'] = curr_iter = int(l.split('/')[-1].split('-')[-1])
        print(results)
        results_list.append(results)

        csv_columns = results_list[0].keys()
        print(csv_columns)

        curr_dir = os.path.join(base_dir, str(curr_iter))
        os.makedirs(curr_dir, exist_ok=True)
        
        csv_save_path = os.path.join(curr_dir, 'results.csv'.format())
        with open(csv_save_path, 'w') as file:
            writer = csv.DictWriter(file, fieldnames=csv_columns)
            writer.writeheader()
            for data in results_list:
                writer.writerow(data)
        print('results are dumped to {}'.format(csv_save_path))
        '''
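The triple-quoted block above is a disabled sketch for dumping the accumulated results with csv.DictWriter; a self-contained version of the same idea, with hypothetical names, keys, and paths:

import csv
import os

def dump_results_csv(results_list, out_dir):
    """Write a list of per-iteration result dicts to <out_dir>/results.csv."""
    os.makedirs(out_dir, exist_ok=True)
    csv_save_path = os.path.join(out_dir, "results.csv")
    with open(csv_save_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=list(results_list[0].keys()))
        writer.writeheader()
        writer.writerows(results_list)
    print("results are dumped to {}".format(csv_save_path))

# dump_results_csv([{"iter": 0, "eprewmean": 1.5}, {"iter": 25, "eprewmean": 2.0}], "/tmp/eval")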
Example #5
    def train(self):
        curr_iter = 0

        # train progress results logger
        format_strs = ['csv']
        format_strs = filter(None, format_strs)
        dirc = os.path.join(self.args['log_dir'], 'inter')
        if self.restore_iter > -1:
            dirc = os.path.join(self.args['log_dir'],
                                'inter-{}'.format(self.restore_iter))
        output_formats = [
            logger.make_output_format(f, dirc) for f in format_strs
        ]
        self.result_logger = logger.Logger(dir=dirc,
                                           output_formats=output_formats)

        # in case we are restoring the training
        if self.restore_iter > -1:
            self.agent.load(self.load_path)
            if not self.args['transfer_load']:
                curr_iter = self.restore_iter

        print('max_iter: {}'.format(self.max_iter))

        # interim saves to compare in the future
        # for 128M frames,

        inter_save = []
        for i in range(3):
            divisor = (2**(i + 1))
            inter_save.append(
                int(self.args['num_timesteps'] // divisor) //
                (self.args['nsteps'] * self.args['NUM_ENVS'] *
                 self.args['nframeskip']))
        print('inter_save: {}'.format(inter_save))

        total_time = 0.0
        # results_list = []

        while curr_iter < self.early_max_iter:
            frac = 1.0 - (float(curr_iter) / self.max_iter)

            # self.agent.update calls rollout
            start_time = time.time()

            ## linear annealing
            curr_lr = self.lr(frac)
            curr_cr = self.cliprange(frac)

            ## removed within-training evaluation
            ## I could not make flag_sum work properly
            ## evaluate every 100 runs for the 20 training levels
            # only for mario (first evaluate, then update)
            # I am making this change to get zero-shot generalization without extra effort
            if curr_iter % (self.args['save_interval']) == 0:
                save_video = False
                nlevels = 20 if self.args[
                    'env_kind'] == 'mario' else self.args['NUM_LEVELS']
                results, _ = self.agent.evaluate(nlevels, save_video)
                results['iter'] = curr_iter
                for (k, v) in results.items():
                    self.result_logger.logkv(k, v)
                self.result_logger.dumpkvs()

            # representation learning every 25 steps
            info = self.agent.update(lr=curr_lr, cliprange=curr_cr)
            end_time = time.time()

            # additional info
            info['frac'] = frac
            info['curr_lr'] = curr_lr
            info['curr_cr'] = curr_cr
            info['curr_iter'] = curr_iter
            # info['max_iter'] = self.max_iter
            info['elapsed_time'] = end_time - start_time
            # info['total_time'] = total_time = (total_time + info['elapsed_time']) / 3600.0
            info['expected_time'] = self.max_iter * info[
                'elapsed_time'] / 3600.0

            ## log results using the baselines logger
            logger.logkvs(info)
            logger.dumpkvs()

            if curr_iter % self.args['save_interval'] == 0:
                self.agent.save(curr_iter, cliprange=curr_cr)

            if curr_iter in inter_save:
                self.agent.save(curr_iter, cliprange=curr_cr)

            curr_iter += 1

        self.agent.save(curr_iter, cliprange=curr_cr)

        # final evaluation for mario
        save_video = False
        nlevels = 20 if self.args['env_kind'] == 'mario' else self.args[
            'NUM_LEVELS']
        results, _ = self.agent.evaluate(nlevels, save_video)
        results['iter'] = curr_iter
        for (k, v) in results.items():
            self.result_logger.logkv(k, v)
        self.result_logger.dumpkvs()
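In the loop above, self.lr and self.cliprange are called with frac = 1 - curr_iter / max_iter, so frac starts at 1 and decays toward 0. A sketch of the baselines-style schedule helpers they are assumed to be (a constant schedule and a linear ramp); the helper names are assumptions:

def constfn(val):
    """A schedule that ignores frac and always returns val."""
    def f(frac):
        return val
    return f

def linear_schedule(initial_value):
    """Anneal linearly from initial_value (frac=1, start of training) to 0 (frac=0, end)."""
    def f(frac):
        return frac * initial_value
    return f

# e.g. self.lr = linear_schedule(3e-4); self.cliprange = constfn(0.2)
# curr_lr = self.lr(1.0 - float(curr_iter) / self.max_iter)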
Example #6
def create_json_logger(log_dir):
    return logger.Logger(log_dir,
            [logger.make_output_format(f, log_dir) for f in ['json']]
            )
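A short usage sketch for the helper above; the directory is a placeholder and the logged keys are arbitrary:

json_logger = create_json_logger("/tmp/json_logs")  # writes <log_dir>/progress.json
json_logger.logkv("episode", 1)
json_logger.logkv("eprewmean", 10.5)
json_logger.dumpkvs()
json_logger.close()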
Example #7
def create_logger(log_dir):
    return logger.Logger(log_dir,
            [logger.make_output_format(f, log_dir) for f in logger.LOG_OUTPUT_FORMATS]
            )
Example #8
def test_make_output_fail():
    """
    test value error on logger
    """
    with pytest.raises(ValueError):
        make_output_format('dummy_format', LOG_DIR)
Example #9
def learn(*,
          network,
          env,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2000,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          **network_kwargs):
    '''
    nsteps = nsteps, ent_coef = ent_coef, vf_coef = vf_coef,
    max_grad_norm = max_grad_norm
    '''
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env:  baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the number of environments
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
    model = make_model()
    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, compute the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.time()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))
        # NOTE: this re-creates the CSV output format on every update, with a
        # hardcoded path; it would normally be constructed once, before the loop.
        csv = logger.make_output_format('csv',
                                        '/home/jin/project/rlnabi/PPO/csv', 1)
        ave_return = returns.sum() / len(returns)
        if update % log_interval == 0 or update == 1:
            # Calculates if the value function is a good predictor of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            logger.logkv('average_return', ave_return)

            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
                lossdict = {lossname: lossval}
                csv.writekvs(lossdict)

            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir() and (
                                  MPI is None
                                  or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
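A minimal sketch of invoking this learn() on a vectorized Gym environment; gym and the baselines DummyVecEnv wrapper are assumed to be importable, and the hyperparameter values are illustrative only:

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make("CartPole-v1")])  # single vectorized env
model = learn(network="mlp",
              env=env,
              total_timesteps=100000,
              nsteps=128,
              nminibatches=4,
              noptepochs=4,
              log_interval=10)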