Example #1
    def compute_gradients(self, loss, var_list, **kwargs):
        grads_and_vars = tf.train.AdamOptimizer.compute_gradients(
            self, loss, var_list, **kwargs)
        grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]

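        # Flatten every per-variable gradient into one 1-D tensor so a single
        # MPI Allreduce below handles all variables at once.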
        flat_grad = tf.concat(
            [tf.reshape(g, (-1, )) for g, v in grads_and_vars], axis=0)

        if Config.is_test_rank():
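            # Zero the gradient on test ranks; the divide by
            # (num_tasks * train_frac) in _collect_grads then yields the mean
            # over training ranks only.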
            flat_grad = tf.zeros_like(flat_grad)

        shapes = [v.shape.as_list() for g, v in grads_and_vars]
        sizes = [int(np.prod(s)) for s in shapes]

        num_tasks = self.comm.Get_size()
        buf = np.zeros(sum(sizes), np.float32)

        def _collect_grads(flat_grad):
            self.comm.Allreduce(flat_grad, buf, op=MPI.SUM)
            np.divide(buf, float(num_tasks) * self.train_frac, out=buf)
            return buf

        avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32)
        avg_flat_grad.set_shape(flat_grad.shape)
        avg_grads = tf.split(avg_flat_grad, sizes, axis=0)
        avg_grads_and_vars = [(tf.reshape(g, v.shape), v)
                              for g, (_, v) in zip(avg_grads, grads_and_vars)]

        return avg_grads_and_vars
Example #2
    def dump_model(self):
        #utils.save_params_in_scopes(self.sess, [self.scope_dir + "model"], Config.get_save_file())
        data_dict = {}

        save_path = utils.file_to_path(Config.get_save_file())

        data_dict['args'] = Config.get_args_dict()
        data_dict['args']['use_minimum_model'] = True
        param_dict = {}

        if len(self.params) > 0:
            #print('saving scope', scope, filename)
            ps = self.sess.run(self.params)

            param_dict["model"] = ps

        data_dict['params'] = param_dict
        joblib.dump(data_dict, save_path)
Example #3
def main():
    args = setup_utils.setup_and_load()
    setup_utils.load_for_setup_if_necessary()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # Use wandb to visualize training curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes=" baseline train",
               tags=["baseline", Config.RUN_ID.split('-')[0]],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()
    utils.mpi_print('Set up gpu')
    utils.mpi_print(args)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # nenvs is how many envs run in parallel on one CPU
    # the VecEnv class allows parallel rollouts
    nenvs = Config.NUM_ENVS
    total_timesteps = int(256 * 10**6)

    env = utils.make_general_env(nenvs, seed=rank)
    utils.mpi_print('Set up env')

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = policies_back.get_policy()
        #policy = policies.get_policy()
        utils.mpi_print('Set up policy')

        learn_func(policy=policy,
                   env=env,
                   log_interval=args.log_interval,
                   save_interval=args.save_interval,
                   nsteps=Config.NUM_STEPS,
                   nminibatches=Config.NUM_MINIBATCHES,
                   lam=Config.GAE_LAMBDA,
                   gamma=Config.GAMMA,
                   noptepochs=Config.PPO_EPOCHS,
                   ent_coef=Config.ENTROPY_COEFF,
                   vf_coef=Config.VF_COEFF,
                   max_grad_norm=Config.MAX_GRAD_NORM,
                   lr=lambda f: f * Config.LEARNING_RATE,
                   cliprange=lambda f: f * Config.CLIP_RANGE,
                   total_timesteps=total_timesteps)
Example #4
def load_args(load_key='default'):
    """get train args of retore id"""
    load_data = Config.get_load_data(load_key)
    if load_data is None:
        return False

    args_dict = load_data['args']

    #Config.parse_args_dict(args_dict)

    return args_dict
Example #5
def main():
    # load from restore file
    args_dict = utils.load_args()
    # set up the args for this test run
    test_args = setup_utils.setup_and_load()
    if 'NR' in Config.RESTORE_ID:
        Config.USE_LSTM = 2
    if 'dropout' in Config.RESTORE_ID:
        Config.DROPOUT = 0
        Config.USE_BATCH_NORM = 0

    wandb.init(project="coinrun",
               notes="test",
               tags=["baseline", "test"],
               config=Config.get_args_dict())

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    seed = np.random.randint(100000)
    Config.SET_SEED = seed

    overlap = {
        'set_seed': Config.SET_SEED,
        'rep': Config.REP,
        'highd': Config.HIGH_DIFFICULTY,
        'num_levels': Config.NUM_LEVELS,
        'use_lstm': Config.USE_LSTM,
        'dropout': Config.DROPOUT,
        'use_batch_norm': Config.USE_BATCH_NORM
    }

    load_file = Config.get_load_filename(restore_id=Config.RESTORE_ID)
    mpi_print('load file name', load_file)
    mpi_print('seed', seed)
    mpi_print("---------------------------------------")
    for checkpoint in range(1, 33):
        with tf.Session() as sess:
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            enjoy_env_sess(sess, checkpoint, overlap)
Example #6
def setup_and_load(use_cmd_line_args=True, **kwargs):
    """
    Initialize the global config using command line options, defaulting to the values in `config.py`.

    `use_cmd_line_args`: set to False to ignore command line arguments passed to the program
    `**kwargs`: override the defaults from `config.py` with these values
    """
    args = Config.initialize_args(use_cmd_line_args=use_cmd_line_args,
                                  **kwargs)

    load_for_setup_if_necessary()

    return args
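
# A hedged usage sketch, not from the source: initialize the config
# programmatically, overriding a default. The kwarg name `num_levels` is an
# assumption about the config keys Config.initialize_args accepts.
from coinrun import setup_utils

args = setup_utils.setup_and_load(use_cmd_line_args=False, num_levels=500)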
Example #7
def restore_file_back(restore_id, load_key='default'):
    if restore_id is not None:
        load_file = Config.get_load_filename(restore_id=restore_id)
        filepath = file_to_path(load_file)
        load_data = joblib.load(filepath)

        Config.set_load_data(load_data, load_key=load_key)

        restored_args = load_data['args']
        sub_dict = {}
        res_keys = Config.RES_KEYS

        for key in res_keys:
            if key in restored_args:
                sub_dict[key] = restored_args[key]
            else:
                print('warning key %s not restored' % key)

        Config.parse_args_dict(sub_dict)

    from coinrun.coinrunenv import init_args_and_threads
    init_args_and_threads(4)
Example #8
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print('size', size)

    # Use wandb to visualize training curves
    config = Config.get_args_dict()
    wandb.init(project="coinrun",
               notes="network randomization",
               tags=["baseline"],
               config=config)

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    nenvs = Config.NUM_ENVS
    total_timesteps = int(256e6)

    env = utils.make_general_env(nenvs, seed=rank)

    with tf.Session(config=config):
        env = wrappers.add_final_wrappers(env)

        policy = nr_policies.get_policy()

        nr_ppo2.learn(policy=policy,
                      env=env,
                      save_interval=args.save_interval,
                      nsteps=Config.NUM_STEPS,
                      nminibatches=Config.NUM_MINIBATCHES,
                      lam=0.95,
                      gamma=Config.GAMMA,
                      noptepochs=Config.PPO_EPOCHS,
                      log_interval=1,
                      ent_coef=Config.ENTROPY_COEFF,
                      lr=lambda f: f * Config.LEARNING_RATE,
                      cliprange=lambda f: f * 0.2,
                      total_timesteps=total_timesteps)
Example #9
def create_act_model(sess, env, nenvs):
    load_data = Config.get_load_data('default')
    create_additional = 'use_minimum_model' not in load_data['args']

    ob_space = env.observation_space
    ac_space = env.action_space

    policy = policies.get_policy()
    act = policy(sess,
                 ob_space,
                 ac_space,
                 nenvs,
                 1,
                 reuse=False,
                 create_additional=create_additional)

    return act
Example #10
def load_params_for_scope(sess, scope, load_key='default'):
    load_data = Config.get_load_data(load_key)
    if load_data is None:
        return False

    params_dict = load_data['params']

    if scope in params_dict:
        print('Loading saved file for scope', scope)

        loaded_params = params_dict[scope]

        loaded_params, params = get_savable_params(loaded_params, scope, keep_heads=True)

        restore_params(sess, loaded_params, params)
    
    return True
Example #11
    def step_wait(self):
        self.buf_rew = np.zeros_like(self.buf_rew)
        self.buf_done = np.zeros_like(self.buf_done)

        lib.vec_wait(self.handle, self.buf_rgb, self.buf_render_rgb,
                     self.buf_rew, self.buf_done)

        obs_frames = self.buf_rgb.astype(np.float32)

        if Config.USE_BLACK_WHITE:
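            # Collapse RGB to grayscale while keeping a trailing channel axis.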
            obs_frames = np.mean(obs_frames.astype(np.float32),
                                 axis=-1).astype(np.float32)[..., None]

        if Config.is_test_rank():
            obs_frames = slice_spectrum(obs_frames, Config.TEST_SPECTRUM,
                                        Config.RADIUS)
        else:
            obs_frames = slice_spectrum(obs_frames, Config.TRAIN_SPECTRUM,
                                        Config.RADIUS)

        return obs_frames, self.buf_rew, self.buf_done, self.dummy_info
Example #12
def save_params_in_scopes(sess, scopes, filename, base_dict=None):
    data_dict = {}

    if base_dict is not None:
        data_dict.update(base_dict)

    save_path = file_to_path(filename)

    data_dict['args'] = Config.get_args_dict()
    param_dict = {}

    for scope in scopes:
        params = tf.trainable_variables(scope)

        if len(params) > 0:
            print('saving scope', scope, filename)
            ps = sess.run(params)

            param_dict[scope] = ps
        
    data_dict['params'] = param_dict
    joblib.dump(data_dict, save_path)
Example #13
    def try_load_model(self):
        load_data = Config.get_load_data('default')
        if load_data is None:
            return False

        params_dict = load_data['params']

        if "model" in params_dict:
            print('Loading saved file for scope', "model")

            loaded_params = params_dict["model"]

            if len(loaded_params) != len(self.params):
                print('param mismatch', len(loaded_params), len(self.params))
                assert False

            restore_ops = []
            for p, loaded_p in zip(self.params, loaded_params):
                restore_ops.append(tf.assign(p, loaded_p))
            self.sess.run(restore_ops)
            return True
        return False
Example #14
def load_params_for_scope(sess, scope, load_key='default', load_path=None):
    if load_path is None:
        load_data = Config.get_load_data(load_key)
    else:
        load_path = file_to_path(load_path)
        if os.path.exists(load_path):
            load_data = joblib.load(load_path)
            print('Load file', load_path)
        else:
            load_data = None
    if load_data is None:
        return False

    params_dict = load_data['params']
    if scope in params_dict:
        print('Loading saved file for scope', scope)
        loaded_params = params_dict[scope]
        loaded_params, params = get_savable_params(loaded_params,
                                                   scope,
                                                   keep_heads=True)
        restore_params(sess, loaded_params, params)

    return True
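
# A hedged usage sketch, not from the source: after building the model graph
# in session `sess`, restore the 'model' scope from an explicit save file
# (the path is hypothetical).
is_loaded = load_params_for_scope(sess, 'model', load_path='run1_save.pkl')
if not is_loaded:
    print('no saved params found')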
Example #15
def restore_file(restore_id,
                 base_name=None,
                 overlap_config=None,
                 load_key='default'):
    """overlap config means you can modify the config in savefile, e.g. test seed"""
    if restore_id is not None:
        load_file = Config.get_load_filename(restore_id=restore_id,
                                             base_name=base_name)
        filepath = file_to_path(load_file)
        assert os.path.exists(filepath), "restore file %s does not exist" % filepath
        load_data = joblib.load(filepath)

        Config.set_load_data(load_data, load_key=load_key)

        restored_args = load_data['args']
        sub_dict = {}
        res_keys = Config.RES_KEYS

        for key in res_keys:
            if key in restored_args:
                sub_dict[key] = restored_args[key]
            else:
                print('warning key %s not restored' % key)

        Config.parse_args_dict(sub_dict)
        print("Load params")
        if overlap_config is not None:
            Config.parse_args_dict(overlap_config)

    from coinrun.coinrunenv import init_args_and_threads
    print("Init coinrun env threads and env args")
    init_args_and_threads(4)
    if restore_id is None:
        return None
    return load_file
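
# A hedged usage sketch, not from the source: restore a run while overriding
# the test seed. The restore id is hypothetical; the 'set_seed' key mirrors
# the overlap dict built in Example #5.
load_file = restore_file('RUN-abc123', overlap_config={'set_seed': 123})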
Example #16
    def save_model(base_name=None):
        base_dict = {'datapoints': datapoints}
        utils.save_params_in_scopes(sess, ['model'],
                                    Config.get_save_file(base_name=base_name),
                                    base_dict)
Example #17
def create_env(
    num_envs,
    *,
    env_kind="procgen",
    epsilon_greedy=0.0,
    reward_scale=1.0,
    frame_stack=1,
    use_sticky_actions=0,
    coinrun_old_extra_actions=0,
    **kwargs,
):
    if env_kind == "procgen":
        env_kwargs = {k: v for k, v in kwargs.items() if v is not None}
        env_name = env_kwargs.pop("env_name")

        if env_name == "coinrun_old":
            import coinrun
            from coinrun.config import Config

            Config.initialize_args(use_cmd_line_args=False, **env_kwargs)
            global coinrun_initialized
            if not coinrun_initialized:
                coinrun.init_args_and_threads()
                coinrun_initialized = True
            venv = coinrun.make("standard", num_envs)
            if coinrun_old_extra_actions > 0:
                venv = VecExtraActions(
                    venv, extra_actions=coinrun_old_extra_actions, default_action=0
                )

        else:
            from procgen import ProcgenGym3Env
            import gym3

            env_kwargs = {
                k: v for k, v in env_kwargs.items() if k in PROCGEN_KWARG_KEYS
            }
            env = ProcgenGym3Env(num_envs, env_name=env_name, **env_kwargs)
            env = gym3.ExtractDictObWrapper(env, "rgb")
            venv = gym3.ToBaselinesVecEnv(env)

    elif env_kind == "atari":
        game_version = "v0" if use_sticky_actions == 1 else "v4"

        def make_atari_env(lower_env_id, num_env):
            env_id = ATARI_ENV_DICT[lower_env_id] + f"NoFrameskip-{game_version}"

            def make_atari_env_fn():
                env = make_atari(env_id)
                env = wrap_deepmind(env, frame_stack=False, clip_rewards=False)

                return env

            return SubprocVecEnv([make_atari_env_fn for i in range(num_env)])

        lower_env_id = kwargs["env_id"]

        venv = make_atari_env(lower_env_id, num_envs)

    else:
        raise ValueError(f"Unsupported env_kind: {env_kind}")

    if frame_stack > 1:
        venv = VecFrameStack(venv=venv, nstack=frame_stack)

    if reward_scale != 1:
        venv = VecRewardScale(venv, reward_scale)

    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    if epsilon_greedy > 0:
        venv = EpsilonGreedy(venv, epsilon_greedy)

    venv = VecShallowCopy(venv)

    return venv
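
# A hedged usage sketch, not from the source: a 64-env procgen CoinRun
# vec-env with 4-frame stacking ("coinrun" is a standard procgen env name).
venv = create_env(64, env_kind="procgen", env_name="coinrun", frame_stack=4)
obs = venv.reset()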
Example #18
    def __init__(self, comm, **kwargs):
        self.comm = comm
        self.train_frac = 1.0 - Config.get_test_frac()
        tf.train.AdamOptimizer.__init__(self, **kwargs)
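
# A sketch, not from the source, of how this constructor and the
# compute_gradients override in Example #1 plausibly fit together; the class
# name MpiAdamOptimizer is an assumption.
class MpiAdamOptimizer(tf.train.AdamOptimizer):
    """Adam whose gradients are averaged over MPI training ranks."""

    def __init__(self, comm, **kwargs):
        self.comm = comm
        self.train_frac = 1.0 - Config.get_test_frac()
        tf.train.AdamOptimizer.__init__(self, **kwargs)

    # compute_gradients(self, loss, var_list, **kwargs) as in Example #1:
    # flatten, Allreduce-average, then re-split per variable.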
Example #19
    def __init__(self, sess):
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()

        clean_tb_dir()

        tb_writer = tf.summary.FileWriter(
            Config.TB_DIR + '/' + Config.RUN_ID + '_' + str(rank), sess.graph)
        total_steps = [0]

        should_log = (rank == 0 or Config.LOG_ALL_MPI)

        if should_log:
            hyperparams = np.array(Config.get_arg_text())
            hyperparams_tensor = tf.constant(hyperparams)

            summary_op = tf.summary.text("hyperparameters info",
                                         hyperparams_tensor)
            summary = sess.run(summary_op)

            tb_writer.add_summary(summary)

        def add_summary(_merged, interval=1):
            if should_log:
                total_steps[0] += 1

                if total_steps[0] % interval == 0:
                    tb_writer.add_summary(_merged, total_steps[0])
                    tb_writer.flush()

        tuples = []

        def make_scalar_graph(name):
            scalar_ph = tf.placeholder(name='scalar_' + name, dtype=tf.float32)
            scalar_summary = tf.compat.v1.summary.scalar(name, scalar_ph)
            merged = tf.compat.v1.summary.merge([scalar_summary])
            tuples.append((scalar_ph, merged))

        name_dict = {}
        curr_name_idx = [0]

        def log_scalar(x, name, step=-1):
            if name not in name_dict:
                name_dict[name] = curr_name_idx[0]
                tf_name = (name + '_' +
                           Config.RUN_ID) if curr_name_idx[0] == 0 else name
                make_scalar_graph(tf_name)
                curr_name_idx[0] += 1

            idx = name_dict[name]

            scalar_ph, merged = tuples[idx]

            if should_log:
                if step == -1:
                    step = total_steps[0]
                    total_steps[0] += 1

                _merged = sess.run(merged, {scalar_ph: x})

                tb_writer.add_summary(_merged, step)
                tb_writer.flush()

        self.add_summary = add_summary
        self.log_scalar = log_scalar
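
# A hedged usage sketch, not from the source, assuming this __init__ belongs
# to the TB_Writer class used in Example #22 and `sess` is an active session:
tb_writer = TB_Writer(sess)
tb_writer.log_scalar(0.75, 'mean_fit', 10000)  # value, tag name, global step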
Example #20
def setup(**kwargs):
    Config.merge(kwargs)
    from coinrun.coinrunenv import init_args_and_threads

    init_args_and_threads()
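
# A hedged usage sketch, not from the source; the kwarg names are assumptions
# about the keys Config.merge accepts (cf. the overlap dict in Example #5).
setup(num_levels=500, set_seed=13)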
Example #21
def enjoy_env_sess(sess, checkpoint, overlap):
    #base_name = str(8*checkpoint)  + 'M'
    #load_file = setup_utils.restore_file(Config.RESTORE_ID,base_name=base_name)
    should_eval = True
    mpi_print('test levels seed', Config.SET_SEED)
    mpi_print('test levels ', Config.NUM_LEVELS)
    rep_count = 50

    env = utils.make_general_env(20)
    env = wrappers.add_final_wrappers(env)
    nenvs = env.num_envs

    sess.run(tf.global_variables_initializer())
    args_now = Config.get_args_dict()
    #args_run = utils.load_args()
    agent = create_act_model(sess, env, nenvs)

    # the load name is specified by Config.RESTORE_ID; loading returns True/False
    if checkpoint == 0:
        mean_score = 0.0
        succ_rate = 0.0
        wandb.log({
            'Rew_mean': mean_score,
            'Succ_rate': succ_rate,
            'Step_elapsed': 0
        })
        return mean_score, succ_rate
    elif checkpoint != 32:
        base_name = str(8 * checkpoint) + 'M'
    else:
        base_name = None

    sess.run(tf.global_variables_initializer())
    # env init here
    load_file = setup_utils.restore_file(Config.RESTORE_ID,
                                         overlap_config=overlap,
                                         base_name=base_name)

    is_loaded = utils.load_params_for_scope(sess, 'model')
    if not is_loaded:
        mpi_print('NO SAVED PARAMS LOADED')
        # nothing restored, so report zero score and success rate
        return 0.0, 0.0

    obs = env.reset()
    t_step = 0

    scores = np.zeros((nenvs, rep_count))
    eplens = np.zeros((nenvs, rep_count))
    #scores = np.array([0] * nenvs)
    score_counts = np.array([0] * nenvs)

    # curr_rews = np.zeros((nenvs, 3))

    def should_continue():
        if should_eval:
            return np.sum(score_counts) < rep_count * nenvs

        return True

    state = agent.initial_state
    done = np.zeros(nenvs)

    def rollout(obs, state, done):
        """rollout for rep * nenv times and return scores"""
        t = 0
        count = 0
        rews = np.zeros((nenvs, rep_count))
        while should_continue():
            action, values, state, _ = agent.step(obs, state, done)
            obs, rew, done, info = env.step(action)
            rews[:, count] += rew
            t += 1

            for i, d in enumerate(done):
                if d:
                    eplens[i][count] = t
                    if score_counts[i] < rep_count:
                        score_counts[i] += 1
                        count = score_counts[i] - 1
                        # aux score
                        if 'episode' in info[i]:
                            scores[i][count] = info[i].get('episode')['r']

        return scores, rews, eplens

    if is_loaded:
        mpi_print(load_file)
        scores, rews, eplens = rollout(obs, state, done)

    size = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    if size == 1:
        if rank == 0:
            testset_size = rep_count * nenvs
            utils.save_pickle(scores, Config.LOGDIR + 'scores')
            mean_score = np.sum(scores) / testset_size
            succ_rate = np.sum(scores == 10.0) / testset_size
            mpi_print('cpus ', size)
            mpi_print('testset size', testset_size)
            # NUM_LEVELS = 0 means an unbounded level set, so the test-set size is rep_count * nenvs
            # each episode gets a new level seed
            # mpi_print('score detail',scores.flatten())
            mpi_print('succ_rate', succ_rate)
            steps_elapsed = checkpoint * 8000000
            mpi_print('steps_elapsed:', steps_elapsed)
            mpi_print('mean score', mean_score)
            wandb.log({
                'Rew_mean': mean_score,
                'Succ_rate': succ_rate,
                'Step_elapsed': steps_elapsed
            })
            #mpi_print('mean score of each env',[np.mean(s) for s in scores])
    else:
        testset_size = rep_count * nenvs
        succ = np.sum(scores == 10.0) / testset_size
        succ_rate = utils.mpi_average([succ])
        mean_score_tmp = np.sum(scores) / testset_size
        mean_score = utils.mpi_average([mean_score_tmp])
        if rank == 0:
            mpi_print('testset size', rep_count * nenvs * size)
            mpi_print('load file name', load_file)
            mpi_print('testset size', testset_size)
        # NUM_LEVELS = 0 means an unbounded level set, so the test-set size is rep_count * nenvs
        # each episode gets a new level seed
            # mpi_print('score detail',scores.flatten())
            mpi_print('succ_rate', succ_rate)
            mpi_print('mean score', mean_score)
            wandb.log({'Rew_mean': mean_score, 'Succ_rate': succ_rate})

    return mean_score, succ_rate
Example #22
def main():
    # general setup

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    # prepare directory
    sub_dir = utils.file_to_path(Config.get_save_file(base_name="tmp"))
    if os.path.isdir(sub_dir):
        shutil.rmtree(path=sub_dir)
    os.mkdir(sub_dir)

    # hyperparams
    nenvs = Config.NUM_ENVS
    total_timesteps = Config.TIMESTEPS
    population_size = Config.POPULATION_SIZE
    timesteps_per_agent = Config.TIMESTEPS_AGENT
    worker_count = Config.WORKER_COUNT
    passthrough_perc = Config.PASSTHROUGH_PERC
    mutating_perc = Config.MUTATING_PERC

    # create environment
    def make_env():
        env = utils.make_general_env(nenvs, seed=rank)
        env = wrappers.add_final_wrappers(env)
        return env

    # setup session and workers, and therefore tensorflow ops
    graph = tf.get_default_graph()
    sess = tf.Session(graph=graph)

    policy = policies.get_policy()

    workers = [
        Worker(sess, i, nenvs, make_env, policy, sub_dir)
        for i in range(worker_count)
    ]

    tb_writer = TB_Writer(sess)

    def clean_exit():

        for worker in workers:
            worker.thread.join()

        utils.mpi_print("")
        utils.mpi_print("== total duration",
                        "{:.1f}".format(time.time() - t_first_start), " s ==")
        utils.mpi_print(" exit...")

        # save best performing agent
        population.sort(key=lambda k: k['fit'], reverse=True)
        workers[0].restore_model(name=population[0]["name"])
        workers[0].dump_model()

        # cleanup
        sess.close()
        shutil.rmtree(path=sub_dir)

    # load data from restore point and seed the whole population
    loaded_name = None
    if workers[0].try_load_model():
        loaded_name = str(uuid.uuid1())
        workers[0].save_model(name=loaded_name)

    # initialise population
    # either all random and no mutations pending
    # or all from restore point with all but one to be mutated
    population = [{
        "name": loaded_name or str(uuid.uuid1()),
        "fit": -1,
        "need_mut": loaded_name != None and i != 0,
        "age": -1,
        "mean_ep_len": -1
    } for i in range(population_size)]

    utils.mpi_print("== population size", population_size, ", t_agent ",
                    timesteps_per_agent, " ==")

    t_first_start = time.time()
    try:
        # main loop
        generation = 0
        timesteps_done = 0
        while timesteps_done < total_timesteps:
            t_generation_start = time.time()

            utils.mpi_print("")
            utils.mpi_print("__ Generation", generation, " __")

            # initialise and evaluate all new agents
            for agent in population:
                #if agent["fit"] < 0: # test/
                if True:  # test constant reevaluation, to dismiss "lucky runs" -> seems good

                    # pick worker from pool and let it work on the agent
                    not_in_work = True
                    while not_in_work:
                        for worker in workers:
                            if worker.can_take_work():
                                worker.work(agent, timesteps_per_agent)
                                not_in_work = False
                                break

                    timesteps_done += timesteps_per_agent * nenvs

            for worker in workers:
                worker.thread.join()

            # sort by fitness
            population.sort(key=lambda k: k["fit"], reverse=True)

            # print stuff
            fitnesses = [agent["fit"] for agent in population]
            ages = [agent["age"] for agent in population]
            ep_lens = [agent["mean_ep_len"] for agent in population]

            utils.mpi_print(*["{:5.3f}".format(f) for f in fitnesses])
            utils.mpi_print(*["{:5}".format(a) for a in ages])
            utils.mpi_print("__ average fit", "{:.1f}".format(
                np.mean(fitnesses)), ", t_done", timesteps_done, ", took",
                            "{:.1f}".format(time.time() - t_generation_start),
                            "s", ", total",
                            "{:.1f}".format(time.time() - t_first_start),
                            "s __")

            # log stuff
            tb_writer.log_scalar(np.mean(fitnesses), "mean_fit",
                                 timesteps_done)
            tb_writer.log_scalar(np.median(fitnesses), "median_fit",
                                 timesteps_done)
            tb_writer.log_scalar(np.max(fitnesses), "max_fit", timesteps_done)
            tb_writer.log_scalar(np.mean(ages), "mean_age", timesteps_done)
            ep_lens_mean = np.nanmean(ep_lens)
            if not np.isnan(ep_lens_mean):
                tb_writer.log_scalar(ep_lens_mean, "mean_ep_lens",
                                     timesteps_done)

            # cleanup to prevent disk clutter
            to_be_removed = set(
                re.sub(r'\..*$', '', f) for f in os.listdir(sub_dir)) - set(
                    [agent["name"] for agent in population])
            for filename in to_be_removed:
                os.remove(sub_dir + "/" + filename + ".index")
                os.remove(sub_dir + "/" + filename + ".data-00000-of-00001")

            # break when times up
            if timesteps_done >= total_timesteps:
                break

            # mark weak agents for replacement
            cutoff_passthrough = math.floor(population_size * passthrough_perc)
            cutoff_mutating = math.floor(population_size * mutating_perc)
            source_agents = population[:cutoff_mutating]

            new_population = population[:cutoff_passthrough]

            k = 0
            while len(new_population) < population_size:
                new_agent = {
                    # take the name from the source agent so mutation knows the parent
                    "name": source_agents[k]["name"],
                    "fit": -1,
                    "need_mut": True,
                    "age": 0,
                    "mean_ep_len": -1
                }
                new_population.append(new_agent)
                k = (k + 1) % len(source_agents)

            population = new_population
            generation += 1

        clean_exit()
    except KeyboardInterrupt:
        clean_exit()

    return 0
Example #23
def main(sess):

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000

    if Config.EXTRACT_SEED != -1:
        seed = Config.EXTRACT_SEED
    if Config.EXTRACT_RANK != -1:
        rank = Config.EXTRACT_RANK

    set_global_seeds(seed * 100 + rank)

    utils.setup_mpi_gpus()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True # pylint: disable=E1101

    use_policy = (Config.RESTORE_ID != '')

    nenvs = Config.NUM_ENVS
    total_timesteps = int(502e6)
    env = utils.make_general_env(nenvs, seed=rank)

    if use_policy:
        agent = create_act_model(sess, env, nenvs)
        sess.run(tf.compat.v1.global_variables_initializer())
        loaded_params = utils.load_params_for_scope(sess, 'model')
        if not loaded_params:
            print('NO SAVED PARAMS LOADED')

    # make directory
    DIR_NAME = './VAE/records/'
    if not os.path.exists(DIR_NAME):
        os.makedirs(DIR_NAME, exist_ok=True)
    
    # set file name
    filename = DIR_NAME+"/"+Config.get_save_file()+"_"+str(seed * 100 + rank)+".npz"
    
    with tf.compat.v1.Session(config=config):
        env = wrappers.add_final_wrappers(env)
        nenv = env.num_envs if hasattr(env, 'num_envs') else 1
        obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
        obs[:] = env.reset()
        dones = [False for _ in range(nenv)]
        
        # take one random step to skip the initial noisy observations
        actions = [env.action_space.sample() for _ in range(nenv)]
        actions = np.array(actions)
        obs[:], rewards, dones, _ = env.step(actions)
        state = agent.initial_state if use_policy else None
        
        mb_obs, mb_rewards, mb_actions, mb_next_obs, mb_dones = [],[],[],[],[]
        # For n in range number of steps
        for _ in range(400):
            # Given observations, get actions, values, and neglogpacs
            # obs was initialized from env.reset() above
            if use_policy:
                actions, _, _, _ = agent.step(obs, state, dones)
            else:
                actions = [env.action_space.sample() for _ in range(nenv)]
            actions = np.array(actions)
            mb_obs.append(obs.copy())
            mb_actions.append(actions)
            mb_dones.append(dones)
            
            # Take actions in env and observe the results
            # info contains useful episode information
            obs[:], rewards, dones, _ = env.step(actions)
            mb_next_obs.append(obs.copy())
            mb_rewards.append(rewards)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=obs.dtype)
        mb_next_obs = np.asarray(mb_next_obs, dtype=obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_dones = np.asarray(mb_dones, dtype=bool)
        
        #np.savez_compressed(filename, obs=mb_obs, action=mb_actions, next_obs=mb_next_obs, reward=mb_rewards, dones=mb_dones)
        np.savez_compressed(filename, obs=mb_obs)
        return filename