Example #1
def agent(game,
          n_ep,
          n_mcts,
          max_ep_len,
          lr,
          c,
          gamma,
          data_size,
          batch_size,
          temp,
          n_hidden_layers,
          n_hidden_units,
          stochastic=False,
          eval_freq=-1,
          eval_episodes=100,
          alpha=0.6,
          out_dir='../',
          pre_process=None,
          visualize=False):
    ''' Outer training loop '''
    if pre_process is not None:
        pre_process()

    # tf.reset_default_graph()

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    episode_returns = []  # storage
    timepoints = []
    # Environments
    Env = make_game(game)
    is_atari = is_atari_game(Env)
    mcts_env = make_game(game) if is_atari else None
    online_scores = []
    offline_scores = []
    mcts_params = dict(gamma=gamma)
    if stochastic:
        mcts_params['alpha'] = alpha
        mcts_maker = MCTSStochastic
    else:
        mcts_maker = MCTS

    D = Database(max_size=data_size, batch_size=batch_size)
    model = Model(Env=Env,
                  lr=lr,
                  n_hidden_layers=n_hidden_layers,
                  n_hidden_units=n_hidden_units)
    t_total = 0  # total steps
    R_best = -np.Inf

    with tf.Session() as sess:
        model.sess = sess
        sess.run(tf.global_variables_initializer())

        for ep in range(n_ep):
            if eval_freq > 0 and ep % eval_freq == 0:  #and ep > 0
                print(
                    'Evaluating policy for {} episodes!'.format(eval_episodes))
                seed = np.random.randint(1e7)  # draw some Env seed
                Env.seed(seed)
                s = Env.reset()
                mcts = mcts_maker(root_index=s,
                                  root=None,
                                  model=model,
                                  na=model.action_dim,
                                  **mcts_params)
                env_wrapper = EnvEvalWrapper()
                env_wrapper.mcts = mcts
                starting_states = []

                def reset_env():
                    s = Env.reset()
                    env_wrapper.mcts = mcts_maker(root_index=s,
                                                  root=None,
                                                  model=model,
                                                  na=model.action_dim,
                                                  **mcts_params)
                    starting_states.append(s)
                    if env_wrapper.curr_probs is not None:
                        env_wrapper.episode_probabilities.append(
                            env_wrapper.curr_probs)
                    env_wrapper.curr_probs = []
                    return s

                def forward(a, s, r):
                    env_wrapper.mcts.forward(a, s, r)
                    #pass

                env_wrapper.reset = reset_env
                env_wrapper.step = lambda x: Env.step(x)
                env_wrapper.forward = forward
                env_wrapper.episode_probabilities = []
                env_wrapper.curr_probs = None

                def pi_wrapper(ob):
                    # use the copied env only for Atari; assigning to mcts_env
                    # here would shadow the outer variable and break for Atari
                    search_env = mcts_env if is_atari else None
                    env_wrapper.mcts.search(n_mcts=n_mcts,
                                            c=c,
                                            Env=Env,
                                            mcts_env=search_env)
                    state, pi, V = env_wrapper.mcts.return_results(temp=0)
                    #pi = model.predict_pi(s).flatten()
                    env_wrapper.curr_probs.append(pi)
                    a = np.argmax(pi)
                    return a

                rews, lens = eval_policy(pi_wrapper,
                                         env_wrapper,
                                         n_episodes=eval_episodes,
                                         verbose=True)
                offline_scores.append([
                    np.min(rews),
                    np.max(rews),
                    np.mean(rews),
                    np.std(rews),
                    len(rews),
                    np.mean(lens)
                ])
                np.save(out_dir + '/offline_scores.npy', offline_scores)
            start = time.time()
            s = Env.reset()
            R = 0.0  # Total return counter
            a_store = []
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            if is_atari:
                mcts_env.reset()
                mcts_env.seed(seed)
            if eval_freq > 0 and ep % eval_freq == 0:
                print("Collecting %d episodes" % eval_freq)
            mcts = mcts_maker(
                root_index=s,
                root=None,
                model=model,
                na=model.action_dim,
                **mcts_params)  # the object responsible for MCTS searches
            for t in range(max_ep_len):
                # MCTS step
                if not is_atari:
                    mcts_env = None
                mcts.search(n_mcts=n_mcts, c=c, Env=Env,
                            mcts_env=mcts_env)  # perform a forward search
                if visualize:
                    mcts.visualize()
                state, pi, V = mcts.return_results(
                    temp)  # extract the root output
                D.store((state, V, pi))

                # Make the true step
                a = np.random.choice(len(pi), p=pi)
                a_store.append(a)
                s1, r, terminal, _ = Env.step(a)
                R += r
                t_total += n_mcts  # total number of environment steps (counts the mcts steps)

                if terminal:
                    break
                else:
                    mcts.forward(a, s1, r)

            # Finished episode
            episode_returns.append(R)  # store the total episode return
            online_scores.append(R)
            timepoints.append(
                t_total)  # store the timestep count of the episode return
            store_safely(out_dir, 'result', {
                'R': episode_returns,
                't': timepoints
            })
            np.save(out_dir + '/online_scores.npy', online_scores)
            # print('Finished episode {}, total return: {}, total time: {} sec'.format(ep, np.round(R, 2),
            #                                                                          np.round((time.time() - start),
            #                                                                                   1)))

            if R > R_best:
                a_best = a_store
                seed_best = seed
                R_best = R

            # Train
            D.reshuffle()
            try:
                for epoch in range(1):
                    for sb, Vb, pib in D:
                        model.train(sb, Vb, pib)
            except Exception as e:
                print("ASD")
            model.save(out_dir + 'model')
    # Return results
    return episode_returns, timepoints, a_best, seed_best, R_best, offline_scores
def eval_policy_closure(**args):
    return eval_policy(env_eval, **args)
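A minimal call sketch for the agent(...) function above; the game id and every hyperparameter value below are placeholders (any game accepted by make_game would do):

returns, timepoints, a_best, seed_best, R_best, offline_scores = agent(
    game='CartPole-v0',    # placeholder game id
    n_ep=50,               # training episodes
    n_mcts=25,             # MCTS simulations per environment step
    max_ep_len=200,
    lr=1e-3,
    c=1.5,                 # UCT exploration constant
    gamma=0.99,
    data_size=1000,
    batch_size=32,
    temp=1.0,              # temperature for the MCTS policy targets
    n_hidden_layers=2,
    n_hidden_units=128,
    eval_freq=10,
    eval_episodes=20,
    out_dir='out/')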
Example #3
def agent(game,
          n_ep,
          n_mcts,
          max_ep_len,
          lr,
          c,
          gamma,
          data_size,
          batch_size,
          temp,
          n_hidden_layers,
          n_hidden_units,
          stochastic=False,
          eval_freq=-1,
          eval_episodes=100,
          alpha=0.6,
          n_epochs=100,
          c_dpw=1,
          numpy_dump_dir='../',
          pre_process=None,
          visualize=False,
          game_params={},
          parallelize_evaluation=False,
          mcts_only=False,
          particles=0,
          show_plots=False,
          n_workers=1,
          use_sampler=False,
          budget=np.inf,
          unbiased=False,
          biased=False,
          max_workers=100,
          variance=False,
          depth_based_bias=False,
          scheduler_params=None,
          out_dir=None,
          render=False,
          second_version=False,
          third_version=False):
    visualizer = None

    # if particles:
    #     parallelize_evaluation = False  # Cannot run parallelized evaluation with particle filtering

    if not mcts_only:
        from mcts import MCTS
        from mcts_dpw import MCTSStochastic
    elif particles:
        if unbiased:
            from particle_filtering.ol_uct import OL_MCTS
        elif biased:
            if second_version:
                from particle_filtering.pf_uct_2 import PFMCTS2 as PFMCTS
            elif third_version:
                from particle_filtering.pf_uct_3 import PFMCTS3 as PFMCTS
            else:
                from particle_filtering.pf_uct import PFMCTS
        else:
            from particle_filtering.pf_mcts_edo import PFMCTS
    else:
        from pure_mcts.mcts import MCTS
        from pure_mcts.mcts_dpw import MCTSStochastic

    if parallelize_evaluation:
        print("The evaluation will be parallel")

    parameter_list = {
        "game": game,
        "n_ep": n_ep,
        "n_mcts": n_mcts,
        "max_ep_len": max_ep_len,
        "lr": lr,
        "c": c,
        "gamma": gamma,
        "data_size": data_size,
        "batch_size": batch_size,
        "temp": temp,
        "n_hidden_layers": n_hidden_layers,
        "n_hidden_units": n_hidden_units,
        "stochastic": stochastic,
        "eval_freq": eval_freq,
        "eval_episodes": eval_episodes,
        "alpha": alpha,
        "n_epochs": n_epochs,
        "out_dir": numpy_dump_dir,
        "pre_process": pre_process,
        "visualize": visualize,
        "game_params": game_params,
        "n_workers": n_workers,
        "use_sampler": use_sampler,
        "variance": variance,
        "depth_based_bias": depth_based_bias,
        "unbiased": unbiased,
        "second_version": second_version,
        'third_version': third_version
    }
    if out_dir is not None:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        with open(os.path.join(out_dir, "parameters.txt"), 'w') as d:
            d.write(json.dumps(parameter_list))
    #logger = Logger(parameter_list, game, show=show_plots)

    if DEBUG_TAXI:
        from utils.visualization.taxi import TaxiVisualizer
        with open(game_params["grid"]) as f:
            m = f.readlines()
            matrix = []
            for r in m:
                row = []
                for ch in r.strip('\n'):
                    row.append(ch)
                matrix.append(row)
            visualizer = TaxiVisualizer(matrix)
            f.close()
            exit()
    ''' Outer training loop '''
    if pre_process is not None:
        pre_process()

    # numpy_dump_dir = logger.numpy_dumps_dir
    #
    # if not os.path.exists(numpy_dump_dir):
    #     os.makedirs(numpy_dump_dir)

    episode_returns = []  # storage
    timepoints = []

    # Environments
    if game == 'Trading-v0':
        game_params['save_dir'] = out_dir  #logger.save_dir
    Env = make_game(game, game_params)
    num_actions = Env.action_space.n
    sampler = None
    if use_sampler and not (unbiased or biased):

        def make_pi(action_space):
            def pi(s):
                return np.random.randint(low=0, high=action_space.n)

            return pi

        def make_env():
            return make_game(game, game_params)

        sampler = ParallelSampler(make_pi=make_pi,
                                  make_env=make_env,
                                  n_particles=particles,
                                  n_workers=n_workers,
                                  seed=10)

    is_atari = is_atari_game(Env)
    mcts_env = make_game(game, game_params) if is_atari else None
    online_scores = []
    offline_scores = []

    # Setup the parameters for generating the search environments

    if game == "RaceStrategy-v1":
        mcts_maker, mcts_params, c_dpw = load_race_agents_config(
            'envs/configs/race_strategy_full.json', gamma)

    else:
        mcts_params = dict(gamma=gamma)
        if particles:
            if not (biased or unbiased):
                mcts_params['particles'] = particles
                mcts_params['sampler'] = sampler
                mcts_maker = PFMCTS
            elif biased:
                mcts_params['alpha'] = alpha
                mcts_maker = PFMCTS

            mcts_params['depth_based_bias'] = depth_based_bias
            if unbiased:
                mcts_params['variance'] = variance
                mcts_maker = OL_MCTS

        elif stochastic:
            mcts_params['alpha'] = alpha
            mcts_params['depth_based_bias'] = depth_based_bias
            mcts_maker = MCTSStochastic
        else:
            mcts_maker = MCTS

    # Prepare the database for storing training data to be sampled
    db = Database(max_size=data_size, batch_size=batch_size)

    # TODO extract dimensions to avoid allocating model
    # Setup the model
    model_params = {
        "Env": Env,
        "lr": lr,
        "n_hidden_layers": n_hidden_layers,
        "n_hidden_units": n_hidden_units,
        "joint_networks": True
    }

    model_wrapper = ModelWrapper(**model_params)

    t_total = 0  # total steps
    R_best = -np.Inf
    a_best = None
    seed_best = None

    # Variables for storing values to be plotted
    avgs = []
    stds = []

    # Run the episodes
    for ep in range(n_ep):

        if DEBUG_TAXI:
            visualizer.reset()

        ##### Policy evaluation step #####
        if eval_freq > 0 and ep % eval_freq == 0:  # and ep > 0
            print(
                '--------------------------------\nEvaluating policy for {} episodes!'
                .format(eval_episodes))
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            s = Env.reset()

            if parallelize_evaluation:
                penv = None
                pgame = {
                    "game_maker": make_game,
                    "game": game,
                    "game_params": game_params
                }
            else:
                penv = Env
                pgame = None

            model_file = os.path.join(out_dir, "model.h5")

            # model_wrapper.save(model_file)

            if game == "RaceStrategy-v1":
                env_wrapper = RaceWrapper(s,
                                          mcts_maker,
                                          model_file,
                                          model_params,
                                          mcts_params,
                                          is_atari,
                                          n_mcts,
                                          budget,
                                          mcts_env,
                                          c_dpw,
                                          temp,
                                          env=penv,
                                          game_maker=pgame,
                                          mcts_only=mcts_only,
                                          scheduler_params=scheduler_params)
            else:
                env_wrapper = Wrapper(s,
                                      mcts_maker,
                                      model_file,
                                      model_params,
                                      mcts_params,
                                      is_atari,
                                      n_mcts,
                                      budget,
                                      mcts_env,
                                      c_dpw,
                                      temp,
                                      env=penv,
                                      game_maker=pgame,
                                      mcts_only=mcts_only,
                                      scheduler_params=scheduler_params)

            # Run the evaluation
            if parallelize_evaluation:
                total_reward, reward_per_timestep, lens, action_counts = \
                    parallelize_eval_policy(env_wrapper, n_episodes=eval_episodes, verbose=False, max_len=max_ep_len,
                                            max_workers=max_workers, out_dir=out_dir)
            else:
                total_reward, reward_per_timestep, lens, action_counts = \
                    eval_policy(env_wrapper, n_episodes=eval_episodes, verbose=False, max_len=max_ep_len,
                                visualize=visualize, out_dir=out_dir, render=render)

            # offline_scores.append([np.min(rews), np.max(rews), np.mean(rews), np.std(rews),
            #                        len(rews), np.mean(lens)])

            offline_scores.append(
                [total_reward, reward_per_timestep, lens, action_counts])

            #np.save(numpy_dump_dir + '/offline_scores.npy', offline_scores)

            # Store and plot data
            avgs.append(np.mean(total_reward))
            stds.append(np.std(total_reward))

            #logger.plot_evaluation_mean_and_variance(avgs, stds)

        ##### Policy improvement step #####

        if not mcts_only:

            start = time.time()
            s = start_s = Env.reset()
            R = 0.0  # Total return counter
            a_store = []
            seed = np.random.randint(1e7)  # draw some Env seed
            Env.seed(seed)
            if is_atari:
                mcts_env.reset()
                mcts_env.seed(seed)

            if eval_freq > 0 and ep % eval_freq == 0:
                print("\nCollecting %d episodes" % eval_freq)
            mcts = mcts_maker(
                root_index=s,
                root=None,
                model=model_wrapper,
                na=model_wrapper.action_dim,
                **mcts_params)  # the object responsible for MCTS searches

            print("\nPerforming MCTS steps\n")

            ep_steps = 0
            start_targets = []

            for st in range(max_ep_len):

                print_step = max(1, max_ep_len // 10)  # avoid modulo by zero when max_ep_len < 10
                if st % print_step == 0:
                    print('Step ' + str(st + 1) + ' of ' + str(max_ep_len))

                # MCTS step
                if not is_atari:
                    mcts_env = None
                mcts.search(n_mcts=n_mcts, c=c, Env=Env,
                            mcts_env=mcts_env)  # perform a forward search

                if visualize:
                    mcts.visualize()

                state, pi, V = mcts.return_results(
                    temp)  # extract the root output

                # Save targets for starting state to debug
                if np.array_equal(start_s, state):
                    if DEBUG:
                        print("Pi target for starting state:", pi)
                    start_targets.append((V, pi))
                db.store((state, V, pi))

                # Make the true step
                a = np.random.choice(len(pi), p=pi)
                a_store.append(a)

                s1, r, terminal, _ = Env.step(a)

                # Perform command line visualization if necessary
                if DEBUG_TAXI:
                    olds, olda = copy.deepcopy(s1), copy.deepcopy(a)
                    visualizer.visualize_taxi(olds, olda)
                    print("Reward:", r)

                R += r
                t_total += n_mcts  # total number of environment steps (counts the mcts steps)
                ep_steps = st + 1

                if terminal:
                    break  # Stop the episode if we encounter a terminal state
                else:
                    mcts.forward(a, s1, r)  # Otherwise proceed

            # Finished episode
            if DEBUG:
                print("Train episode return:", R)
                print("Train episode actions:", a_store)
            episode_returns.append(R)  # store the total episode return
            online_scores.append(R)
            timepoints.append(
                t_total)  # store the timestep count of the episode return
            #store_safely(numpy_dump_dir, '/result', {'R': episode_returns, 't': timepoints})
            #np.save(numpy_dump_dir + '/online_scores.npy', online_scores)

            if DEBUG or True:
                print(
                    'Finished episode {} in {} steps, total return: {}, total time: {} sec'
                    .format(ep, ep_steps, np.round(R, 2),
                            np.round((time.time() - start), 1)))
            # Plot the online return over training episodes

            #logger.plot_online_return(online_scores)

            if R > R_best:
                a_best = a_store
                seed_best = seed
                R_best = R

            print()

            # Train only if the model has to be used
            if not mcts_only:
                # Train
                try:
                    print("\nTraining network")
                    ep_V_loss = []
                    ep_pi_loss = []

                    for _ in range(n_epochs):
                        # Reshuffle the dataset at each epoch
                        db.reshuffle()

                        batch_V_loss = []
                        batch_pi_loss = []

                        # Batch training
                        for sb, Vb, pib in db:

                            if DEBUG:
                                print("sb:", sb)
                                print("Vb:", Vb)
                                print("pib:", pib)

                            loss = model_wrapper.train(sb, Vb, pib)

                            batch_V_loss.append(loss[1])
                            batch_pi_loss.append(loss[2])

                        ep_V_loss.append(mean(batch_V_loss))
                        ep_pi_loss.append(mean(batch_pi_loss))

                    # Plot the loss over training epochs

                    #logger.plot_loss(ep, ep_V_loss, ep_pi_loss)

                except Exception as e:
                    print("Something wrong while training:", e)

                # model.save(out_dir + 'model')

                # Plot the loss over different episodes
                #logger.plot_training_loss_over_time()

                pi_start = model_wrapper.predict_pi(start_s)
                V_start = model_wrapper.predict_V(start_s)

                print("\nStart policy: ", pi_start)
                print("Start value:", V_start)

                #logger.log_start(ep, pi_start, V_start, start_targets)

    # Return results
    if use_sampler:
        sampler.close()
    return episode_returns, timepoints, a_best, seed_best, R_best, offline_scores
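The Database used by both training loops is assumed to behave roughly like the replay buffer sketched below (an illustration of the store/reshuffle/iterate pattern implied by the calls above, not the original implementation):

import random

class Database:
    ''' Minimal (state, V, pi) replay-buffer sketch '''

    def __init__(self, max_size, batch_size):
        self.max_size = max_size
        self.batch_size = batch_size
        self.experience = []

    def store(self, experience):
        if len(self.experience) >= self.max_size:
            self.experience.pop(0)  # drop the oldest entry
        self.experience.append(experience)

    def reshuffle(self):
        random.shuffle(self.experience)

    def __iter__(self):
        # yield (states, values, policy targets) mini-batches
        for i in range(0, len(self.experience), self.batch_size):
            batch = self.experience[i:i + self.batch_size]
            yield tuple(map(list, zip(*batch)))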
def learn(
        *,
        network,
        env,
        eval_policy,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        checkpoint_path_in=None,
        checkpoint_dir_out=None,
        checkpoint_freq=100,  # in iterations
        from_iter=0,
        eval_episodes=20,
        **network_kwargs):
    '''
    Learn a policy function with the TRPO algorithm.

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments, or one wrapped via a baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the Adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)

    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    # ob_space = Box(low=-np.inf, high=np.inf, shape=(env.observation_space.n,))
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)
    # Loading checkpoint
    if checkpoint_path_in is not None and os.path.isfile(checkpoint_path_in):
        pi.load(checkpoint_path_in)
        logger.log('Loaded policy weights from %s' % checkpoint_path_in)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    # s = env.reset()
    # start = time.time()
    # for i in range(10000):
    #     pi.step(s, stochastic=True)
    # duration = time.time() - start
    # print(duration)
    # return
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     gamma=gamma)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    iters_eval = 0
    all_logs = []
    best_rew = -np.inf

    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    online_scores = []
    offline_scores = []
    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        if iters_so_far % checkpoint_freq == 0 and checkpoint_dir_out is not None:
            if not os.path.exists(checkpoint_dir_out):
                os.makedirs(checkpoint_dir_out)
            pi.save(
                os.path.join(checkpoint_dir_out,
                             'checkpoint_%d' % iters_so_far))
            logger.log('Saved policy weights as %s' % os.path.join(
                checkpoint_dir_out, 'checkpoint_%d.npy' % iters_so_far))

            def pi_wrapper(ob):
                ac, vpred, _, _ = pi.step(ob, stochastic=True)
                return ac

            rew, _, logs, disc_rets, num_stops, avg_damages = eval_policy(
                pi=pi_wrapper, n_episodes=eval_episodes, verbose=True)
            offline_scores.append(
                [np.mean(disc_rets),
                 np.mean(num_stops),
                 np.mean(avg_damages)])
            np.save(os.path.join(checkpoint_dir_out, 'offline_scores.npy'),
                    offline_scores)
            for log in logs:
                log['iter'] = iters_eval
            all_logs = all_logs + logs

            iters_eval += 1

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update

        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        ep_rew_mean = np.mean(rewbuffer)
        online_scores.append(ep_rew_mean)
        np.save(os.path.join(checkpoint_dir_out, 'online_scores.npy'),
                online_scores)
        # Saving best
        if iters_so_far % checkpoint_freq == 0 and ep_rew_mean > best_rew and checkpoint_dir_out is not None:
            pi.save(os.path.join(checkpoint_dir_out, 'best'))
            best_rew = ep_rew_mean
            logger.log('Saved policy weights as %s' %
                       os.path.join(checkpoint_dir_out, 'best.npy'))

        if rank == 0:
            logger.dump_tabular()

    return pi
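A minimal call sketch for learn(...); the environment, the 'mlp' network choice and the dummy eval_policy are assumptions, shown only to illustrate the expected interfaces (eval_policy must return rewards, a placeholder, a list of per-episode log dicts, discounted returns, stop counts and average damages, as unpacked in the checkpoint branch above):

import gym

def dummy_eval_policy(pi, n_episodes=20, verbose=False):
    # Placeholder evaluation returning zeroed statistics in the shape
    # expected by the checkpoint branch of learn()
    zeros = [0.0] * n_episodes
    return zeros, None, [{} for _ in range(n_episodes)], zeros, zeros, zeros

env = gym.make('CartPole-v1')
pi = learn(network='mlp',
           env=env,
           eval_policy=dummy_eval_policy,
           total_timesteps=100000,
           timesteps_per_batch=1024,
           gamma=0.99,
           checkpoint_dir_out='checkpoints/',
           checkpoint_freq=50)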