Example No. 1
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
    savedir_fname = learn(env,
                          policy_fn,
                          dataset,
                          max_iters=args.BC_max_iter,
                          ckpt_dir=args.checkpoint_dir,
                          log_dir=args.log_dir,
                          task_name=task_name,
                          verbose=True)
    avg_len, avg_ret = runner(env,
                              policy_fn,
                              savedir_fname,
                              timesteps_per_batch=1024,
                              number_trajs=10,
                              stochastic_policy=args.stochastic_policy,
                              save=args.save_sample,
                              reuse=True)
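The main() above reads its configuration from an args namespace with fields such as env_id, seed, expert_path, checkpoint_dir, log_dir, policy_hidden_size, traj_limitation, BC_max_iter, stochastic_policy, and save_sample. A minimal argparse sketch that could drive it; the flag names mirror those attributes, while the defaults are illustrative assumptions rather than the original script's values:

import argparse

def argsparser():
    # Hypothetical parser; defaults are placeholders, not the original script's settings.
    parser = argparse.ArgumentParser("Behavior-cloning runner (sketch)")
    parser.add_argument('--env_id', type=str, default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--expert_path', type=str, default='data/expert_trajectories.npz')
    parser.add_argument('--checkpoint_dir', type=str, default='checkpoint')
    parser.add_argument('--log_dir', type=str, default='log')
    parser.add_argument('--policy_hidden_size', type=int, default=100)
    parser.add_argument('--traj_limitation', type=int, default=-1)
    parser.add_argument('--BC_max_iter', type=int, default=10000)
    parser.add_argument('--stochastic_policy', action='store_true')
    parser.add_argument('--save_sample', action='store_true')
    return parser.parse_args()

if __name__ == '__main__':
    main(argsparser())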
Example No. 2
def test_coexistence(learn_fn, network_fn):
    '''
    Test if more than one model can exist at a time
    '''

    if learn_fn == 'deepq':
            # TODO enable multiple DQN models to be useable at the same time
            # github issue https://github.com/openai/baselines/issues/656
            return

    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
            # TODO make acktr work with recurrent policies
            # and test
            # github issue: https://github.com/openai/baselines/issues/660
            return

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
    model2.step(env.observation_space.sample())
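In the baselines test suite, a test like this is driven by pytest parametrization over the keys of the learn_kwargs and network_kwargs dictionaries it reads. A minimal sketch of that wiring; the concrete algorithm and network entries below are illustrative assumptions, not the real dictionaries:

import pytest

# Illustrative stand-ins for the module-level dictionaries the test indexes into.
learn_kwargs = {'a2c': {}, 'ppo2': {'nminibatches': 1, 'nsteps': 10}}
network_kwargs = {'mlp': {}, 'cnn_lstm': {'nlstm': 8}}

@pytest.mark.parametrize("learn_fn", sorted(learn_kwargs.keys()))
@pytest.mark.parametrize("network_fn", sorted(network_kwargs.keys()))
def test_coexistence(learn_fn, network_fn):
    ...  # body as in the example above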
Example No. 3
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, 
            optim_stepsize=3e-4, 
            optim_batchsize=64, 
            gamma=0.99, 
            lam=0.95,
            schedule='linear',
        )
    env.close()
    if model_path:
        U.save_state(model_path)
        
    return pi
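train() wraps the environment in RewScale(env, 0.1) before learning. A minimal gym.RewardWrapper consistent with that call, assuming the wrapper does nothing beyond multiplying every reward by a constant:

import gym

class RewScale(gym.RewardWrapper):
    """Scale every reward by a constant factor."""
    def __init__(self, env, scale):
        gym.RewardWrapper.__init__(self, env)
        self.scale = scale

    def reward(self, r):
        return r * self.scale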
Example No. 4
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    print('Evaluating {}'.format(args.env))
    bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
                          args.stochastic_policy, False, 'BC')
    print('Evaluation for {}'.format(args.env))
    print(bc_log)
    gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
                            args.stochastic_policy, True, 'gail')
    print('Evaluation for {}'.format(args.env))
    print(gail_log)
    plot(args.env, bc_log, gail_log, args.stochastic_policy)
Example No. 5
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
        )
    env.close()
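A small command-line wrapper could drive this train() in the style of the other runners above. The parser below is a sketch: its flag names and defaults are assumptions, and it assumes the module imports baselines' logger as the neighbouring examples do:

import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    args = parser.parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

if __name__ == '__main__':
    main()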
Example No. 6
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()
Example No. 7
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''


    if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']:
            # TODO make acktr work with recurrent policies
            # and test
            # github issue: https://github.com/openai/baselines/issues/660
            return

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])


    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(v, variables_dict2[k], atol=0.01,
                err_msg='saved and loaded variable {} value mismatch'.format(k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)
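The test relies on two small helpers, _serialize_variables and _get_action_stats. A plausible implementation consistent with how they are used above (a sketch: the trial count and the recurrent-state handling are assumptions):

import numpy as np
import tensorflow as tf
from baselines.common.tf_util import get_session

def _serialize_variables():
    # Snapshot every trainable variable in the current session, keyed by name.
    sess = get_session()
    variables = tf.trainable_variables()
    values = sess.run(variables)
    return {var.name: value for var, value in zip(variables, values)}

def _get_action_stats(model, ob):
    # Sample the policy repeatedly on one observation and report mean/std of the actions.
    ntrials = 1000
    if model.initial_state is None or model.initial_state == []:
        actions = np.array([model.step(ob)[0] for _ in range(ntrials)])
    else:
        actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0]
                            for _ in range(ntrials)])
    return np.mean(actions, axis=0), np.std(actions, axis=0)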
Example No. 8
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
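get_learn_function resolves an algorithm name such as 'a2c' or 'deepq' to that algorithm's learn callable. A sketch of how it can be implemented by importing the matching baselines submodule; treat the helper name and module layout as assumptions:

from importlib import import_module

def get_alg_module(alg, submodule=None):
    # e.g. 'a2c' -> baselines.a2c.a2c
    submodule = submodule or alg
    return import_module('.'.join(['baselines', alg, submodule]))

def get_learn_function(alg):
    return get_alg_module(alg).learn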
Example No. 9
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and 
        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn, 
            max_timesteps=num_timesteps,
            timesteps_per_batch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
        )
    env.close()
Example No. 10
def main():
    set_global_seeds(1)
    args = parse_args()
    with U.make_session(4) as sess:  # noqa
        _, env = make_env(args.env)
        act = deepq.build_act(
            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
            q_func=dueling_model if args.dueling else model,
            num_actions=env.action_space.n)

        U.load_state(os.path.join(args.model_dir, "saved"))
        wang2015_eval(args.env, act, stochastic=args.stochastic)
Example No. 11
def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)

    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
    # learn_fn(env=env_test)
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3)
Example No. 12
  def load(path, act_params, num_cpu=16):
    with open(path, "rb") as f:
      model_data = dill.load(f)
    act = deepq.build_act(**act_params)
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
      arc_path = os.path.join(td, "packed.zip")
      with open(arc_path, "wb") as f:
        f.write(model_data)

      zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
      U.load_state(os.path.join(td, "model"))

    return ActWrapper(act)
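load() above expects a dill pickle containing the raw bytes of a zip archive of a TensorFlow checkpoint. A save counterpart consistent with that format, written as a sketch: it assumes the same module-level imports as load() (os, tempfile, zipfile, dill, and baselines' tf_util as U), and the archive walk and pickle layout are assumptions:

  def save(self, path):
    """Pack the current TensorFlow state into a zip and dill-pickle its bytes to `path`."""
    with tempfile.TemporaryDirectory() as td:
      U.save_state(os.path.join(td, "model"))
      arc_name = os.path.join(td, "packed.zip")
      with zipfile.ZipFile(arc_name, 'w') as zipf:
        for root, dirs, files in os.walk(td):
          for fname in files:
            file_path = os.path.join(root, fname)
            if file_path != arc_name:
              zipf.write(file_path, os.path.relpath(file_path, td))
      with open(arc_name, "rb") as f:
        model_data = f.read()
    with open(path, "wb") as f:
      dill.dump(model_data, f)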
Example No. 13
def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out,
                                     num_outputs=64,
                                     activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out,
                                     num_outputs=num_actions,
                                     activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session():
        # Create the environment
        env = gym.make("Acrobot-v1")

        exp_demo = []
        temp_list = []
        N = 1000

        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space,
                                                      name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
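The __main__ block above stops right after deepq.build_train. A continuation sketch in the style of the baselines custom-cartpole example, showing how act, train, and update_target are typically combined with a replay buffer and an epsilon schedule; the buffer size, schedule, and update intervals are illustrative assumptions:

        from baselines.deepq.replay_buffer import ReplayBuffer
        from baselines.common.schedules import LinearSchedule
        import numpy as np

        # Replay buffer and linearly annealed epsilon-greedy exploration (assumed sizes).
        replay_buffer = ReplayBuffer(50000)
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
        U.initialize()
        update_target()

        obs = env.reset()
        for t in range(100000):
            # Act with the current exploration rate and store the transition.
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs if not done else env.reset()
            if t > 1000:
                # Minimize the Bellman error on a sampled batch; sync the target net periodically.
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
                if t % 1000 == 0:
                    update_target()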
Example No. 14
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          callback=None,
          trained_model=None):
    """Train a deepq model.

    Parameters
    -------
    env : gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of training. If you do not wish to restore the best version at
        the end of training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }
    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None

    obs, starttime, durationtime = env.reset()  # this env's reset() returns (obs, start time, duration), not a bare observation

    #i = 0
    #noise = 0.01 * np.random.randn(4,8,301)
    #np.save("./noise", noise)
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            action = act(np.array(obs)[None],
                         update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs  #* (1 + noise[:,:,i].flatten())
            #i += 1

            episode_rewards[-1] += rew
            if done:
                #i = 0
                obs, starttime, durationtime = env.reset()
                episode_rewards.append(0.0)
            """
            obs = env.reset()
            with tempfile.TemporaryDirectory() as td:
                model_saved = False
                model_file = os.path.join(td, "model")
                for t in range(max_timesteps):
                    if callback is not None:
                        if callback(locals(), globals()):
                            break
                    # Take action and update exploration to the newest value
                    action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
                    new_obs, rew, done, _ = env.step(action)
                    # Store transition in the replay buffer.
                    replay_buffer.add(obs, action, rew, new_obs, float(done))
                    obs = new_obs
                    episode_rewards[-1] += rew
                    if done:
                        obs = env.reset()
                        episode_rewards.append(0.0)
            """

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
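Note that this learn() variant, unlike the stock deepq one, expects env.reset() to return an (obs, starttime, durationtime) triple, so it targets a custom environment rather than a standard gym one.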
Example No. 15
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()
    #print(np.abs(env.action_space.low))
    #print(np.abs(env.action_space.high))
    #assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high

    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    # Note: load_memory, restore, and nproc are not parameters of train(); they are
    # assumed to be defined at module level in the original script.
    if load_memory:
        memory = pickle.load(
            open(
                "/home/vaisakhs_shaj/Desktop/BIG-DATA/memoryNorm300000.pickle",
                "rb"))
        '''
        samps = memoryPrev.sample(batch_size=memoryPrev.nb_entries)
        print(len(samps['obs0'][1]))
        for i in range(memoryPrev.nb_entries):
            memory.append(samps['obs0'][i], samps['actions'][i], samps['rewards'][i], samps['obs1'][i],  samps['terminals1'][i])
        print("=============memory loaded================")
        '''
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))
    envs = [make_env(seed) for seed in range(nproc)]
    envs = SubprocVecEnv(envs)
    '''
     # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None
    '''
    saver = tf.train.Saver()
    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=10)

    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        if restore:
            filename = r"C:\Users\DELL\Desktop\MODELS\2d\tfSteps" + str(
                25000) + ".model"
            saver.restore(sess, filename)
            print("loaded!!!!!!!!!!!!!")
            #p=[v.name for v in tf.all_variables()]
            #print(p)

        obs = envs.reset()

        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_reward3 = 0.
        episode_step = 0
        episode_step3 = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = deque(maxlen=10)
        epoch_episode_steps3 = deque(maxlen=10)
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        learning_starts = 10000
        for epoch in range(nb_epochs):
            print("cycle-memory")
            print(max_action)
            for cycle in range(nb_epoch_cycles):
                print(cycle, "-", memory.nb_entries, end=" ")
                sys.stdout.flush()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action = np.stack([
                        agent.pi(obs[i], apply_noise=True, compute_Q=False)[0]
                        for i in range(nproc)
                    ])
                    q = np.stack([
                        agent.pi(obs[i], apply_noise=True, compute_Q=True)[1]
                        for i in range(nproc)
                    ])
                    # action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    #assert action.shape == env.action_space.shape
                    #print(i)
                    # Execute next action in parallel.
                    if rank == 0 and render:
                        env.render()
                    #assert max_action.shape == action.shape
                    new_obs, r, done, info = envs.step(
                        action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    #print(r)
                    #print(r[1])
                    sys.stdout.flush()
                    episode_reward += r[1]
                    #episode_reward3 += r[2]
                    episode_step += 1
                    #episode_step3 += 1
                    '''
                    if episode_step==300:
                        e=episode_step
                        re=episode_reward
                    if episode_step>300:
                        episode_step=e
                        episode_reward=re
                    '''
                    #print(episode_step)

                    book_keeping_obs = obs
                    obs = new_obs
                    #print(envs[1])
                    #print(episode_reward)
                    # Book-keeping in parallel.
                    epoch_actions.append(np.mean(action))
                    epoch_qs.append(np.mean(q))
                    for i in range(nproc):
                        agent.store_transition(book_keeping_obs[i], action[i],
                                               r[i], new_obs[i], done[i])
                        #print(done)
                        if done[i]:
                            # Episode done.
                            #print("====done====",episode_reward)
                            if i == 1:

                                epoch_episode_rewards.append(episode_reward)
                                #rint(epoch_episode_rewards)
                                #episode_rewards_history.append(episode_reward)
                                epoch_episode_steps.append(episode_step)
                                episode_reward = 0.
                                #episode_reward3 = 0
                                episode_step = 0
                                epoch_episodes += 1
                                episodes += 1
                            '''
                            if i==2:
                                
                                #epoch_episode_rewards.append(episode_reward3)
                                #rint(epoch_episode_rewards)
                                episode_rewards_history.append(episode_reward3)
                                epoch_episode_steps3.append(episode_step3)
                                episode_reward3 = 0
                                episode_step3 = 0
                            '''

                            agent.reset()
                            temp = envs.reset()
                            obs[i] = temp[i]
                    '''
                    Variables in TensorFlow only have values inside sessions.
                    Once the session is over, the variables are lost.
                    saver,save and saver .restore depends on session and has to be inside the 
                    session.
                    '''

                    # Train.
                    epoch_actor_losses = []
                    epoch_critic_losses = []
                    epoch_adaptive_distances = []
                    for t_train in range(nb_train_steps):
                        # Adapt param noise, if necessary.
                        if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                            distance = agent.adapt_param_noise()
                            epoch_adaptive_distances.append(distance)

                        cl, al = agent.train()
                        epoch_critic_losses.append(cl)
                        epoch_actor_losses.append(al)
                        agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.
                #print(episode_rewards_history)
            if t % 7500 == 0:
                fname = "/home/vaisakhs_shaj/Desktop/BIG-DATA/memoryNorm" + str(
                    memory.nb_entries) + ".pickle"
                pickle.dump(memory, open(fname, "wb"), protocol=-1)
            if t % 5000 == 0:
                print("=======saving interim model==========")
                filename = "/home/vaisakhs_shaj/Desktop/MODEL/normal/tfSteps" + str(
                    t) + ".model"
                saver.save(sess, filename)
            mpi_size = MPI.COMM_WORLD.Get_size()

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps2'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/episode_steps3'] = np.mean(
                epoch_episode_steps3)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.

            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            print(logdir)
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example No. 16
def prepare_env(env_id, seed, num_cpu):
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    return env
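A hedged usage sketch for prepare_env: it enters a TensorFlow session, seeds the libraries, and returns a plain gym environment, so the caller can reset and step it directly (the environment id and seed below are arbitrary):

env = prepare_env('CartPole-v1', seed=0, num_cpu=1)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()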
Example No. 17
def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
  """Train a deepq model.

Parameters
-------
env: pysc2.env.SC2Env
    environment to train on
q_func: (tf.Variable, int, str, bool) -> tf.Variable
    the model that takes the following inputs:
        observation_in: object
            the output of observation placeholder
        num_actions: int
            number of actions
        scope: str
        reuse: bool
            should be passed to outer variable scope
    and returns a tensor of shape (batch_size, num_actions) with values of every action.
lr: float
    learning rate for adam optimizer
max_timesteps: int
    number of env steps to optimize for
buffer_size: int
    size of the replay buffer
exploration_fraction: float
    fraction of entire training period over which the exploration rate is annealed
exploration_final_eps: float
    final value of random action probability
train_freq: int
    update the model every `train_freq` steps.
batch_size: int
    size of a batch sampled from the replay buffer for training
print_freq: int
    how often to print out training progress
    set to None to disable printing
checkpoint_freq: int
    how often to save the model. This is so that the best version is restored
    at the end of training. If you do not wish to restore the best version at
    the end of training, set this variable to None.
learning_starts: int
    how many steps of the model to collect transitions for before learning starts
gamma: float
    discount factor
target_network_update_freq: int
    update the target network every `target_network_update_freq` steps.
prioritized_replay: bool
    if True prioritized replay buffer will be used.
prioritized_replay_alpha: float
    alpha parameter for prioritized replay buffer
prioritized_replay_beta0: float
    initial value of beta for prioritized replay buffer
prioritized_replay_beta_iters: int
    number of iterations over which beta will be annealed from initial value
    to 1.0. If set to None equals to max_timesteps.
prioritized_replay_eps: float
    epsilon to add to the TD errors when updating priorities.
num_cpu: int
    number of cpus to use for training
callback: (locals, globals) -> None
    function called at every step with the state of the algorithm.
    If callback returns true, training stops.

Returns
-------
act: ActWrapper
    Wrapper over act function. Adds ability to save it and load it.
    See header of baselines/deepq/categorical.py for details on the act function.
"""
  # Create all the functions necessary to train the model

  sess = U.make_session(num_cpu=num_cpu)
  sess.__enter__()

  def make_obs_ph(name):
    return U.BatchInput((32, 32), name=name)

  act, train, update_target, debug = deepq.build_train(
    make_obs_ph=make_obs_ph,
    q_func=q_func,
    num_actions=num_actions,
    optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    gamma=gamma,
    grad_norm_clipping=10,
    scope="deepq")
  #
  # act_y, train_y, update_target_y, debug_y = deepq.build_train(
  #   make_obs_ph=make_obs_ph,
  #   q_func=q_func,
  #   num_actions=num_actions,
  #   optimizer=tf.train.AdamOptimizer(learning_rate=lr),
  #   gamma=gamma,
  #   grad_norm_clipping=10,
  #   scope="deepq_y"
  # )

  act_params = {
    'make_obs_ph': make_obs_ph,
    'q_func': q_func,
    'num_actions': num_actions,
  }

  # Create the replay buffer
  if prioritized_replay:
    replay_buffer = PrioritizedReplayBuffer(
      buffer_size, alpha=prioritized_replay_alpha)
    # replay_buffer_y = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)

    if prioritized_replay_beta_iters is None:
      prioritized_replay_beta_iters = max_timesteps
    beta_schedule = LinearSchedule(
      prioritized_replay_beta_iters,
      initial_p=prioritized_replay_beta0,
      final_p=1.0)

    # beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
    #                                  initial_p=prioritized_replay_beta0,
    #                                  final_p=1.0)
  else:
    replay_buffer = ReplayBuffer(buffer_size)
    # replay_buffer_y = ReplayBuffer(buffer_size)

    beta_schedule = None
    # beta_schedule_y = None
  # Create the schedule for exploration starting from 1.
  exploration = LinearSchedule(
    schedule_timesteps=int(exploration_fraction * max_timesteps),
    initial_p=1.0,
    final_p=exploration_final_eps)

  # Initialize the parameters and copy them to the target network.
  U.initialize()
  update_target()
  # update_target_y()

  episode_rewards = [0.0]
  saved_mean_reward = None

  obs = env.reset()
  # Select all marines first
  obs = env.step(
    actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

  player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

  screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  #+ path_memory

  player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
  player = [int(player_x.mean()), int(player_y.mean())]

  if (player[0] > 16):
    screen = shift(LEFT, player[0] - 16, screen)
  elif (player[0] < 16):
    screen = shift(RIGHT, 16 - player[0], screen)

  if (player[1] > 16):
    screen = shift(UP, player[1] - 16, screen)
  elif (player[1] < 16):
    screen = shift(DOWN, 16 - player[1], screen)

  reset = True
  with tempfile.TemporaryDirectory() as td:
    model_saved = False
    model_file = os.path.join("model/", "mineral_shards")
    print(model_file)

    for t in range(max_timesteps):
      if callback is not None:
        if callback(locals(), globals()):
          break
      # Take action and update exploration to the newest value
      kwargs = {}
      if not param_noise:
        update_eps = exploration.value(t)
        update_param_noise_threshold = 0.
      else:
        update_eps = 0.
        if param_noise_threshold >= 0.:
          update_param_noise_threshold = param_noise_threshold
        else:
          # Compute the threshold such that the KL divergence between perturbed and non-perturbed
          # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
          # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
          # for detailed explanation.
          update_param_noise_threshold = -np.log(
            1. - exploration.value(t) +
            exploration.value(t) / float(num_actions))
        kwargs['reset'] = reset
        kwargs[
          'update_param_noise_threshold'] = update_param_noise_threshold
        kwargs['update_param_noise_scale'] = True

      action = act(
        np.array(screen)[None], update_eps=update_eps, **kwargs)[0]

      # action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]

      reset = False

      coord = [player[0], player[1]]
      rew = 0

      if (action == 0):  #UP

        if (player[1] >= 8):
          coord = [player[0], player[1] - 8]
          #path_memory_[player[1] - 16 : player[1], player[0]] = -1
        elif (player[1] > 0):
          coord = [player[0], 0]
          #path_memory_[0 : player[1], player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 1):  #DOWN

        if (player[1] <= 23):
          coord = [player[0], player[1] + 8]
          #path_memory_[player[1] : player[1] + 16, player[0]] = -1
        elif (player[1] > 23):
          coord = [player[0], 31]
          #path_memory_[player[1] : 63, player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 2):  #LEFT

        if (player[0] >= 8):
          coord = [player[0] - 8, player[1]]
          #path_memory_[player[1], player[0] - 16 : player[0]] = -1
        elif (player[0] < 8):
          coord = [0, player[1]]
          #path_memory_[player[1], 0 : player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 3):  #RIGHT

        if (player[0] <= 23):
          coord = [player[0] + 8, player[1]]
          #path_memory_[player[1], player[0] : player[0] + 16] = -1
        elif (player[0] > 23):
          coord = [31, player[1]]
          #path_memory_[player[1], player[0] : 63] = -1

      if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
        obs = env.step(actions=[
          sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        ])

      new_action = [
        sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
      ]

      # else:
      #   new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

      obs = env.step(actions=new_action)

      player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
      new_screen = (player_relative == _PLAYER_NEUTRAL).astype(
        int)  #+ path_memory

      player_y, player_x = (
        player_relative == _PLAYER_FRIENDLY).nonzero()
      player = [int(player_x.mean()), int(player_y.mean())]

      if (player[0] > 16):
        new_screen = shift(LEFT, player[0] - 16, new_screen)
      elif (player[0] < 16):
        new_screen = shift(RIGHT, 16 - player[0], new_screen)

      if (player[1] > 16):
        new_screen = shift(UP, player[1] - 16, new_screen)
      elif (player[1] < 16):
        new_screen = shift(DOWN, 16 - player[1], new_screen)

      rew = obs[0].reward

      done = obs[0].step_type == environment.StepType.LAST

      # Store transition in the replay buffer.
      replay_buffer.add(screen, action, rew, new_screen, float(done))
      # replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

      screen = new_screen

      episode_rewards[-1] += rew
      reward = episode_rewards[-1]

      if done:
        obs = env.reset()
        player_relative = obs[0].observation["screen"][
          _PLAYER_RELATIVE]

        screen = (player_relative == _PLAYER_NEUTRAL).astype(
          int)  #+ path_memory

        player_y, player_x = (
          player_relative == _PLAYER_FRIENDLY).nonzero()
        player = [int(player_x.mean()), int(player_y.mean())]

        # Select all marines first
        env.step(actions=[
          sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        ])
        episode_rewards.append(0.0)
        #episode_minerals.append(0.0)

        reset = True

      if t > learning_starts and t % train_freq == 0:
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if prioritized_replay:

          experience = replay_buffer.sample(
            batch_size, beta=beta_schedule.value(t))
          (obses_t, actions, rewards, obses_tp1, dones, weights,
           batch_idxes) = experience

          # experience_y = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
          # (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y
        else:

          obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
            batch_size)
          weights, batch_idxes = np.ones_like(rewards), None

          # obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(batch_size)
          # weights_y, batch_idxes_y = np.ones_like(rewards_y), None

        td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                          weights)

        # td_errors_y = train_x(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y)

        if prioritized_replay:
          new_priorities = np.abs(td_errors) + prioritized_replay_eps
          # new_priorities = np.abs(td_errors) + prioritized_replay_eps
          replay_buffer.update_priorities(batch_idxes,
                                          new_priorities)
          # replay_buffer.update_priorities(batch_idxes, new_priorities)

      if t > learning_starts and t % target_network_update_freq == 0:
        # Update target network periodically.
        update_target()
        # update_target_y()

      mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
      num_episodes = len(episode_rewards)
      if done and print_freq is not None and len(
          episode_rewards) % print_freq == 0:
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("reward", reward)
        logger.record_tabular("mean 100 episode reward",
                              mean_100ep_reward)
        logger.record_tabular("% time spent exploring",
                              int(100 * exploration.value(t)))
        logger.dump_tabular()

      if (checkpoint_freq is not None and t > learning_starts
          and num_episodes > 100 and t % checkpoint_freq == 0):
        if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
          if print_freq is not None:
            logger.log(
              "Saving model due to mean reward increase: {} -> {}".
                format(saved_mean_reward, mean_100ep_reward))
          U.save_state(model_file)
          model_saved = True
          saved_mean_reward = mean_100ep_reward
    if model_saved:
      if print_freq is not None:
        logger.log("Restored model with mean reward: {}".format(
          saved_mean_reward))
      U.load_state(model_file)

  return ActWrapper(act)
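The loop above recentres the screen on the marine with calls like shift(LEFT, player[0] - 16, screen). One plausible implementation based on np.roll; the direction constants and the choice of zero as the fill value for vacated cells are assumptions:

import numpy as np

UP, DOWN, LEFT, RIGHT = 'up', 'down', 'left', 'right'

def shift(direction, number, matrix):
  """Shift a 2D screen array by `number` cells in `direction`, zero-filling vacated cells."""
  if number <= 0:
    return matrix
  if direction == UP:
    matrix = np.roll(matrix, -number, axis=0)
    matrix[-number:, :] = 0
  elif direction == DOWN:
    matrix = np.roll(matrix, number, axis=0)
    matrix[:number, :] = 0
  elif direction == LEFT:
    matrix = np.roll(matrix, -number, axis=1)
    matrix[:, -number:] = 0
  elif direction == RIGHT:
    matrix = np.roll(matrix, number, axis=1)
    matrix[:, :number] = 0
  return matrix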
Example No. 18
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          aux_apply,
          aux_tasks,
          tc_lambda,
          prop_lambda,
          caus_lambda,
          repeat_lambda,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    # Setup aux tasks' lambdas
    aux_lambdas = {
        'tc': tc_lambda,
        'prop': prop_lambda,
        'caus': caus_lambda,
        'repeat': repeat_lambda
    }

    # Create agent
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 aux_tasks=aux_tasks,
                 aux_lambdas=aux_lambdas)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            ep_rollout_times = []
            ep_train_times = []
            for cycle in range(nb_epoch_cycles):
                rollout_startt = time.time()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    #print("action mean:{} -- Q: {}".format(np.mean(action), q))

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()

                # for the first 5 cycles just gather data
                if epoch == 0 and cycle < 5:
                    continue

                train_startt = time.time()
                ep_rollout_times.append(train_startt - rollout_startt)

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_aux_losses = {}
                epoch_aux_losses['grads/actor_grads'] = []
                epoch_aux_losses['grads/critic_grads'] = []
                epoch_aux_losses['grads/aux_grads'] = []
                for name in aux_tasks:
                    epoch_aux_losses['aux/' + name] = []
                epoch_adaptive_distances = []

                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, auxl = agent.train()

                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    for name, value in auxl.items():
                        if 'grads' in name:
                            epoch_aux_losses['grads/' + name].append(
                                np.abs(value))
                        else:
                            epoch_aux_losses['aux/' + name].append(
                                np.abs(value))

                    agent.update_target_net()  # soft (Polyak-averaged) update of the target actor/critic

                ep_train_times.append(time.time() - train_startt)

                if eval_env is not None:
                    # Evaluate.
                    eval_episode_rewards = []
                    eval_qs = []
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            print('rollout avg time (s): {}'.format(np.mean(ep_rollout_times)))
            print('train avg time (s): {}'.format(np.mean(ep_train_times)))
            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Auxiliary statistics.
            if aux_tasks is not None:
                for name, values in epoch_aux_losses.items():
                    combined_stats[name] = np.mean(values)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
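
The statistics block above sums every scalar in combined_stats across MPI workers with allreduce and then divides by the world size, so each logged value is the mean over ranks. A minimal standalone sketch of that pattern (illustrative only; mpi_average_stats is not part of the example above):

import numpy as np
from mpi4py import MPI

def mpi_average_stats(stats):
    """Average a dict of scalar stats over all MPI ranks (illustrative sketch)."""
    comm = MPI.COMM_WORLD
    keys = sorted(stats.keys())                 # identical ordering on every rank
    local = np.array([float(stats[k]) for k in keys])
    summed = comm.allreduce(local)              # element-wise sum across ranks (default op is SUM)
    return {k: v / comm.Get_size() for k, v in zip(keys, summed)}
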
Example no. 19
0
def train(args):

    env_id = args.env
    num_timesteps = args.num_timesteps
    seed = args.seed

    from baselines.ppo1 import mlp_policy
    import pposgd_simple_modified_final
    U.make_session(num_cpu=1).__enter__()

    # set random seed for tf, numpy.random, random
    # in common/misc_util.py
    set_global_seeds(seed)

    def policy_fn(name, ob_space, ac_space):
        # mlp: Multi-Layer Perceptron
        # state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value)
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    # ================================== modification 1 ================================== #
    """
    ppo_learn input: replace "env" (env class) with "env_id" (string)
    				 add input "seed" (int)
    	reason: to enable env.make() during training
    	modification detail: move following lines into learn()
    		env = gym.make(env_id)
    		env = bench.Monitor(env, logger.get_dir())
    		env.seed(seed)
    		env.close()
	"""

    # ====================================== hyperparameter begins ====================================== #
    joint_optimization_iters = args.joint_iters
    design_iters = args.design_iters  # number of robots sampled when updating physical design
    policy_iters = args.policy_iters  # number of robots sampled when updating policy
    policy_episodes = 1  # for each robot, number of episodes conducted to update policy
    policy_timesteps = 1e5
    design_learning_rate = 1e-4
    # ======================================= hyperparameter ends ======================================= #
    if 'Ant' in env_id:
        robot_name = 'ant'
    elif 'Hopper' in env_id:
        robot_name = 'hopper'
    elif 'Walker' in env_id:
        robot_name = 'Walker2d'
    else:
        print('!' * 50)
        print('Unknown Environment')
        print('!' * 50)
        exit(1)

    robot = GMM(robot_name=robot_name,
                m=design_iters,
                learning_rate=design_learning_rate)

    # ================================== modification 1 ================================== #
    gym.logger.setLevel(logging.WARN)
    pposgd_simple_modified_final.learn(
        # =========== modified part begins =========== #
        env_id,
        seed,
        robot,  # robot class with GMM params
        joint_optimization_iters,  # total number of joint optimization iterations
        design_iters,  # number of samples when updating physical design in each joint optimization iteration
        policy_iters,
        # ============ modified part ends ============ #
        policy_fn,
        max_timesteps=policy_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
Example no. 20
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

        sess = tf_util.make_session()
        nbatch = nenvs*nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
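
In the train() closure above, lr.value() is called once per element of obs, so the learning-rate Scheduler advances one step per consumed environment frame and the decay stays tied to total_timesteps. A tiny illustration (assuming the Scheduler from baselines.a2c.utils, which the class above appears to rely on):

from baselines.a2c.utils import Scheduler  # assumed import path

lr_sched = Scheduler(v=7e-4, nvalues=100, schedule='linear')
first = lr_sched.value()                            # 7e-4; each call also advances the schedule by one step
last = [lr_sched.value() for _ in range(50)][-1]    # the 51st call returns about half the initial value
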
def train():
    from linear_schedule import Linear

    ledger = defaultdict(lambda: MovingAverage(Reporting.reward_average))

    M.config(file=os.path.join(RUN.log_directory, RUN.log_file))
    M.diff()

    with U.make_session(
            RUN.num_cpu), Logger(RUN.log_directory) as logger, contextify(
                gym.make(G.env_name)) as env:
        env = ScaledFloatFrame(wrap_dqn(env))

        if G.seed is not None:
            env.seed(G.seed)
        logger.log_params(G=vars(G), RUN=vars(RUN), Reporting=vars(Reporting))
        inputs = TrainInputs(action_space=env.action_space,
                             observation_space=env.observation_space)
        trainer = QTrainer(inputs=inputs,
                           action_space=env.action_space,
                           observation_space=env.observation_space)
        if G.prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(size=G.buffer_size,
                                                    alpha=G.alpha)
        else:
            replay_buffer = ReplayBuffer(size=G.buffer_size)

        class schedules:
            # note: it is important to have this start from the beginning.
            eps = Linear(G.n_timesteps * G.exploration_fraction, 1,
                         G.final_eps)
            if G.prioritized_replay:
                beta = Linear(G.n_timesteps - G.learning_start, G.beta_start,
                              G.beta_end)

        U.initialize()
        trainer.update_target()
        x = np.array(env.reset())
        ep_ind = 0
        M.tic('episode')
        for t_step in range(G.n_timesteps):
            # schedules
            eps = 0 if G.param_noise else schedules.eps[t_step]
            if G.prioritized_replay:
                beta = schedules.beta[t_step - G.learning_start]

            x0 = x
            M.tic('sample', silent=True)
            (action, *_), action_q, q = trainer.runner.act([x], eps)
            x, rew, done, info = env.step(action)
            ledger['action_q_value'].append(action_q.max())
            ledger['action_q_value/mean'].append(action_q.mean())
            ledger['action_q_value/var'].append(action_q.var())
            ledger['q_value'].append(q.max())
            ledger['q_value/mean'].append(q.mean())
            ledger['q_value/var'].append(q.var())
            ledger['timing/sample'].append(M.toc('sample', silent=True))
            # note: adding sample to the buffer is identical between the prioritized and the standard replay strategy.
            replay_buffer.add(s0=x0,
                              action=action,
                              reward=rew,
                              s1=x,
                              done=float(done))

            logger.log(
                t_step, {
                    'q_value': ledger['q_value'].latest,
                    'q_value/mean': ledger['q_value/mean'].latest,
                    'q_value/var': ledger['q_value/var'].latest,
                    'q_value/action': ledger['action_q_value'].latest,
                    'q_value/action/mean':
                    ledger['action_q_value/mean'].latest,
                    'q_value/action/var': ledger['action_q_value/var'].latest
                },
                action=action,
                eps=eps,
                silent=True)

            if G.prioritized_replay:
                logger.log(t_step, beta=beta, silent=True)

            if done:
                ledger['timing/episode'].append(M.split('episode',
                                                        silent=True))
                ep_ind += 1
                x = np.array(env.reset())
                ledger['rewards'].append(info['total_reward'])

                silent = (ep_ind % Reporting.print_interval != 0)
                logger.log(t_step,
                           timestep=t_step,
                           episode=green(ep_ind),
                           total_reward=ledger['rewards'].latest,
                           episode_length=info['timesteps'],
                           silent=silent)
                logger.log(t_step, {
                    'total_reward/mean':
                    yellow(ledger['rewards'].mean, lambda v: f"{v:.1f}"),
                    'total_reward/max':
                    yellow(ledger['rewards'].max, lambda v: f"{v:.1f}"),
                    "time_spent_exploring":
                    default(eps, percent),
                    "timing/episode":
                    green(ledger['timing/episode'].latest, sec),
                    "timing/episode/mean":
                    green(ledger['timing/episode'].mean, sec),
                },
                           silent=silent)
                try:
                    logger.log(t_step, {
                        "timing/sample":
                        default(ledger['timing/sample'].latest, sec),
                        "timing/sample/mean":
                        default(ledger['timing/sample'].mean, sec),
                        "timing/train":
                        default(ledger['timing/train'].latest, sec),
                        "timing/train/mean":
                        green(ledger['timing/train'].mean, sec),
                        "timing/log_histogram":
                        default(ledger['timing/log_histogram'].latest, sec),
                        "timing/log_histogram/mean":
                        default(ledger['timing/log_histogram'].mean, sec)
                    },
                               silent=silent)
                    if G.prioritized_replay:
                        logger.log(t_step, {
                            "timing/update_priorities":
                            default(ledger['timing/update_priorities'].latest,
                                    sec),
                            "timing/update_priorities/mean":
                            default(ledger['timing/update_priorities'].mean,
                                    sec)
                        },
                                   silent=silent)
                except Exception as e:
                    pass
                if G.prioritized_replay:
                    logger.log(
                        t_step,
                        {"replay_beta": default(beta, lambda v: f"{v:.2f}")},
                        silent=silent)

            # note: learn here.
            if t_step >= G.learning_start and t_step % G.learn_interval == 0:
                if G.prioritized_replay:
                    experiences, weights, indices = replay_buffer.sample(
                        G.replay_batch_size, beta)
                    logger.log_histogram(t_step, weights=weights)
                else:
                    experiences, weights = replay_buffer.sample(
                        G.replay_batch_size), None
                M.tic('train', silent=True)
                x0s, actions, rewards, x1s, dones = zip(*experiences)
                td_error_val, loss_val = trainer.train(s0s=x0s,
                                                       actions=actions,
                                                       rewards=rewards,
                                                       s1s=x1s,
                                                       dones=dones,
                                                       sample_weights=weights)
                ledger['timing/train'].append(M.toc('train', silent=True))
                M.tic('log_histogram', silent=True)
                logger.log_histogram(t_step, td_error=td_error_val)
                ledger['timing/log_histogram'].append(
                    M.toc('log_histogram', silent=True))
                if G.prioritized_replay:
                    M.tic('update_priorities', silent=True)
                    new_priorities = np.abs(td_error_val) + eps
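                    # note: `eps` here is the exploration epsilon from the schedule above; a
                    # dedicated small constant (like the prioritized_replay_eps used in the deepq
                    # learn() further below) would avoid zero priorities when G.param_noise is set
                    # and eps is 0.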
                    replay_buffer.update_priorities(indices, new_priorities)
                    ledger['timing/update_priorities'].append(
                        M.toc('update_priorities', silent=True))

            if t_step % G.target_network_update_interval == 0:
                trainer.update_target()

            if t_step % Reporting.checkpoint_interval == 0:
                U.save_state(os.path.join(RUN.log_directory, RUN.checkpoint))
def learn(env,
          q_func,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

  Parameters
  -------
  env: pysc2.env.SC2Env
      environment to train on
  q_func: (tf.Variable, int, str, bool) -> tf.Variable
      the model that takes the following inputs:
          observation_in: object
              the output of observation placeholder
          num_actions: int
              number of actions
          scope: str
          reuse: bool
              should be passed to outer variable scope
      and returns a tensor of shape (batch_size, num_actions) with values of every action.
  lr: float
      learning rate for adam optimizer
  max_timesteps: int
      number of env steps to optimizer for
  buffer_size: int
      size of the replay buffer
  exploration_fraction: float
      fraction of entire training period over which the exploration rate is annealed
  exploration_final_eps: float
      final value of random action probability
  train_freq: int
      update the model every `train_freq` steps.
  batch_size: int
      size of a batch sampled from the replay buffer for training
  print_freq: int
      how often to print out training progress
      set to None to disable printing
  checkpoint_freq: int
      how often to save the model. This is so that the best version is restored
      at the end of the training. If you do not wish to restore the best version at
      the end of the training set this variable to None.
  learning_starts: int
      how many steps of the model to collect transitions for before learning starts
  gamma: float
      discount factor
  target_network_update_freq: int
      update the target network every `target_network_update_freq` steps.
  prioritized_replay: bool
      if True prioritized replay buffer will be used.
  prioritized_replay_alpha: float
      alpha parameter for prioritized replay buffer
  prioritized_replay_beta0: float
      initial value of beta for prioritized replay buffer
  prioritized_replay_beta_iters: int
      number of iterations over which beta will be annealed from initial value
      to 1.0. If set to None equals to max_timesteps.
  prioritized_replay_eps: float
      epsilon to add to the TD errors when updating priorities.
  num_cpu: int
      number of cpus to use for training
  callback: (locals, globals) -> None
      function called at every step with the state of the algorithm.
      If the callback returns True, training stops.

  Returns
  -------
  act: ActWrapper
      Wrapper over act function. Adds ability to save it and load it.
      See header of baselines/deepq/categorical.py for details on the act function.
  """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None

    obs = env.reset()
    # Select all marines first

    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    screen = player_relative

    obs = init(env, player_relative, obs)

    group_id = 0
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
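                    # e.g. with exploration.value(t) = 0.1 and num_actions = 3 this gives
                    # -log(1 - 0.1 + 0.1/3) = -log(0.9333...), which is about 0.069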
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            group_list = update_group_list(obs)

            if (check_group_list(env, obs)):
                obs = init(env, player_relative, obs)
                group_list = update_group_list(obs)

            # if(len(group_list) == 0):
            #   obs = init(env, player_relative, obs)
            #   group_list = update_group_list(obs)

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            i = 0

            friendly_y, friendly_x = (
                player_relative == _PLAYER_FRIENDLY).nonzero()

            enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero()

            danger_closest, danger_min_dist = None, None
            for e in zip(enemy_x, enemy_y):
                for p in zip(friendly_x, friendly_y):
                    dist = np.linalg.norm(np.array(p) - np.array(e))
                    if not danger_min_dist or dist < danger_min_dist:
                        danger_closest, danger_min_dist = p, dist

            marine_closest, marine_min_dist = None, None
            for e in zip(friendly_x, friendly_y):
                for p in zip(friendly_x, friendly_y):
                    dist = np.linalg.norm(np.array(p) - np.array(e))
                    if not marine_min_dist or dist < marine_min_dist:
                        if dist >= 2:
                            marine_closest, marine_min_dist = p, dist

            if (danger_min_dist != None and danger_min_dist <= 5):
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_POINT,
                                             [[0], danger_closest])
                ])

                selected = obs[0].observation["screen"][_SELECTED]
                player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
                if (len(player_y) > 0):
                    player = [int(player_x.mean()), int(player_y.mean())]

            elif (marine_closest != None and marine_min_dist <= 3):
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_POINT,
                                             [[0], marine_closest])
                ])

                selected = obs[0].observation["screen"][_SELECTED]
                player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
                if (len(player_y) > 0):
                    player = [int(player_x.mean()), int(player_y.mean())]

            else:

                # If no marine is in danger, select a random control group
                while (len(group_list) > 0):
                    # units = env._obs.observation.raw_data.units
                    # marine_list = []
                    # for unit in units:
                    #     if unit.alliance == 1:
                    #         marine_list.append(unit)

                    group_id = np.random.choice(group_list)
                    #xy = [int(unit.pos.y - 10), int(unit.pos.x+8)]
                    #print("check xy : %s - %s" % (xy, player_relative[xy[0],xy[1]]))
                    obs = env.step(actions=[
                        sc2_actions.FunctionCall(
                            _SELECT_CONTROL_GROUP,
                            [[_CONTROL_GROUP_RECALL], [group_id]])
                    ])

                    selected = obs[0].observation["screen"][_SELECTED]
                    player_y, player_x = (
                        selected == _PLAYER_FRIENDLY).nonzero()
                    if (len(player_y) > 0):
                        player = [int(player_x.mean()), int(player_y.mean())]
                        break
                    else:
                        group_list.remove(group_id)

            if (player[0] > 32):
                screen = shift(LEFT, player[0] - 32, screen)
            elif (player[0] < 32):
                screen = shift(RIGHT, 32 - player[0], screen)

            if (player[1] > 32):
                screen = shift(UP, player[1] - 32, screen)
            elif (player[1] < 32):
                screen = shift(DOWN, 32 - player[1], screen)

            action = act(np.array(screen)[None],
                         update_eps=update_eps,
                         **kwargs)[0]
            reset = False
            rew = 0

            new_action = None

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

            coord = [player[0], player[1]]
            enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero()

            closest, min_dist = None, None
            for p in zip(enemy_x, enemy_y):
                dist = np.linalg.norm(np.array(player) - np.array(p))
                if not min_dist or dist < min_dist:
                    closest, min_dist = p, dist

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            friendly_y, friendly_x = (
                player_relative == _PLAYER_FRIENDLY).nonzero()

            closest_friend, min_dist_friend = None, None
            for p in zip(friendly_x, friendly_y):
                dist = np.linalg.norm(np.array(player) - np.array(p))
                if not min_dist_friend or dist < min_dist_friend:
                    closest_friend, min_dist_friend = p, dist

            if (closest == None):

                new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

            elif (action == 0 and closest_friend != None
                  and min_dist_friend < 5):
                # A friendly marine is too close => spread out (make the formation sparse)

                diff = np.array(player) - np.array(closest_friend)

                norm = np.linalg.norm(diff)

                if (norm != 0):
                    diff = diff / norm

                coord = np.array(player) + diff * 3

                if (coord[0] < 0):
                    coord[0] = 0
                elif (coord[0] > 63):
                    coord[0] = 63

                if (coord[1] < 0):
                    coord[1] = 0
                elif (coord[1] > 63):
                    coord[1] = 63

                new_action = [
                    sc2_actions.FunctionCall(_MOVE_SCREEN,
                                             [_NOT_QUEUED, coord])
                ]

            elif (action <= 1):  #Attack

                # nearest enemy

                coord = closest

                new_action = [
                    sc2_actions.FunctionCall(_ATTACK_SCREEN,
                                             [_NOT_QUEUED, coord])
                ]

                #print("action : %s Attack Coord : %s" % (action, coord))

            elif (action == 2):  # Opposite direction from enemy

                # nearest enemy opposite

                diff = np.array(player) - np.array(closest)

                norm = np.linalg.norm(diff)

                if (norm != 0):
                    diff = diff / norm

                coord = np.array(player) + diff * 3

                if (coord[0] < 0):
                    coord[0] = 0
                elif (coord[0] > 63):
                    coord[0] = 63

                if (coord[1] < 0):
                    coord[1] = 0
                elif (coord[1] > 63):
                    coord[1] = 63

                new_action = [
                    sc2_actions.FunctionCall(_MOVE_SCREEN,
                                             [_NOT_QUEUED, coord])
                ]

            elif (action == 4):  #UP
                coord = [player[0], player[1] - 3]
                new_action = [
                    sc2_actions.FunctionCall(_MOVE_SCREEN,
                                             [_NOT_QUEUED, coord])
                ]

            elif (action == 5):  #DOWN
                coord = [player[0], player[1] + 3]
                new_action = [
                    sc2_actions.FunctionCall(_MOVE_SCREEN,
                                             [_NOT_QUEUED, coord])
                ]

            elif (action == 6):  #LEFT
                coord = [player[0] - 3, player[1]]
                new_action = [
                    sc2_actions.FunctionCall(_MOVE_SCREEN,
                                             [_NOT_QUEUED, coord])
                ]

            elif (action == 7):  #RIGHT
                coord = [player[0] + 3, player[1]]
                new_action = [
                    sc2_actions.FunctionCall(_MOVE_SCREEN,
                                             [_NOT_QUEUED, coord])
                ]

                #print("action : %s Back Coord : %s" % (action, coord))
            army_count = env._obs.observation.player_common.army_count

            try:
                if army_count > 0 and _ATTACK_SCREEN in obs[0].observation[
                        "available_actions"]:
                    obs = env.step(actions=new_action)
                else:
                    new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                    obs = env.step(actions=new_action)
            except Exception as e:
                #print(e)
                pass  # Do nothing

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = player_relative

            rew += obs[0].reward

            done = obs[0].step_type == environment.StepType.LAST

            selected = obs[0].observation["screen"][_SELECTED]
            player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
            if (len(player_y) > 0):
                player = [int(player_x.mean()), int(player_y.mean())]

            if (player[0] > 32):
                new_screen = shift(LEFT, player[0] - 32, new_screen)
            elif (player[0] < 32):
                new_screen = shift(RIGHT, 32 - player[0], new_screen)

            if (player[1] > 32):
                new_screen = shift(UP, player[1] - 32, new_screen)
            elif (player[1] < 32):
                new_screen = shift(DOWN, 32 - player[1], new_screen)

            # Store transition in the replay buffer.
            replay_buffer.add(screen, action, rew, new_screen, float(done))
            screen = new_screen

            episode_rewards[-1] += rew

            if done:
                print("Episode Reward : %s" % episode_rewards[-1])
                obs = env.reset()
                player_relative = obs[0].observation["screen"][
                    _PLAYER_RELATIVE]

                screen = player_relative

                group_list = init(env, player_relative, obs)

                # Select all marines first
                #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
                episode_rewards.append(0.0)

                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act)
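
The nearest-enemy and nearest-friend searches around the selected marine loop over every pixel returned by nonzero() in Python. A vectorized alternative with numpy broadcasting (an illustrative sketch; nearest_point is not part of the example above) computes the same result in one pass:

import numpy as np

def nearest_point(origin_xy, xs, ys):
    """Return (closest (x, y), distance) among the points (xs, ys), or (None, None) if empty."""
    if len(xs) == 0:
        return None, None
    pts = np.stack([xs, ys], axis=1).astype(np.float64)
    dists = np.linalg.norm(pts - np.asarray(origin_xy, dtype=np.float64), axis=1)
    i = int(np.argmin(dists))
    return (int(pts[i, 0]), int(pts[i, 1])), float(dists[i])
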
Example no. 23
0
def create_session(num_cpu=None):
    U.make_session(num_cpu=num_cpu).__enter__()
Example no. 24
0
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.deepq.utils import ObservationInput
from baselines.common.schedules import LinearSchedule


def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session(8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
import gym
import tensorflow as tf
from baselines.common import set_global_seeds, tf_util as U
from baselines.ppo1 import mlp_policy, pposgd_simple

env = gym.make("MountainCarContinuous-v0")

def policy_fn(name, ob_space, ac_space):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
    hid_size=64, num_hid_layers=2)

# define the policy
pi = policy_fn('pi', env.observation_space, env.action_space)

# Define a TF session and restore graph
sess = U.make_session(num_cpu=1)
sess.__enter__()

# Load the previous trained graph
tf.train.Saver().restore(sess, '/tmp/experiments/continuous/PPO/models/TimeLimit_afterIter_80.model')
# tf.train.Saver().restore(sess, '/tmp/experiments/continuous/PPO/models/TimeLimit_afterIter_24.model')

env.render()
while True:
    obs, done = env.reset(), False
    episode_rew = 0
    while not done:
        env.render()
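        # MlpPolicy.act(stochastic, ob) returns (action, predicted value); index 0 keeps the action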
        obs, rew, done, _ = env.step(pi.act(True, obs)[0])
        episode_rew += rew
    print("Episode reward", episode_rew)
Example no. 26
0
def enjoy(env_id, seed):

    with tf.device('/cpu'):
        sess = U.make_session(num_cpu=1)
        sess.__enter__()

        env = gym.make(env_id)
        #env = gym.make('Mujoco-planar-snake-cars-cam-v1')
        #env = gym.make('Mujoco-planar-snake-cars-cam-dist-zigzag-v1')
        #env = gym.make('Mujoco-planar-snake-cars-cam-dist-random-v1')
        #env = gym.make('Mujoco-planar-snake-cars-cam-dist-line-v1')
        #env = gym.make('Mujoco-planar-snake-cars-cam-dist-circle-v1')
        #env = gym.make('Mujoco-planar-snake-cars-cam-dist-wave-v1')

        check_for_new_models = True

        # more steps
        #env._max_episode_steps = env.spec.max_episode_steps * 3
        #obs = env.reset()

        max_timesteps = 3000000

        # model_index = 254 # 251 # best

        # Select model file .....

        #check_for_new_models = False
        #
        # modelverion_in_k_ts = 2000
        modelverion_in_k_ts = 3000  # good
        modelverion_in_k_ts = 2510  # better

        model_index = int(max_timesteps / 1000 / 10 - modelverion_in_k_ts / 10)
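        # with max_timesteps = 3,000,000 and modelverion_in_k_ts = 2510 this is 300 - 251 = 49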

        # TODO: last saved model
        model_index = 0

        print("actionspace", env.action_space)
        print("observationspace", env.observation_space)

        gym.logger.setLevel(logging.WARN)

        # init load
        model_dir = get_model_dir(env_id, 'ppo')
        model_files = get_model_files(model_dir)
        #model_file = get_latest_model_file(model_dir)
        print('available models: ', len(model_files))
        model_file = model_files[model_index]
        #model_file = model_files[75]
        logger.log("load model_file: %s" % model_file)

        sum_info = None
        pi = policy_fn('pi', env.observation_space, env.action_space)

        while True:
            # run one episode

            # TODO specify target velocity
            # only takes effect in angle envs
            #env.unwrapped.metadata['target_v'] = 0.05
            env.unwrapped.metadata['target_v'] = 0.15
            #env.unwrapped.metadata['target_v'] = 0.25

            #env._max_episode_steps = env._max_episode_steps * 3

            done, number_of_timesteps, info_collector = run_environment_episode(
                env,
                pi,
                seed,
                model_file,
                env._max_episode_steps,
                render=True,
                stochastic=False)

            info_collector.episode_info_print()

            check_model_file = get_latest_model_file(model_dir)
            if check_model_file != model_file and check_for_new_models:
                model_file = check_model_file
                logger.log('loading new model_file %s' % model_file)

            print('timesteps: %d, info: %s' %
                  (number_of_timesteps, str(sum_info)))
            """
Example no. 27
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):

        sess = tf_util.make_session()
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example no. 28
0
def get_task_name(args):
    task_name = args.algo + "_gail."
    if args.pretrained:
        task_name += "with_pretrained."
    if args.traj_limitation != np.inf:
        task_name += "transition_limitation_%d." % args.traj_limitation
    task_name += args.env_id.split("-")[0]
    task_name = task_name + ".g_step_" + str(args.g_step) + ".d_step_" + str(args.d_step) + \
                ".policy_entcoeff_" + str(args.policy_entcoeff) + ".adversary_entcoeff_" + str(args.adversary_entcoeff)
    task_name += ".seed_" + str(args.seed)
    return task_name
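
# For illustration (hypothetical argument values): algo='trpo', env_id='Hopper-v2',
# pretrained=False, traj_limitation=np.inf, g_step=3, d_step=1, policy_entcoeff=0,
# adversary_entcoeff=0.001 and seed=0 would yield
# 'trpo_gail.Hopper.g_step_3.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0'.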


args = argsparser()
U.make_session(num_cpu=1).__enter__()
set_global_seeds(args.seed)
env = gym.make(args.env_id)


def policy_fn(name, ob_space, ac_space, reuse=False):
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)


env = bench.Monitor(env, logger.get_dir() and
                    osp.join(logger.get_dir(), "monitor.json"))
env.seed(args.seed)
gym.logger.setLevel(logging.WARN)
task_name = get_task_name(args)
args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
Example no. 29
0
    subdir = (datetime.datetime.now()
              ).strftime("%m-%d-%Y-%H:%M:%S") + " " + args.comment
    tf_writer = tf.summary.FileWriter(os.path.join(args.log_dir, subdir),
                                      tf.get_default_graph())
    value_summary = tf.Summary()
    qec_summary = tf.Summary()
    value_summary.value.add(tag='discount_reward_mean')
    value_summary.value.add(tag='non_discount_reward_mean')
    # value_summary.value.add(tag='episode')

    qec_summary.value.add(tag='qec_mean')
    qec_summary.value.add(tag='qec_fount')
    value_summary.value.add(tag='steps')
    value_summary.value.add(tag='episodes')

    with U.make_session(4) as sess:
        # EMDQN

        ec_buffer = []
        buffer_size = int(1000000 / env.action_space.n)
        # input_dim = 1024
        for i in range(env.action_space.n):
            ec_buffer.append(
                LRU_KNN_UCB(buffer_size,
                            args.latent_dim,
                            'game',
                            mode=args.mode))
        # rng = np.random.RandomState(123456)  # deterministic, erase 123456 for stochastic
        # rp = rng.normal(loc=0, scale=1. / np.sqrt(latent_dim), size=(latent_dim, input_dim))
        qecwatch = []
        update_counter = 0
def train(env_id, max_iter, inner_iter, seed, skilldim, tasknum, warmstart,
          mirror, dyn_params):
    from policy_transfer.meta_strategy_optimization import ars_mso
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed + MPI.COMM_WORLD.Get_rank())

    env = gym.make(env_id)

    env.env.param_manager.activated_param = dyn_params
    env.env.param_manager.controllable_param = dyn_params

    if hasattr(env.env, 'obs_perm') and skilldim > 0:
        cur_perm = env.env.obs_perm
        beginid = len(cur_perm)
        obs_perm_base = np.concatenate(
            [cur_perm, np.arange(beginid, beginid + skilldim)])
        env.env.obs_perm = obs_perm_base
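        # e.g. an existing cur_perm of [1, 0, 2] with skilldim = 2 gives
        # obs_perm_base = [1, 0, 2, 3, 4]: the skill dimensions are appended with an
        # identity permutation (illustrative values)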

    with open(logger.get_dir() + "/envinfo.txt", "w") as text_file:
        text_file.write(str(env.env.__dict__))

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    def policy_mirror_fn(name, ob_space, ac_space):
        return MirrorPolicy(name=name,
                            ob_space=ob_space,
                            ac_space=ac_space,
                            hid_size=64,
                            num_hid_layers=2,
                            observation_permutation=env.env.env.obs_perm,
                            action_permutation=env.env.env.act_perm,
                            soft_mirror=False)

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    with open(logger.get_dir() + "/config.txt", "w") as text_file:
        text_file.write(str(locals()))

    if hasattr(env.env.env, "param_manager"):
        with open(logger.get_dir() + "/params.txt", "w") as text_file:
            text_file.write(str(env.env.env.param_manager.__dict__))

    env.seed(seed + MPI.COMM_WORLD.Get_rank())

    gym.logger.setLevel(logging.WARN)

    pol_func = policy_fn
    if mirror:
        pol_func = policy_mirror_fn

    if len(warmstart) != 0:
        if 'pickle' in warmstart:
            warstart_params = pickle.load(open(warmstart, 'rb'))
        else:
            warstart_params = joblib.load(warmstart)
    else:
        warstart_params = None

    ars_mso.ars_optimize(
        env,
        pol_func,
        perturb_mag=0.02,
        learning_rate=0.005,
        eval_epoch=1,
        params_per_thread=8,
        top_perturb=8,
        maxiter=max_iter,
        callback=callback,
        init_policy_params=warstart_params,
        skilldim=skilldim,
        task_num=tasknum,
        inner_iters=inner_iter,
    )

    env.close()
Example no. 31
0
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.deepq.utils import ObservationInput
from baselines.common.schedules import LinearSchedule


def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session(num_cpu=8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
Example no. 32
0
def train_mirror_sig(env, num_timesteps, seed, obs_perm, act_perm):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)


    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                                 hid_size=64, num_hid_layers=3, gmm_comp=1,
                                                 mirror_loss=True,
                                                 observation_permutation=obs_perm,
                                                 action_permutation=act_perm)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)
    env.seed(seed+MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    previous_params = None
    iter_num = 0
    last_iter = False

    # if initialize from previous runs
    #previous_params = joblib.load('')
    #env.env.env.assist_schedule = []

    joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)

    reward_threshold = None
    while True:
        if not last_iter:
            rollout_length_thershold = env.env.env.assist_schedule[2][0] / env.env.env.dt
        else:
            rollout_length_thershold = None
        opt_pi, rew = pposgd_mirror.learn(env, policy_fn,
                max_timesteps=num_timesteps,
                timesteps_per_batch=int(2500),
                clip_param=0.2, entcoeff=0.0,
                optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                gamma=0.99, lam=0.95, schedule='linear',
                callback=callback,
                sym_loss_weight=4.0,
                positive_rew_enforce=False,
                init_policy_params = previous_params,
                reward_drop_bound=500,
                rollout_length_thershold = rollout_length_thershold,
                policy_scope='pi' + str(iter_num),
                return_threshold = reward_threshold,
            )
        if iter_num == 0:
            reward_threshold = 0.7 * rew
        if last_iter:
            break
        iter_num += 1

        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val
        # update the assist schedule
        for s in range(len(env.env.env.assist_schedule)-1):
            env.env.env.assist_schedule[s][1] = np.copy(env.env.env.assist_schedule[s+1][1])
        env.env.env.assist_schedule[-1][1][0] *= 0.75
        env.env.env.assist_schedule[-1][1][1] *= 0.75
        if env.env.env.assist_schedule[-1][1][0] < 5.0:
            env.env.env.assist_schedule[-1][1][0] = 0.0
        if env.env.env.assist_schedule[-1][1][1] < 5.0:
            env.env.env.assist_schedule[-1][1][1] = 0.0
        zero_assist = True
        for s in range(len(env.env.env.assist_schedule)-1):
            for v in env.env.env.assist_schedule[s][1]:
                if v != 0.0:
                    zero_assist = False
        print('Current Schedule: ', env.env.env.assist_schedule)
        if zero_assist:
            last_iter = True
            print('Entering Last Iteration!')

    env.close()
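# Added illustration (not from the original code): the assist-schedule update in
# train_mirror_sig above shifts each stage's assist magnitudes forward by one
# stage, decays the last stage by 0.75, and zeroes values that fall below 5.0.
# The toy schedule below is hypothetical; only the update rule mirrors the code.
import numpy as np

def _decay_assist_schedule(schedule):
    # schedule: list of [start_time, [assist_x, assist_y]] entries
    for s in range(len(schedule) - 1):
        schedule[s][1] = np.copy(schedule[s + 1][1])
    schedule[-1][1][0] *= 0.75
    schedule[-1][1][1] *= 0.75
    for k in range(2):
        if schedule[-1][1][k] < 5.0:
            schedule[-1][1][k] = 0.0
    return schedule

# Example: three stages with decreasing assist forces.
toy_schedule = [[0.0, np.array([200.0, 200.0])],
                [3.0, np.array([100.0, 100.0])],
                [6.0, np.array([50.0, 50.0])]]
print(_decay_assist_schedule(toy_schedule))
# -> stage assists become [100, 100], [50, 50], [37.5, 37.5]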
Exemplo n.º 33
0
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial value
        to 1.0. If set to None, defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            reset = False
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
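# Added usage sketch (assumption, not part of the original snippet): with a
# q_func such as the two-layer `model` from the CartPole examples above, this
# learn() would typically be driven like so; the hyperparameter values are
# illustrative.
if __name__ == '__main__':
    env = gym.make("CartPole-v0")
    act = learn(env,
                q_func=model,              # any (obs_in, num_actions, scope, reuse) -> Q-values net
                lr=5e-4,
                max_timesteps=100000,
                buffer_size=50000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                print_freq=10)
    act.save("cartpole_model.pkl")         # ActWrapper adds save/load, per the docstring above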
Exemplo n.º 34
0
def evaluate_target_tracking(env_id):

    # There is a bug somewhere: one run always fails, so run everything with two seeds.
    seed = [1, 2]
    max_timesteps = 3000000

    # model select
    #
    #modelverion_in_k_ts = 2000
    modelverion_in_k_ts = 3000  # good
    modelverion_in_k_ts = 2510  # better
    model_index = int(max_timesteps / 1000 / 10 - modelverion_in_k_ts / 10)

    # TODO: last saved model
    model_index = 0  # comment this line out to select the model by modelverion_in_k_ts

    # envs
    eval_env_id = [
        'Mujoco-planar-snake-cars-cam-dist-line-v1',
        'Mujoco-planar-snake-cars-cam-dist-wave-v1',
        'Mujoco-planar-snake-cars-cam-dist-zigzag-v1',
        'Mujoco-planar-snake-cars-cam-dist-random-v1',
    ]

    grid = ParameterGrid(param_grid={'eval_env_id': eval_env_id, 'seed': seed})
    paras = list(grid)

    render = False

    info_dict_collector = InfoDictCollector(None)

    # init load
    model_dir = get_model_dir(env_id, 'ppo')
    model_files = get_model_files(model_dir)
    model_file = model_files[model_index]
    # model_file = model_files[75]
    logger.log("load model_file: %s" % model_file)

    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    gym.logger.setLevel(logging.WARN)

    env = gym.make(env_id)
    pi = policy_fn('pi', env.observation_space, env.action_space)
    env.close()

    with tf.device('/cpu'):

        for i, para in enumerate(paras):
            eval_env_id = para['eval_env_id']
            seed = int(para['seed'])

            env = gym.make(eval_env_id)

            # evaluate for 3x the default episode length (3000 timesteps)
            env._max_episode_steps = env._max_episode_steps * 3


            done, number_of_timesteps, info_collector = \
                run_environment_episode(env, pi, seed, model_file, env._max_episode_steps, render, stochastic=False)

            print('run {}/{} para: {}, timesteps: {}'.format(
                i, len(paras), para, number_of_timesteps))

            info_dict_collector.add_info_collector(info_collector)

            env.close()

    modelversion = modelverion_in_k_ts
    info_dict_collector.following_eval_save(modelversion)

    # plot
    import_plots.evaluate_target_tracking()
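# Added note (worked example, not in the original code): with max_timesteps =
# 3,000,000 and a checkpoint every 10k steps, model_files holds 300 checkpoints
# ordered newest-first, so
#     model_index = 3_000_000 / 1000 / 10 - modelverion_in_k_ts / 10
#                 = 300 - 251 = 49        for modelverion_in_k_ts = 2510,
# while the later `model_index = 0` line simply overrides this and picks the
# most recently saved model. The 10k-step checkpoint interval and newest-first
# ordering are assumptions about get_model_files().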
Exemplo n.º 35
0
from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule


def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session(8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
Exemplo n.º 36
0
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.deepq.utils import ObservationInput
from baselines.common.schedules import LinearSchedule


def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session(num_cpu=8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
Exemplo n.º 37
0
def train_mirror(args, num_timesteps):
    from baselines.ppo1 import mlp_mirror_policy, mlp_mirror_norms_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env)

    env.env._seed(args.seed + MPI.COMM_WORLD.Get_rank())
    env.env.init_params(args)

    U.ALREADY_INITIALIZED = set()
    U.ALREADY_INITIALIZED.update(set(tf.global_variables()))

    obs_per = np.array([
        0.0001, -1, 2, -3, -4, 11, 12, 13, 14, 15, 16, 5, 6, 7, 8, 9, 10, -17,
        18, -19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, -30, 31, -32, -33, 40,
        41, 42, 43, 44, 45, 34, 35, 36, 37, 38, 39, -46, 47, -48, 53, 54, 55,
        56, 49, 50, 51, 52
    ])

    if env.env.include_additional_info:
        obs_per = np.concatenate((obs_per, np.array([58, 57])))
        obs_per = np.concatenate((obs_per, np.array([59])))
        obs_per = np.concatenate((obs_per, np.array([63, 64, -65, 60, 61,
                                                     -62])))
        obs_per = np.concatenate((obs_per, np.array([66, 67, -68])))
        obs_per = np.concatenate((obs_per, np.array([72, 73, -74, 69, 70,
                                                     -71])))
        obs_per = np.concatenate((obs_per, np.array([75, 76, -77])))
        obs_per = np.concatenate((obs_per, np.array([78, 79, -80])))
        assert env.env.obs_dim == (57 + 3 + 3 * 6 + 3)
        assert env.env.act_dim == 21  # change action/state permutation if change action/state in env

    def policy_fn(name, ob_space, ac_space):
        if env.env.env.state_self_standardize:
            return mlp_mirror_norms_policy.MlpMirrorNormsPolicy(
                name=name,
                ob_space=ob_space,
                ac_space=ac_space,
                hid_size=args.hsize,
                num_hid_layers=args.layers,
                gmm_comp=1,
                mirror_loss=True,
                observation_permutation=obs_per,
                action_permutation=np.array([
                    5, 6, 7, 8, 9, 0.0001, 1, 2, 3, 4, -10, 11, -12, 17, 18,
                    19, 20, 13, 14, 15, 16
                ]))
        else:
            return mlp_mirror_policy.MlpMirrorPolicy(
                name=name,
                ob_space=ob_space,
                ac_space=ac_space,
                hid_size=args.hsize,
                num_hid_layers=args.layers,
                gmm_comp=1,
                mirror_loss=True,
                observation_permutation=obs_per,
                action_permutation=np.array([
                    5, 6, 7, 8, 9, 0.0001, 1, 2, 3, 4, -10, 11, -12, 17, 18,
                    19, 20, 13, 14, 15, 16
                ]))

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    joblib.dump(str(env.env.env.__dict__),
                logger.get_dir() + '/env_specs.pkl',
                compress=True)
    with open(logger.get_dir() + '/env_specs.txt', 'w') as f:
        pprint.pprint(env.env.env.__dict__, f)
    shutil.copyfile(env.env.env.model_file_name,
                    logger.get_dir() + '/using_model.skel')

    cur_sym_loss = 3.0
    iter_num = 0
    previous_params = None
    # previous_params = joblib.load('')
    reward_threshold = None
    rollout_length_threshold = None
    pposgd_mirror.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=int(2000),
        clip_param=args.clip,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        callback=callback,
        sym_loss_weight=cur_sym_loss,
        init_policy_params=previous_params,
        reward_drop_bound=None,
        rollout_length_threshold=rollout_length_threshold,
        policy_scope='pi' + str(iter_num),
        return_threshold=reward_threshold,
    )

    env.close()
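# Added illustration (an assumption about how MlpMirrorPolicy consumes these
# vectors, not taken from the snippet): each permutation entry names a source
# index, a negative sign means "negate that component after permuting", and the
# 0.0001 entries stand in for index 0 with a positive sign (since -0 cannot be
# written). apply_signed_permutation below is a hypothetical helper.
import numpy as np

def apply_signed_permutation(vec, perm):
    idx = np.abs(np.round(perm)).astype(int)   # which source component to read
    sign = np.where(perm < 0, -1.0, 1.0)       # whether to flip its sign
    return sign * vec[idx]

# Toy 3-D "observation": keep component 0, swap 1 and 2, negating the new component 1.
print(apply_signed_permutation(np.array([1.0, 2.0, 3.0]),
                               np.array([0.0001, -2, 1])))   # -> [ 1. -3.  2.]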
Exemplo n.º 38
0
    """
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train DQN on cartpole using a custom mlp")
    parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering")
    parser.add_argument('--max-timesteps', default=50000, type=int,
                        help="Maximum number of timesteps when not rendering")
    args = parser.parse_args()

    with tf_utils.make_session(8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
Exemplo n.º 39
0
    return act.copy()


def observation_wrapper(obs):
    pov = obs['pov'].astype(np.float32) / 255.0 - 0.5
    #compass = obs['compassAngle']

    #compass_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*compass
    #compass_channel /= 180.0

    #return np.concatenate([pov, compass_channel], axis=-1)
    return pov


if __name__ == '__main__':
    with U.make_session(32):
        # Create the environment
        env = gym.make("MineRLTreechop-v0")
        spaces = env.observation_space.spaces['pov']
        shape = list(spaces.shape)
        #shape[-1] += 1

        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(shape, name=name),
            q_func=model,
            num_actions=5,
            gamma=0.99,
            optimizer=tf.train.AdamOptimizer(learning_rate=1e-3),
        )
        # Create the replay buffer
Exemplo n.º 40
0
def main():

    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True

    FLAGS(sys.argv)
    # steps_left = FLAGS.timesteps

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif (FLAGS.algorithm == "BicNet"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(
            screen=32, minimap=32
        ),  # feature_dimensions=sc2_env.Dimensions(screen=84, minimap=64); process both into 32x32 matrices
        use_feature_units=True)

    lr = FLAGS.lr
    buffer_size = 60000  # 50000; reduce this, aim for roughly 1/10 of the training timesteps; tried 70000, test 200, 70000
    batch_size = 32  # 32
    gamma = 0.99
    num_agents = 2  #9
    vector_obs_len = 736  #33   #4096  # 32*32  1024
    output_len = 4  #3

    hidden_vector_len = 128  #128   #1
    tau = 0.001
    # stddev = 0.1

    sess = U.make_session()
    sess.__enter__()
    actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents,
                            vector_obs_len, output_len, hidden_vector_len)
    critic = tb.CriticNetwork(sess, lr, tau, gamma,
                              actor.get_num_trainable_vars(), num_agents,
                              vector_obs_len, output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(1), sigma=float(stddev) * np.ones(1))
    action_noise = noise_OU.OU_noise(decay_period=FLAGS.timesteps -
                                     buffer_size)

    # while(steps_left > 0):
    with sc2_env.SC2Env(
            map_name="CollectMineralShards",  #DefeatZerglingsAndBanelings
            # step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=False,  #True
            game_steps_per_episode=steps * step_mul) as env:

        learn(
            env,
            sess=sess,
            max_timesteps=FLAGS.timesteps,
            train_freq=1,
            save_freq=10000,
            target_network_update_freq=1,  #1000
            gamma=gamma,
            # callback=BicNet_callback,
            actor=actor,
            critic=critic,
            replay_buffer=replay_buffer,
            num_agents=num_agents,
            action_noise=action_noise,
            output_len=output_len,
            num_exploring=buffer_size  #buffer_size
        )
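# Added sketch (generic, not the project's noise_OU.OU_noise implementation):
# Ornstein-Uhlenbeck exploration noise, as commonly used with continuous-action
# actor-critic agents like the BicNet actor above. The theta/sigma/dt values
# are illustrative defaults.
import numpy as np

class OrnsteinUhlenbeckNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, dt=1.0):
        self.mu = mu * np.ones(size)
        self.theta, self.sigma, self.dt = theta, sigma, dt
        self.reset()

    def reset(self):
        # Restart the process at its mean.
        self.x = np.copy(self.mu)

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.x.shape))
        return self.x

# e.g. noise = OrnsteinUhlenbeckNoise(size=output_len); action = action + noise()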
Exemplo n.º 41
0
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch,
          dc):
    from baselines.ppo15 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    from gym.envs.registration import register
    # Potential Pendulum Env
    if (True):
        register(
            id='Pendulumnf-v0',
            entry_point='nfunk.envs_nf.pendulum_nf:PendulumEnv',
            max_episode_steps=400,
            #kwargs = vars(args),
        )
        env = gym.make('Pendulumnf-v0')
    # Potential Scalar Env
    if (False):
        register(
            id='Scalarnf-v0',
            entry_point='nfunk.envs_nf.gym_scalar_nf:GymScalarEnv',
            max_episode_steps=400,
            #kwargs = vars(args),
        )
        env = gym.make('Scalarnf-v0')
    if (False):
        env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=32,
                                    num_hid_layers=2,
                                    num_options=num_options,
                                    dc=dc)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    if num_options == 1:
        optimsize = 64
    elif num_options == 2:
        optimsize = 32
    else:
        print("Only two options or primitive actions is currently supported.")
        sys.exit()

    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-5,
                        optim_batchsize=optimsize,
                        gamma=0.99,
                        lam=0.95,
                        schedule='constant',
                        num_options=num_options,
                        app=app,
                        saves=saves,
                        wsaves=wsaves,
                        epoch=epoch,
                        seed=seed,
                        dc=dc)
    env.close()
Exemplo n.º 42
0
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch,
          dc, path, render, official, orig_ppo):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    # Episode len determines the length of the rollout. Remember: length of 20 means 1s.
    episode_len = 400
    from gym.envs.registration import register
    # add the current path to the repo -> we are loading exactly the repo it has been trained on!!!
    sys.path.append(path)
    print(sys.path)
    from src_code import mlp_policy
    # Depending on the environment argument, the right environment is selected
    if (env_id == 'Pendulumnf-v0'):
        register(
            id='Pendulumnf-v0',
            entry_point='src_code.pendulum_nf:PendulumEnv',
            max_episode_steps=episode_len,
            #kwargs = vars(args),
        )
        env = gym.make('Pendulumnf-v0')
    # Potential Scalar Env
    elif (env_id == 'Scalarnf-v0'):
        register(
            id='Scalarnf-v0',
            entry_point='src_code.gym_scalar_nf:GymScalarEnv',
            max_episode_steps=episode_len,
            #kwargs = vars(args),
        )
        env = gym.make('Scalarnf-v0')
    elif (env_id == 'CartPole-v9'):
        register(
            id='CartPole-v9',
            entry_point='src_code.cartpole:CartPoleEnv',
            max_episode_steps=episode_len,
            #kwargs = vars(args),
        )
        env = gym.make('CartPole-v9')
    else:
        env = gym.make(env_id)

    # Create the policies needed

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2,
                                    num_options=num_options,
                                    dc=dc)  #was 64,32 or 15

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))

    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    if num_options == 1:
        optimsize = 64
    elif num_options == 2:
        optimsize = 32
    else:
        print("Only two options or primitive actions is currently supported.")
        sys.exit()

    # Start the visualization script
    visual.learn(env,
                 policy_fn,
                 max_timesteps=num_timesteps,
                 timesteps_per_batch=2048,
                 clip_param=0.2,
                 entcoeff=0.0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=optimsize,
                 gamma=0.99,
                 lam=0.95,
                 schedule='constant',
                 num_options=num_options,
                 app=app,
                 saves=saves,
                 wsaves=wsaves,
                 epoch=epoch,
                 seed=seed,
                 dc=dc,
                 episode_len=episode_len,
                 path=path,
                 render=render,
                 official=official,
                 orig_ppo=orig_ppo)
    env.close()
Exemplo n.º 43
0
    else:
        container = None
    # Create and seed the env.
    env, monitored_env = make_env(args.env)
    if args.seed > 0:
        set_global_seeds(args.seed)
        env.unwrapped.seed(args.seed)

    if args.gym_monitor and savedir:
        env = gym.wrappers.Monitor(env, os.path.join(savedir, 'gym_monitor'), force=True)

    if savedir:
        with open(os.path.join(savedir, 'args.json'), 'w') as f:
            json.dump(vars(args), f)

    with U.make_session(4) as sess:
        # Create training graph and replay buffer
        def model_wrapper(img_in, num_actions, scope, **kwargs):
            actual_model = dueling_model if args.dueling else model
            return actual_model(img_in, num_actions, scope, layer_norm=args.layer_norm, **kwargs)
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
            q_func=model_wrapper,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4),
            gamma=0.99,
            grad_norm_clipping=10,
            double_q=args.double_q,
            param_noise=args.param_noise
        )
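# Added sketch (the standard dueling head; the snippet's actual dueling_model is
# defined elsewhere in the script): Q-values are decomposed into a state value
# V(s) plus mean-centred advantages A(s, a). tf / layers are assumed to be
# imported as in the surrounding snippets.
def dueling_head(features, num_actions, scope, reuse=False):
    with tf.variable_scope(scope, reuse=reuse):
        value = layers.fully_connected(features, num_outputs=256, activation_fn=tf.nn.relu)
        value = layers.fully_connected(value, num_outputs=1, activation_fn=None)
        adv = layers.fully_connected(features, num_outputs=256, activation_fn=tf.nn.relu)
        adv = layers.fully_connected(adv, num_outputs=num_actions, activation_fn=None)
        # Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
        return value + adv - tf.reduce_mean(adv, axis=1, keepdims=True)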
Exemplo n.º 44
0
def main(_):
    print("Used flags:", FLAGS)
    config = configparser.ConfigParser()
    config.read(FLAGS.config_file)
    timer = time.time()

    ps_hosts = FLAGS.ps_hosts.split(",") if FLAGS.ps_hosts else config.get(FLAGS.config, 'ps_hosts').split(",")
    worker_hosts = FLAGS.worker_hosts.split(",") if FLAGS.worker_hosts else config.get(FLAGS.config, 'worker_hosts').split(",")
    job = FLAGS.job_name
    task = FLAGS.task_index
    learning_rate = config.getfloat(FLAGS.config, 'learning_rate')
    batch_size = config.getint(FLAGS.config, 'batch_size')
    memory_size = config.getint(FLAGS.config, 'memory_size')
    target_update = config.getint(FLAGS.config, 'target_update')
    seed = FLAGS.seed if FLAGS.seed else config.getint(FLAGS.config, 'seed')
    max_comm_rounds = config.getint(FLAGS.config, 'comm_rounds')
    epochs = config.getint(FLAGS.config, 'start_epoch')
    end_epoch = config.getint(FLAGS.config, 'end_epoch')
    epoch_decay = config.getint(FLAGS.config, 'epoch_decay')
    # epoch_decay_rate = (epochs - end_epoch) / epoch_decay
    epoch = LinearSchedule(epoch_decay, end_epoch, epochs)
    backup = config.getint(FLAGS.config, 'backup')  # unused in async
    sync = config.getboolean(FLAGS.config, 'sync')
    gradient_prio = False if not sync else config.getboolean(FLAGS.config, 'gradient_prio')
    sync_workers = len(worker_hosts)-backup
    mute = FLAGS.mute if FLAGS.mute else config.getboolean(FLAGS.config, 'mute')
    animate = 0
    draw = 0

    print("Config:\nps_hosts={}\nworker_hosts={}\njob_name={}\ntask_index={}\nlearning_rate={}\n"
          "batch_size={}\nmemory_size={}\ntarget_update={}\nseed={}\ncomm_rounds={}\nepochs={}\n"
          "end_epoch={}\nepoch_decay={}\nnbackup={}\nsync={}"
          .format(ps_hosts, worker_hosts, job, task, learning_rate, batch_size, memory_size, target_update,
                  seed, max_comm_rounds, epochs, end_epoch, epoch_decay, backup, sync))

    cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
    chief = True if job == 'worker' and task == 0 else False
    print("/job:", job, "/task:", task, " - Chief: ", chief, sep='')

    # Create server
    server = tf.train.Server(cluster, job_name=job, task_index=task)

    run_code = "{}-{}-p-{}-w-{}-E-{}-b-{}-m-{}-N-{}-lr-{}-B-{}-s-{}-".\
        format(datetime.now().strftime("%y%m%d-%H%M%S"), env_name, len(ps_hosts), len(worker_hosts),
               epochs, batch_size, memory_size, target_update, learning_rate, backup, seed)
    run_code += "-sync" if sync else "-async"

    # Set a unique random seed for each client
    seed = ((seed * 10) + task)
    random.seed(seed)

    if not mute:
        print("Run code:", run_code)

    # Start parameter servers
    if job == 'ps':
        server.join()

    # Start training
    with U.make_session(num_cpu=4, target=server.target) as sess:
        # Create the environment
        env = gym.make(env_name)
        env.seed(seed)
        tf.set_random_seed(seed)

        # Create all the functions necessary to train the model
        act, train, global_opt,  update_target, update_weights, sync_opt, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
            # optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
            chief=chief,
            server=server,
            workers=sync_workers
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(memory_size)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

        if not chief:
            if not mute:
                print("Worker {}/{} will sleep (3s) for chief to initialize variables".format(task+1, len(worker_hosts)))
            time.sleep(4)

        # Initialize the parameters and copy them to the target network.
        U.initialize(chief=chief)

        if chief:
            sess.run(debug['run_code'].assign(run_code))
            if not mute:
                print("Set global run code to:", run_code)

        if not mute:
            print("initialized variables, sleeping for 1 sec")
        time.sleep(2)

        if not chief:
            while not sess.run(tf.is_variable_initialized(debug['run_code'])):
                if not mute:
                    print("Global run code not yet initialized")
                time.sleep(2)
            run_code = str(sess.run(debug['run_code']).decode())
            if run_code == '':
                if not mute:
                    print("Run code empty. Trying to fetch again...")
                time.sleep(5)
            if not mute:
                print("Read global run code:", run_code)

        run_code += "(w" + str(task) + ")"
        print("Final run_code:", run_code)

        t_global_old = update_weights()[0][0]
        update_target()
        exp_gen = 1000  # For how many timesteps should we only generate experience (not train)
        t_start = exp_gen
        comm_rounds = 0
        comm_rounds_global = 0
        dt = 0
        write_csv(run_code, log=["episode", "reward" + str(task), "avg_reward" + str(task), "t_global", "cr"])

        episode_rewards = [0.0]
        cr_reward = 0
        obs = env.reset()
        for t in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            cr_reward += rew

            # Animate every <animate> episodes
            if not mute and chief and animate > 0 and (len(episode_rewards) % animate) == 0:
                if done:
                    print("ep", len(episode_rewards), "ended with reward:", episode_rewards[-1])
                env.render()

            if done:
                if not mute and chief and draw > 0 and len(episode_rewards) % draw == 0:
                    env.render()
                avg_rew = np.round(np.mean(np.array(episode_rewards[-100:])), 1)
                write_csv(run_code, [len(episode_rewards), episode_rewards[-1], avg_rew, debug['t_global']()[0], comm_rounds_global])

                obs = env.reset()
                episode_rewards.append(0)

            [converged] = sync_opt['check_converged']()
            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= max_reward or converged
            if is_solved or comm_rounds >= max_comm_rounds:
                sync_opt['set_converged']([True])
                if not mute:
                    print("Converged was set to", sync_opt['check_converged']()[0])
                write_csv_final(run_code, str(len(episode_rewards)), worker_hosts, chief, comm_rounds_global, mute)
                print("Converged after:  ", len(episode_rewards), "episodes")
                print("Agent total steps:", t)
                print("Global steps:     ", debug['t_global']()[0])
                sec = round(time.time() - timer)
                print("Total time:", sec // 3600, "h", (sec % 3600) // 60, "min", sec % 60, "s")
                return
            else:
                if t >= exp_gen:
                # if t >= batch_size:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    td_error = train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

                    if t - t_start >= np.round(epoch.value(comm_rounds)):  

                        cr_old = comm_rounds_global

                        # Apply gradients to weights in PS
                        if sync:
                            # Tell the ps we are done and want to submit score
                            [[comm_rounds_global], [worker_count]] = sync_opt['request_submit']()

                            if comm_rounds_global == comm_rounds:
                                if worker_count <= sync_workers:
                                    # If allowed to submit score, do it
                                    [comm_rounds_global] = sync_opt['submit_score']([cr_reward])

                                    if chief: 
                                        [submits] = sync_opt['set_submit']([0])
                                        while worker_count != sync_workers:
                                            if sync_opt['check_converged']()[0]:
                                                if not mute:
                                                    print("Other worker converged! Finishing in check_wc")
                                                break
                                            worker_count = sync_opt['check_wc']()[0]

                                    while sync_opt['check_submit']()[0] == -1:
                                        if sync_opt['check_converged']()[0]:
                                            if not mute:
                                                print("Other worker converged! Finishing in check_submit")
                                            break
                                      
                                        pass

                                    if sync_opt['check_converged']()[0]:
                                        if not mute:
                                            print("Other worker converged! Continuing before submit")
                                        continue

                                    # Now all eligible workers have sent their score and gradient round has started
                                    # Submit gradient
                                    # TODO 4th argument overrides everything else unless it is set to -1 in the code
                                    [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old],
                                                                              [cr_reward], [1/len(worker_hosts)], [True])

                                    submits = sync_opt['inc_submit']()
                                    if chief:
                                        while not sync_opt['check_submit']()[0] == sync_workers:
                                            if sync_opt['check_converged']()[0]:
                                                if not mute:
                                                    print("Other worker converged! Finishing in check_submit (chief)")
                                                break
                                          
                                            pass
                                        # print("Round", comm_rounds, "finished")
                                        [w] = sync_opt['reset_wc']()[0]
                                        # print("Worker count reset to:", w)
                                        sync_opt['reset_score']()
                                        submits = sync_opt['set_submit']([-1])
                                        # print("Submit round finished. Submits set to:", submits[0])
                                        [r] = sync_opt['inc_comm_round']()[0]
                                        # print("New round started:", r)

                                    # Normal workers wait until GCR > CR
                                    if not chief:
                                        while sync_opt['check_round']()[0] <= comm_rounds:
                                            if sync_opt['check_converged']()[0]:
                                                if not mute:
                                                    print("Other worker converged! Finishing in check_round")
                                                break
                                            # print("Worker submitted, waiting for next round:", comm_rounds + 1)
                                            # time.sleep(0.1)
                                            pass

                                else: #elif worker_count > sync_workers:
                                    # If not allowed to submit score, wait for next round to start
                                    if not mute:
                                        print("Worker finished too late but before new round started (", comm_rounds_global, ")")
                                        print("WC(", worker_count, ") > N(", sync_workers, ")", sep="")
                                    target = np.floor(comm_rounds_global + 1)  # +1 if x.0, +0.5 if x.5
                                    while not sync_opt['check_round']()[0] >= target:
                                        pass

                            elif comm_rounds_global > comm_rounds:
                                # This means the worker is behind. Do nothing and start next round
                                if not mute:
                                    print("Communication round ", comm_rounds, "missed. Actual round:", comm_rounds_global)
                                # TODO How to handle round count when skipping rounds?
                                comm_rounds = comm_rounds_global - 1

                            elif comm_rounds_global < comm_rounds:
                                print("WARNING! Worker ahead of global:", comm_rounds, ">", comm_rounds_global)
                                time.sleep(5)

                        else:
                            sync_opt['inc_comm_round']()
                            [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old], [0], [-1], [False])

                        # Update the local weights with the new global weights from PS
                        t_global_old = update_weights()[0][0]

                        comm_rounds += 1
                        # print("Round finished. Increasing local comm_round to:", comm_rounds)
                        cr_reward = 0
                        # TODO RE-ENABLE comm-rounds LOGGING
                        # write_csv(run_code, [comm_rounds, t, dt, epoch.value(comm_rounds)], comm_rounds=True)

                        t_start = t
                if t % target_update == 0:
                    update_target()

            if not mute and done and len(episode_rewards) % 10 == 0:
                last_rewards = episode_rewards[-101:-1]
                logger.record_tabular("steps", t)
                logger.record_tabular("global steps", debug['t_global']()[0])
                logger.record_tabular("communication rounds", comm_rounds)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", np.round(np.mean(episode_rewards[-101:-1]), 4))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.record_tabular("last gradient factor", np.round(factor, 4))
                logger.dump_tabular()
                rew_ill = ['●' if x >= max_reward else str(int(np.floor(x / (max_reward/10)))) if x >= (max_reward/10) else '_' for x in last_rewards]
                streak = 0
                for i in reversed(rew_ill):
                    if i == "●":
                        streak += 1
                    else:
                        break
                #print("[" + ''.join(rew_ill) + "] ([● " + str(rew_ill.count('●')) + " | " + str(rew_ill.count('9')) +
                      " | " + str(rew_ill.count('8')) + " | " + str(rew_ill.count('7')) +
                      " | " + str(rew_ill.count('6')) + " | " + str(rew_ill.count('5')) +
                      " | " + str(rew_ill.count('4')) + " | " + str(rew_ill.count('3')) +
                      " | " + str(rew_ill.count('2')) + " | " + str(rew_ill.count('1')) +
                      " | " + str(rew_ill.count('_')) + " _]/" + str(len(rew_ill)) + " {S:" + str(streak) + "})", sep='')
Exemplo n.º 45
0
def train(env,
          policy,
          policy_init,
          n_episodes,
          horizon,
          seed,
          njobs=1,
          save_weights=False,
          **alg_args):

    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            _env = Rllab2GymWrapper(env_rllab)
            return _env

        # Used later
        env_type = 'rllab'
    else:
        # Normal gym, get if Atari or not.
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari, custom env creation
            def make_env():
                _env = make_atari(env)
                return wrap_deepmind(_env)
        else:
            # Not atari, standard env creation
            def make_env():
                env_rllab = gym.make(env)
                return env_rllab

    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    if policy_init == 'xavier':
        policy_initializer = tf.contrib.layers.xavier_initializer()
    elif policy_init == 'zeros':
        policy_initializer = U.normc_initializer(0.0)
    else:
        raise Exception('Unrecognized policy initializer.')

    if policy == 'linear' or policy == 'nn':

        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name,
                             ob_space=ob_space,
                             ac_space=ac_space,
                             hid_size=hid_size,
                             num_hid_layers=num_hid_layers,
                             gaussian_fixed_var=True,
                             use_bias=False,
                             use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    elif policy == 'cnn':

        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name,
                             ob_space=ob_space,
                             ac_space=ac_space,
                             gaussian_fixed_var=True,
                             use_bias=False,
                             use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    else:
        raise Exception('Unrecognized policy type.')

    sampler = ParallelSampler(make_policy,
                              make_env,
                              n_episodes,
                              horizon,
                              True,
                              n_workers=njobs,
                              seed=seed)

    try:
        affinity = len(os.sched_getaffinity(0))
    except:
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)

    gym.logger.setLevel(logging.WARN)

    pois.learn(make_env,
               make_policy,
               n_episodes=n_episodes,
               horizon=horizon,
               sampler=sampler,
               save_weights=save_weights,
               **alg_args)

    sampler.close()
Exemplo n.º 46
0
def behavioral_cloning_nn(num_epochs,
                          num_layers,
                          num_hidden,
                          X,
                          Y,
                          validation=0.2,
                          lr=1e-4,
                          l2=0.,
                          batch_size=128,
                          init_logstd=1.,
                          state_dependent_variance=True,
                          starting_point='',
                          discrete=False,
                          beta=1.0):
    input_dim = X.shape[-1]
    output_dim = Y.shape[-1]
    observation_space = Box(low=-np.inf, high=np.inf, shape=(input_dim, ))
    if discrete:
        action_space = Discrete(n=len(np.unique(Y)))
    else:
        action_space = Box(low=-np.inf, high=np.inf, shape=(output_dim, ))
    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            inter_op_parallelism_threads=8,
                            intra_op_parallelism_threads=8,
                            device_count={'CPU': 8})

    config.gpu_options.allow_growth = True
    sess = U.make_session(make_default=True, config=config)
    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    policy_train = build_policy(
        observation_space,
        action_space,
        network,
        l2=l2,
        lr=lr,
        trainable_variance=state_dependent_variance,
        init_logstd=init_logstd,
        beta=beta,
        state_dependent_variance=state_dependent_variance)()
    U.initialize()
    if starting_point != '':
        policy_train.load(starting_point)
    # dataset build
    states = X
    actions = Y

    if discrete:
        print("Original Dataset Size:", states.shape[0])
        classes = np.unique(Y)
        class_counts = np.array([np.sum(Y == cl) for cl in classes])
        max_count = max(class_counts)
        ratios = class_counts / max_count
        print("Class Distribution:", class_counts / states.shape[0])
        print("Class ratios:", ratios)
        states_to_add = []
        actions_to_add = []
        for j, ratio in enumerate(ratios):

            if ratio != 1:
                for i in range(int(1 / ratio)):
                    states_to_add += states[actions == classes[j]].tolist()
                    actions_to_add += actions[actions == classes[j]].tolist()
                remaining = int((1 / ratio - int(1 / ratio)) * class_counts[j])
                all_indexes = np.array([x for x in range(class_counts[j])])
                random.shuffle(all_indexes)
                shuffled_indexes = all_indexes[0:remaining]
                states_to_add += states[actions ==
                                        classes[j]][shuffled_indexes].tolist()
                actions_to_add += actions[
                    actions == classes[j]][shuffled_indexes].tolist()
        states_to_add = np.array(states_to_add)
        actions_to_add = np.array(actions_to_add)
        states = np.concatenate([states, states_to_add], axis=0)
        actions = np.concatenate([actions, actions_to_add], axis=0)
        print("Oversampled Dataset Size", states.shape[0])
    dataset = list(zip(states, actions))
    random.shuffle(dataset)
    if validation > 0.:
        k = math.floor(validation * len(dataset))
        dataset_training = dataset[:-k]
        dataset_validation = dataset[-k:]
    else:
        dataset_training = dataset[:]

    # pre-processing statistics
    num_batches = len(dataset_training) // batch_size
    num_batches += (0 if len(dataset_training) % batch_size == 0 else 1)
    print('# batches: ', num_batches)
    print('# training samples: ', len(dataset_training))
    logger = {
        'training_samples': len(dataset_training),
        'batch_size': batch_size,
        'num_batches': num_batches,
        'num_epochs': num_epochs
    }
    if validation > 0.:
        print('# validation samples: ', len(dataset_validation))
        logger['validation_samples'] = len(dataset_validation)

        # validation samples built
        X_val, y_val = zip(*dataset_validation)
        X_val, y_val = np.array(X_val), np.array(y_val)
    # train + accuracy over epochs
    counter = 0
    best_loss = np.inf
    for epoch in trange(num_epochs):
        # train batches built
        random.shuffle(dataset_training)
        batches = []
        for i in range(num_batches):
            base = batch_size * i
            batches.append(dataset_training[base:base + batch_size])
        # train
        if validation > 0.:
            target = y_val
            accuracy, _, loss = policy_train.evaluate(X_val[:], target, False)
            if epoch % 1 == 0 and loss <= best_loss:
                best_loss = loss
        else:
            pass
        for batch in batches:

            batch_X, batch_y = zip(*batch)
            target = batch_y
            output = policy_train.fit(batch_X, target)
            summaries = [
                tf.Summary.Value(tag="loss", simple_value=output[0]),
                tf.Summary.Value(tag="r2", simple_value=output[1])
            ]
            if not discrete:
                summaries += [
                    tf.Summary.Value(tag="mean_std", simple_value=output[2]),
                    tf.Summary.Value(tag="min_std", simple_value=output[3]),
                    tf.Summary.Value(tag="max_std", simple_value=output[4])
                ]
            else:
                summaries += [
                    tf.Summary.Value(tag="entropy", simple_value=output[2]),
                    tf.Summary.Value(tag="stochastic_accuracy",
                                     simple_value=output[3])
                ]
            counter += 1
        # validation
    if validation > 0.:
        target = y_val
        accuracy, _, loss = policy_train.evaluate(X_val[:], target, False)
        summary = tf.Summary(value=[
            tf.Summary.Value(tag="accuracy", simple_value=accuracy),
            tf.Summary.Value(tag="test_loss", simple_value=loss)
        ])
        if num_epochs % 1 == 0 and loss <= best_loss:
            best_loss = loss
    batch_X, batch_Y = zip(*dataset)
    _, _, loss, ll = policy_train.evaluate(batch_X[:], batch_Y[:], False)
    logger['cost'] = loss
    logger['ll'] = ll
    return policy_train, logger, None
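# Added illustration (toy run of the class-balancing step above, not part of
# the original function): each minority class gets int(1/ratio) extra copies of
# its samples plus a fractional remainder, so roughly max_count samples are
# added per minority class on top of the originals.
import numpy as np

Y_toy = np.array([0] * 64 + [1] * 16 + [2] * 8)          # imbalanced labels
classes = np.unique(Y_toy)
class_counts = np.array([np.sum(Y_toy == cl) for cl in classes])
ratios = class_counts / class_counts.max()
extra = [int(1 / r) * c + int((1 / r - int(1 / r)) * c) if r != 1 else 0
         for r, c in zip(ratios, class_counts)]
print(class_counts)                     # -> [64 16  8]
print(class_counts + np.array(extra))   # -> [64 80 72]  (originals + added copies)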