Example #1
def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log):
    """
    Train a TRPO model on the given MuJoCo environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/' + str(
            env_id) + './OURS-LOADED/noent_klcoeffanneal_samesgdsteps' + str(
                sgd_steps) + '_longer_wgae0.95_exp1_2_' + str(seed)
        #log_path = './experiments/'+str(env_id)+'./TRPO-3x/TRPOR-oldsampling/noent_klcoeff'+str(sgd_steps)+'_sgdstep_steps5_'+str(seed)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        #env = make_mujoco_env(env_id, workerseed)
        def make_env():
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            return env_out

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)  #, norm_reward=False, norm_obs=False)

        #env = VecNormalize(env)
        model = TRPO(MlpPolicy,
                     env,
                     timesteps_per_batch=2048,
                     max_kl=0.01,
                     cg_iters=10,
                     cg_damping=0.1,
                     entcoeff=0.0,
                     gamma=0.99,
                     lam=0.95,
                     vf_iters=5,
                     vf_stepsize=1e-3,
                     verbose=1,
                     seed=seed,
                     sgd_steps=sgd_steps,
                     klcoeff=klcoeff,
                     method="multistep-SGD")
        model.learn(total_timesteps=int(10e6))  # num_timesteps, seed=seed
        env.close()
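A minimal call-site sketch for Example #1 (the listing never shows an invocation; the argument values below are placeholders, not settings taken from the source, and note that the body ignores num_timesteps and trains for a fixed int(10e6) steps):

if __name__ == '__main__':
    train(env_id='Hopper-v2', num_timesteps=int(10e6), seed=0,
          lam=0.95, sgd_steps=10, klcoeff=0.2, log=False)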
Example #2
    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.verbose <= 1:
            os.environ['TF_CPP_MIN_LOG_LEVEL'] = self.tf_level

        if self.verbose <= 0:
            logger.set_level(self.log_level)
            gym.logger.set_level(self.gym_level)
Example #3
def train(env_id, num_timesteps, seed):
    """
    Train a TRPO model on the given MuJoCo environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        tblog = "/cvgl2/u/surajn/workspace/tb_logs/reacher/"
        env = make_mujoco_env(env_id, workerseed)
        model = TRPO(MlpPolicy,
                     env,
                     timesteps_per_batch=1024,
                     max_kl=0.01,
                     cg_iters=10,
                     cg_damping=0.1,
                     entcoeff=0.0,
                     gamma=0.99,
                     lam=0.98,
                     vf_iters=5,
                     vf_stepsize=1e-3,
                     tensorboard_log=tblog)
        model.learn(total_timesteps=num_timesteps)
        env.close()
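The per-worker seeding line above (`seed + 10000 * rank`) recurs throughout these examples; isolated as a helper, it is just a deterministic per-rank offset, so each MPI worker samples differently while the whole run stays reproducible:

from mpi4py import MPI

def worker_seed(base_seed):
    # distinct, reproducible seed for each MPI rank
    return base_seed + 10000 * MPI.COMM_WORLD.Get_rank()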
Example #4
def train(env_id, num_timesteps, run, kappa, vf_phi_update_interval, log):
    """
    Train a TRPO model on the given MuJoCo environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param run: (int) the run number, also used as the training seed
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/'+str(env_id)+'./updated_nkappa_x7_ent_0.01_new/'+str(kappa)+'_'+str(vf_phi_update_interval)+'_'+str(run)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)
        seed = run
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        #set_global_seeds(run)
        env = make_mujoco_env(env_id, workerseed)
        test_env = None  # make_mujoco_env(env_id, workerseed)
        model = TRPO(MlpPolicy, env, test_env=test_env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.01,
                     gamma=0.99, kappa=kappa, vf_iters=5, vf_stepsize=1e-3, verbose=1, vf_phi_update_interval=vf_phi_update_interval, seed=run)
        model.learn(total_timesteps=int(2e6), seed=run)
        #model.save("./"+str(env_id)+"./models/"+str(kappa)+"_"+str(run)+'_xnew_longer_slower'+str(vf_phi_update_interval)+'.pkl')
        env.close()
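A hypothetical call site for Example #4; note that `run` doubles as the seed (`seed = run` above) and that `num_timesteps` is accepted but unused, since learn() is called with a fixed int(2e6):

train(env_id='HalfCheetah-v2', num_timesteps=int(2e6), run=0,
      kappa=0.5, vf_phi_update_interval=10, log=False)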
Example #5
def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log):
    """
    Train an MDPO model on the given MuJoCo environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/' + str(
            env_id) + './SAC-M/nips_test19/m' + str(sgd_steps) + '_c' + str(
                0.5) + '_e' + str(klcoeff) + '_' + str(seed)
        #log_path = './experiments/'+str(env_id)+'./TRPO-3x/TRPOR-oldsampling/noent_klcoeff'+str(sgd_steps)+'_sgdstep_steps5_'+str(seed)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        #env = make_mujoco_env(env_id, workerseed)
        def make_env():
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            return env_out

        env = DummyVecEnv([make_env])
        env = VecNormalize(env, norm_reward=False, norm_obs=False)

        #env = VecNormalize(env)
        model = MDPO(MlpPolicy,
                     env,
                     gamma=0.99,
                     verbose=1,
                     seed=seed,
                     buffer_size=1000000,
                     ent_coef=1.0,
                     gradient_steps=sgd_steps,
                     lam=klcoeff,
                     train_freq=1,
                     tsallis_q=1,
                     reparameterize=True,
                     klconst=0.5)
        model.learn(
            total_timesteps=int(num_timesteps))  #num_timesteps, seed=seed)
        env.close()
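The make_env/DummyVecEnv pattern above wraps a single monitored environment; it generalizes to several copies by passing more constructors. A sketch under the same imports as the example (env_id and seed as free variables; the per-copy seed offset keeps rollouts decorrelated):

def make_env_fn(rank):
    def _init():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out, logger.get_dir(),
                                allow_early_resets=True)
        env_out.seed(seed + rank)
        return env_out
    return _init

env = DummyVecEnv([make_env_fn(i) for i in range(4)])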
Example #6
    def __enter__(self):
        self.tf_level = os.environ.get('TF_CPP_MIN_LOG_LEVEL', '0')
        self.log_level = logger.get_level()
        self.gym_level = gym.logger.MIN_LEVEL

        if self.verbose <= 1:
            os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

        if self.verbose <= 0:
            logger.set_level(logger.DISABLED)
            gym.logger.set_level(gym.logger.DISABLED)
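Examples #2 and #6 are the two halves of one log-suppression context manager (this appears to match stable-baselines' SetVerbosity helper). Assembled under a hypothetical standalone name, the pattern reads:

import os
import gym
from stable_baselines import logger

class SuppressLogging:
    """Silence TF, the stable-baselines logger and the gym logger inside a with-block."""

    def __init__(self, verbose=0):
        self.verbose = verbose

    def __enter__(self):  # Example #6: save the current levels, then lower them
        self.tf_level = os.environ.get('TF_CPP_MIN_LOG_LEVEL', '0')
        self.log_level = logger.get_level()
        self.gym_level = gym.logger.MIN_LEVEL
        if self.verbose <= 1:
            os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        if self.verbose <= 0:
            logger.set_level(logger.DISABLED)
            gym.logger.set_level(gym.logger.DISABLED)

    def __exit__(self, exc_type, exc_val, exc_tb):  # Example #2: restore the levels
        if self.verbose <= 1:
            os.environ['TF_CPP_MIN_LOG_LEVEL'] = self.tf_level
        if self.verbose <= 0:
            logger.set_level(self.log_level)
            gym.logger.set_level(self.gym_level)

with SuppressLogging(verbose=0):
    pass  # noisy setup code runs silently here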
Example #7
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the desired noise type ('adaptive-param', 'normal' or 'ou'); multiple noise types can be
        combined by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()
    model = DDPG(policy=MlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=eval_env,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 memory_limit=int(1e6),
                 layer_norm=layer_norm,
                 verbose=2,
                 **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
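The noise_type grammar parsed above accepts comma-separated specs of the form name_stddev; hypothetical examples:

# 'none'                      -> no noise
# 'adaptive-param_0.2'        -> AdaptiveParamNoiseSpec with stddev 0.2
# 'normal_0.1'                -> NormalActionNoise, sigma = 0.1 * ones(nb_actions)
# 'ou_0.2'                    -> OrnsteinUhlenbeckActionNoise, sigma = 0.2 * ones(nb_actions)
# 'adaptive-param_0.2,ou_0.1' -> parameter noise combined with action noise
run('MountainCarContinuous-v0', seed=0, noise_type='ou_0.2',
    layer_norm=True, evaluation=False)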
Example #8
def main(args):

    logger.configure(SIMPLE.config.LOGDIR)

    if args.debug:
        logger.set_level(SIMPLE.config.DEBUG)
    else:
        logger.set_level(SIMPLE.config.INFO)

    #make environment
    env = get_environment(args.env_name)(verbose=args.verbose,
                                         manual=args.manual)
    env.seed(args.seed)

    total_rewards = {}

    if args.recommend:
        ppo_model = load_model(env, 'best_model.zip')
        ppo_agent = Agent('best_model', ppo_model)
    else:
        ppo_agent = None

    agents = []

    #load the agents
    if len(args.agents) != env.n_players:
        raise Exception(
            f'{len(args.agents)} players specified but this is a {env.n_players} player game!'
        )

    for i, agent in enumerate(args.agents):
        if agent == 'human':
            agent_obj = Agent('human')
        elif agent == 'rules':
            agent_obj = Agent('rules')
        elif agent == 'base':
            base_model = load_model(env, 'base.zip')
            agent_obj = Agent('base', base_model)
        else:
            ppo_model = load_model(env, f'{agent}.zip')
            agent_obj = Agent(agent, ppo_model)
        agents.append(agent_obj)
        total_rewards[agent_obj.id] = 0

    #play games
    logger.info(f'\nPlaying {args.games} games...')
    for game in range(args.games):
        players = agents[:]

        if args.randomise_players:
            random.shuffle(players)

        obs = env.reset()
        done = False

        for i, p in enumerate(players):
            logger.debug(f'Player {i+1} = {p.name}')

        while not done:

            current_player = players[env.current_player_num]
            env.render()
            logger.debug(f'\nCurrent player name: {current_player.name}')

            if args.recommend and current_player.name in ['human', 'rules']:
                # show recommendation from last loaded model
                logger.debug(f'\nRecommendation by {ppo_agent.name}:')
                action = ppo_agent.choose_action(env,
                                                 choose_best_action=True,
                                                 mask_invalid_actions=True)

            if current_player.name == 'human':
                action = input('\nPlease choose an action: ')
                try:
                    # for int actions
                    action = int(action)
                except ValueError:
                    # for MultiDiscrete action input as a list (TODO)
                    action = eval(action)
            elif current_player.name == 'rules':
                logger.debug(f'\n{current_player.name} model choices')
                action = current_player.choose_action(
                    env, choose_best_action=False, mask_invalid_actions=True)
            else:
                logger.debug(f'\n{current_player.name} model choices')
                action = current_player.choose_action(
                    env,
                    choose_best_action=args.best,
                    mask_invalid_actions=True)

            obs, reward, done, _ = env.step(action)

            for r, player in zip(reward, players):
                total_rewards[player.id] += r
                player.points += r

            if args.cont:
                input('Press Enter to continue')

        env.render()

        logger.info(f"Played {game + 1} games: {total_rewards}")

        if args.write_results:
            write_results(players, game, args.games, env.turns_taken)

        for p in players:
            p.points = 0

    env.close()
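The bare eval() in the human-input branch above executes arbitrary text; a safer stand-in for parsing a MultiDiscrete action entered as a list (e.g. "[1, 0, 2]") is ast.literal_eval, sketched here as a hypothetical helper:

import ast

def parse_action(text):
    try:
        return int(text)  # plain Discrete action
    except ValueError:
        return ast.literal_eval(text)  # list input for MultiDiscrete actions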
Example #9
File: train.py  Project: iCodeIN/SIMPLE
def main(args):

    rank = MPI.COMM_WORLD.Get_rank()

    model_dir = os.path.join(config.MODELDIR, args.env_name)

    if rank == 0:
        os.makedirs(model_dir, exist_ok=True)
        if args.reset:
            reset_files(model_dir)
        logger.configure(config.LOGDIR)
    else:
        logger.configure(format_strs=[])

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        time.sleep(5)
        logger.set_level(config.INFO)

    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    logger.info('\nSetting up the selfplay training environment opponents...')
    base_env = get_environment(args.env_name)
    env = selfplay_wrapper(base_env)(opponent_type=args.opponent_type,
                                     verbose=args.verbose)
    env.seed(workerseed)

    CustomPolicy = get_network_arch(args.env_name)

    params = {
        'gamma': args.gamma,
        'timesteps_per_actorbatch': args.timesteps_per_actorbatch,
        'clip_param': args.clip_param,
        'entcoeff': args.entcoeff,
        'optim_epochs': args.optim_epochs,
        'optim_stepsize': args.optim_stepsize,
        'optim_batchsize': args.optim_batchsize,
        'lam': args.lam,
        'adam_epsilon': args.adam_epsilon,
        'schedule': 'linear',
        'verbose': 1,
        'tensorboard_log': config.LOGDIR
    }

    time.sleep(5)  # allow time for the base model to be saved out when the environment is created

    if args.reset or not os.path.exists(
            os.path.join(model_dir, 'best_model.zip')):
        logger.info('\nLoading the base PPO agent to train...')
        model = PPO1.load(os.path.join(model_dir, 'base.zip'), env, **params)
    else:
        logger.info(
            '\nLoading the best_model.zip PPO agent to continue training...')
        model = PPO1.load(os.path.join(model_dir, 'best_model.zip'), env,
                          **params)

    #Callbacks
    logger.info(
        '\nSetting up the selfplay evaluation environment opponents...')
    callback_args = {
        'eval_env': selfplay_wrapper(base_env)(opponent_type=args.opponent_type,
                                               verbose=args.verbose),
        'best_model_save_path': config.TMPMODELDIR,
        'log_path': config.LOGDIR,
        'eval_freq': args.eval_freq,
        'n_eval_episodes': args.n_eval_episodes,
        'deterministic': False,
        'render': True,
        'verbose': 0
    }

    if args.rules:
        logger.info(
            '\nSetting up the evaluation environment against the rules-based agent...'
        )
        # Evaluate against a 'rules' agent as well
        eval_actual_callback = EvalCallback(
            eval_env=selfplay_wrapper(base_env)(opponent_type='rules',
                                                verbose=args.verbose),
            eval_freq=1,
            n_eval_episodes=args.n_eval_episodes,
            deterministic=args.best,
            render=True,
            verbose=0)
        callback_args['callback_on_new_best'] = eval_actual_callback

    # Evaluate the agent against previous versions
    eval_callback = SelfPlayCallback(args.opponent_type, args.threshold,
                                     args.env_name, **callback_args)

    logger.info('\nSetup complete - commencing learning...\n')

    model.learn(total_timesteps=int(1e9),
                callback=[eval_callback],
                reset_num_timesteps=False,
                tb_log_name="tb")

    env.close()
    del env
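The params dict above works because PPO1.load accepts keyword overrides that replace hyperparameters stored in the saved zip; a minimal sketch of the same mechanism with a few placeholder values:

model = PPO1.load(os.path.join(model_dir, 'base.zip'), env,
                  gamma=0.99, schedule='linear', verbose=1)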
Example #10
def train(env_id, algo, num_timesteps, seed, sgd_steps, t_pi, t_c, lam, log,
          expert_path, pretrain, pretrain_epochs, mdpo_update_steps,
          num_trajectories, expert_model, exploration_bonus, bonus_coef,
          random_action_len, is_action_features, dir_name, neural, lipschitz,
          args):
    """
    Train or evaluate an SAC expert, or train an imitation-learning model (MDAL / GAIL variants), on the given MuJoCo environment

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """

    with tf_util.single_threaded_session():
        # from mpi4py import MPI
        # rank = MPI.COMM_WORLD.Get_rank()
        rank = 0
        env_name = env_id[:-3].lower()
        log_dir = './experiments/' + env_name + '/' + str(algo).lower() + '/'\
                  + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam)
        log_dir += '_' + dir_name + '/'
        log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps)
        # log_name += '_randLen' + str(random_action_len)
        if exploration_bonus:
            log_name += '_exploration' + str(bonus_coef)
        if pretrain:
            log_name += '_pretrain' + str(pretrain_epochs)
        if not is_action_features:
            log_name += "_states_only"
        log_name += '_s' + str(seed)

        log_path = log_dir + log_name
        expert_path = './experts/' + expert_path

        num_timesteps = int(num_timesteps)

        args = args.__dict__

        dir_path = os.getcwd() + log_dir[1:]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            with open(os.getcwd() + log_dir[1:] + 'args.txt', 'w') as file:
                file.write("Experiment Arguments:")
                for key, val in args.items():
                    print(key, ": ", val, file=file)

        if log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        # env = make_mujoco_env(env_id, workerseed)
        def make_env():
            # env_out = gym.make(env_id, reset_noise_scale=1.0)
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
            return env_out

        #

        env = DummyVecEnv([make_env])
        # env = VecNormalize(env)

        train = (algo == 'Train')
        eval = (algo == 'Evaluate')

        if train:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)

            if num_timesteps > 0:
                model = SAC('MlpPolicy',
                            env_id,
                            verbose=1,
                            buffer_size=1000000,
                            batch_size=256,
                            ent_coef='auto',
                            train_freq=1,
                            tau=0.01,
                            gradient_steps=1,
                            learning_starts=10000)
            else:
                model = SAC.load(expert_model, env)
            generate_expert_traj(model,
                                 expert_path,
                                 n_timesteps=num_timesteps,
                                 n_episodes=num_trajectories)
            if num_timesteps > 0:
                model.save('sac_' + env_name + '_' + str(num_timesteps))
        elif eval:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            model = SAC.load(expert_model, env)
            generate_expert_traj(model,
                                 expert_path,
                                 n_timesteps=num_timesteps,
                                 n_episodes=10,
                                 evaluate=True)
        else:
            expert_path = expert_path + '.npz'
            dataset = ExpertDataset(expert_path=expert_path,
                                    traj_limitation=10,
                                    verbose=1)

            if algo == 'MDAL':
                model = MDAL_MDPO_OFF('MlpPolicy',
                                      env,
                                      dataset,
                                      verbose=1,
                                      tensorboard_log="./experiments/" +
                                      env_name + "/mdal/",
                                      seed=seed,
                                      buffer_size=1000000,
                                      ent_coef=0.0,
                                      learning_starts=10000,
                                      batch_size=256,
                                      tau=0.01,
                                      gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps,
                                      lam=0.0,
                                      train_freq=1,
                                      d_step=10,
                                      tsallis_q=1,
                                      reparameterize=True,
                                      t_pi=t_pi,
                                      t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      neural=neural,
                                      lipschitz=lipschitz)
            elif algo == 'MDAL_ON_POLICY':
                model = MDAL_MDPO_ON('MlpPolicy',
                                     env,
                                     dataset,
                                     verbose=1,
                                     timesteps_per_batch=2048,
                                     tensorboard_log="./experiments/" +
                                     env_name + "/mdal_mdpo_on/",
                                     seed=seed,
                                     max_kl=0.01,
                                     cg_iters=10,
                                     cg_damping=0.1,
                                     entcoeff=0.0,
                                     adversary_entcoeff=0.001,
                                     gamma=0.99,
                                     lam=0.95,
                                     vf_iters=5,
                                     vf_stepsize=1e-3,
                                     sgd_steps=sgd_steps,
                                     klcoeff=1.0,
                                     method="multistep-SGD",
                                     tsallis_q=1.0,
                                     t_pi=t_pi,
                                     t_c=t_c,
                                     exploration_bonus=exploration_bonus,
                                     bonus_coef=bonus_coef,
                                     is_action_features=is_action_features,
                                     neural=neural)

            elif algo == 'MDAL_TRPO':
                model = MDAL_TRPO('MlpPolicy',
                                  env,
                                  dataset,
                                  verbose=1,
                                  tensorboard_log="./experiments/" + env_name +
                                  "/mdal_trpo/",
                                  seed=seed,
                                  gamma=0.99,
                                  g_step=3,
                                  d_step=5,
                                  sgd_steps=1,
                                  d_stepsize=9e-5,
                                  entcoeff=0.0,
                                  adversary_entcoeff=0.001,
                                  max_kl=t_pi,
                                  t_pi=t_pi,
                                  t_c=t_c,
                                  exploration_bonus=exploration_bonus,
                                  bonus_coef=bonus_coef,
                                  is_action_features=is_action_features,
                                  neural=neural,
                                  lam=0.98,
                                  timesteps_per_batch=2000,
                                  lipschitz=lipschitz)

            elif algo == 'GAIL':
                from mpi4py import MPI
                from stable_baselines import GAIL

                model = GAIL('MlpPolicy',
                             env,
                             dataset,
                             verbose=1,
                             tensorboard_log="./experiments/" + env_name +
                             "/gail/",
                             seed=seed,
                             entcoeff=0.0,
                             adversary_entcoeff=0.001,
                             lipschitz=lipschitz)

            elif algo == 'GAIL_MDPO_OFF':
                # from mpi4py import MPI
                from stable_baselines import GAIL_MDPO_OFF

                model = GAIL_MDPO_OFF('MlpPolicy',
                                      env,
                                      dataset,
                                      verbose=1,
                                      tensorboard_log="./experiments/" +
                                      env_name + "/gail_mdpo_off/",
                                      seed=seed,
                                      ent_coef=0.0,
                                      adversary_entcoeff=0.001,
                                      buffer_size=1000000,
                                      learning_starts=10000,
                                      batch_size=256,
                                      tau=0.01,
                                      gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps,
                                      lam=0.0,
                                      train_freq=1,
                                      tsallis_q=1,
                                      reparameterize=True,
                                      t_pi=t_pi,
                                      t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      lipschitz=lipschitz)
            else:
                raise ValueError("Not a valid algorithm.")

            if pretrain:
                model.pretrain(dataset, n_epochs=pretrain_epochs)

            model.learn(total_timesteps=num_timesteps, tb_log_name=log_name)

        env.close()
Example #11
def train(params, model=None, path=None):
    if model: # indicate in filename that this is a finetune
        if params['name']:
            params['name'] += '_Finetune'
        else:
            params['name'] = 'Finetune'
    
    data_dir, tb_path = get_paths(params, path=path)
    print("Training Parameters: ", params)
    os.makedirs(data_dir, exist_ok=True)
    # Save parameters immediately
    params.save(data_dir)

    rank = mpi_rank_or_zero()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    
    def make_env(i):
        env = get_env(params)
        env = Monitor(env, data_dir + '/' + str(i), allow_early_resets=params['early_reset'])
        return env

    use_her = params['env_args'].get('use_her', False)

    if use_her:
        env = make_env(0)
        goal_selection_strategy = 'future'
    else:
        env = DummyVecEnv([(lambda n: lambda: make_env(n))(i) for i in range(params['num_proc'])])

    if model:  # print the action spaces when finetuning an existing model
        print("Model action space", model.action_space, model.action_space.low)
        print("Env action space", env.action_space, env.action_space.low)
    if params['normalize']:
        env = VecNormalize(env)
    if params['seed']:
        seed = params['seed'] + 100000 * rank
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed
    if 'noise' in params and params['noise']:
        from stable_baselines.ddpg import OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        params['alg_args']['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(params['noise'])*np.ones(n_actions))
    
    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        if use_her:
            from stable_baselines import HER
            model = HER(policy, env, alg, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy, verbose=1, 
                            tensorboard_log=tb_path, policy_kwargs=params['policy_args'], **params['alg_args'])
        else:
            model = alg(policy,  env, verbose=1, tensorboard_log=tb_path, policy_kwargs=params['policy_args'], **params['alg_args'])
    else:
        model.set_env(env)

    model.learn(total_timesteps=params['timesteps'], log_interval=params['log_interval'], callback=create_training_callback(data_dir, 
                                                    freq=params['eval_freq'], checkpoint_freq=params['checkpoint_freq']))
    print("######## SAVING MODEL TO", data_dir)
    model.save(data_dir +'/final_model')
    if params['normalize']:
        env.save(data_dir + '/normalized_environment.env')
    env.close()
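The HER branch above defers most arguments to params; spelled out with concrete stand-ins (hypothetical values; SAC stands in for whatever get_alg returns, and env must be a goal-based environment such as a gym GoalEnv):

from stable_baselines import HER, SAC

model = HER('MlpPolicy', env, SAC, n_sampled_goal=4,
            goal_selection_strategy='future', verbose=1)
model.learn(total_timesteps=100000)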
Example #12
def train(params, model=None, env=None): 
    print("Training Parameters: ", params)

    data_dir, tb_path = get_paths(params)
    os.makedirs(data_dir, exist_ok=True)
    # Save parameters immediately
    params.save(data_dir)

    rank = mpi_rank_or_zero()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create the environment if not given
    if env is None:  
        def make_env(i):
            env = get_env(params)
            print("ENV IN UTIL" ,env)
            # TODO: make monitor work for multiple agent.
            env = Monitor(env, data_dir + '/' + str(i), allow_early_resets=params['early_reset'])
            return env

        # if 'PPO' in params['alg']:
        #     env = DummyVecEnv([(lambda n: lambda: make_env(n))(i) for i in range(params['num_proc'])])
        # else:
        #     env = make_env(0)
        env = make_env(0)

        if params['normalize']:
            env = VecNormalize(env)
    # Set the seeds
    if params['seed']:
        seed = params['seed'] + 100000 * rank
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed

    if 'noise' in params and params['noise']:
        from stable_baselines.ddpg import OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        params['alg_args']['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(params['noise'])*np.ones(n_actions))
  

    print("ENV", env, env.action_space)
    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        model = alg(policy,  env, verbose=1, tensorboard_log=tb_path, policy_kwargs=params['policy_args'], **params['alg_args'])
    else:
        model.set_env(env)

    print("\n===============================\n")
    print("TENSORBOARD PATH:", tb_path)
    print("\n===============================\n")
    model.learn(total_timesteps=params['timesteps'], log_interval=params['log_interval'], 
                callback=create_training_callback(data_dir, params, env, freq=params['eval_freq'], checkpoint_freq=params['checkpoint_freq']))
    
    print("Saving model to", data_dir)
    model.save(data_dir +'/final_model')

    if params['normalize']:
        env.save(data_dir + '/environment.pkl')
        
    env.close()
def train(env, seed, policy_fn, reward_giver, dataset, algo, g_step, d_step, policy_entcoeff, num_timesteps,
          save_per_iter, checkpoint_dir, pretrained, bc_max_iter, task_name=None):
    """
    Train GAIL on MuJoCo

    :param env: (Gym Environment) the environment
    :param seed: (int) the initial random seed
    :param policy_fn: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator
    :param reward_giver: (TransitionClassifier) the reward predictor from observation and action
    :param dataset: (MujocoDset) the dataset manager
    :param algo: (str) the algorithm type (only 'trpo' is supported)
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param policy_entcoeff: (float) the weight of the entropy loss for the policy
    :param num_timesteps: (int) the number of timesteps to run
    :param save_per_iter: (int) the number of iterations before saving
    :param checkpoint_dir: (str) the location for saving checkpoints
    :param pretrained: (bool) use a pretrained behavior clone
    :param bc_max_iter: (int) the maximum number of training iterations for the behavior clone
    :param task_name: (str) the name of the task (can be None)
    """

    pretrained_weight = None
    #if pretrained and (bc_max_iter > 0):
    #    # Pretrain with behavior cloning
    #    pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, max_iters=bc_max_iter)

    if algo == 'trpo':
        # Set up for MPI seed
        # rank == 0 means the main thread
        # threads with rank != 0 are the workers
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)

        model = TRPO(policy_fn, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, gamma=0.995, lam=0.97,
                     entcoeff=policy_entcoeff, cg_damping=0.1, vf_stepsize=1e-3, vf_iters=5, _init_setup_model=False)

        # GAIL param
        # pretrained_weight = None
        model.pretrained_weight = pretrained_weight
        # Discriminator
        model.reward_giver = reward_giver
        model.expert_dataset = dataset
        model.save_per_iter = save_per_iter
        model.checkpoint_dir = checkpoint_dir
        model.g_step = g_step
        model.d_step = d_step
        model.task_name = task_name
        model.using_gail = True

        # policy model setup!
        model.setup_model()

        # policy model update!
        model.learn(total_timesteps=num_timesteps)
    else:
        raise NotImplementedError
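A hypothetical call site for this last example, assuming a prebuilt environment, policy factory, reward giver, and expert dataset (all placeholder names and values):

train(env, seed=0, policy_fn=policy_fn, reward_giver=reward_giver,
      dataset=dataset, algo='trpo', g_step=3, d_step=1,
      policy_entcoeff=0.0, num_timesteps=int(1e6), save_per_iter=100,
      checkpoint_dir='./checkpoints', pretrained=False, bc_max_iter=0,
      task_name='gail_task')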