def create_env(env_id, output_path, seed=0):
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(output_path, str(rank)), allow_early_resets=True)
    env.seed(seed)
    return env
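
These snippets assume the stable-baselines 2.x layout, where Monitor comes from stable_baselines.bench and set_global_seeds from stable_baselines.common. As a rough, self-contained sketch of the example above (the env ID and output path are placeholders; the per-rank offset keeps MPI workers from sharing seeds or monitor files):

import os
import gym
from mpi4py import MPI
from stable_baselines.bench import Monitor
from stable_baselines.common import set_global_seeds

def create_env(env_id, output_path, seed=0):
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)  # seed python/numpy/tf once per worker
    env = gym.make(env_id)
    # One monitor file per MPI rank, e.g. <output_path>/0.monitor.csv
    env = Monitor(env, os.path.join(output_path, str(rank)), allow_early_resets=True)
    env.seed(seed)
    return env

if __name__ == "__main__":
    os.makedirs("/tmp/monitor_logs", exist_ok=True)  # placeholder output path
    env = create_env("CartPole-v1", "/tmp/monitor_logs")  # placeholder env ID
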
Example #2
    def _init():
        set_global_seeds(seed + rank)
        env = gym.make(env_id, **env_kwargs)

        # Dict observation space is currently not supported.
        # https://github.com/hill-a/stable-baselines/issues/321
        # We allow a Gym env wrapper (a subclass of gym.Wrapper)
        if wrapper_class:
            env = wrapper_class(env)

        env.seed(seed + rank)
        log_file = os.path.join(log_dir,
                                str(rank)) if log_dir is not None else None
        env = Monitor(env, log_file)
        return env
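
The _init closure above is the usual factory pattern for vectorized environments. A minimal sketch of how such a factory is typically consumed, assuming stable-baselines 2.x and an illustrative make_env signature (the env ID and log directory are placeholders):

import os
import gym
from stable_baselines.bench import Monitor
from stable_baselines.common import set_global_seeds
from stable_baselines.common.vec_env import SubprocVecEnv

def make_env(env_id, rank, seed=0, log_dir=None, wrapper_class=None, env_kwargs=None):
    env_kwargs = env_kwargs or {}

    def _init():
        set_global_seeds(seed + rank)
        env = gym.make(env_id, **env_kwargs)
        if wrapper_class:
            env = wrapper_class(env)
        env.seed(seed + rank)
        log_file = os.path.join(log_dir, str(rank)) if log_dir is not None else None
        env = Monitor(env, log_file)  # filename=None keeps episode stats in memory only
        return env

    return _init

if __name__ == "__main__":
    os.makedirs("/tmp/monitor_logs", exist_ok=True)  # placeholder log directory
    # Four workers, each with its own seed offset and its own <rank>.monitor.csv file
    vec_env = SubprocVecEnv([make_env("CartPole-v1", i, log_dir="/tmp/monitor_logs")
                             for i in range(4)])
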
Example #3
 def _init():
     set_global_seeds(seed + rank)
     env = gym.make(env_id)
     if len(env_params) > 0:
         env = modify_env_params(env, params_path, **env_params)
     elif len(params_ranges) > 0:
         env = RandomUniformEnvParams(env,
                                      params_path,
                                      params_ranges,
                                      rank=rank)
     env.seed(seed + rank)
     env = Monitor(env,
                   os.path.join(log_dir, str(rank)),
                   allow_early_resets=True)
     return env
Example #4
    def _init():
        set_global_seeds(seed + rank)
        env = gym.make(env_id)

        # Dict observation space is currently not supported.
        # https://github.com/hill-a/stable-baselines/issues/321
        # We allow a Gym env wrapper (a subclass of gym.Wrapper)
        if wrapper_class:
            env = wrapper_class(env)

        env.seed(seed + rank)
        env = Monitor(env,
                      os.path.join(log_dir, str(rank)),
                      allow_early_resets=True)
        return env
Example #5
 def _init():
     set_global_seeds(seed)
     env = DonkeyVAEEnv(level=LEVEL,
                        frame_skip=frame_skip,
                        vae=vae,
                        const_throttle=None,
                        min_throttle=MIN_THROTTLE,
                        max_throttle=MAX_THROTTLE,
                        max_cte_error=MAX_CTE_ERROR,
                        n_command_history=N_COMMAND_HISTORY,
                        n_stack=n_stack)
     env.seed(seed)
     if not teleop:
         env = Monitor(env, log_dir, allow_early_resets=True)
     return env
Example #6
class SbPpo2():
    '''stable baselines PPO2'''
    def __init__(self, expt_name):
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/monitor_logs/' + expt_name

        # env = gym.make('LunarLanderContinuous-v2')
        env = gym.make('DeeplengDocking-v2')
        self.expt_name = expt_name
        self.env = Monitor(env, outdir)

    def __call__(self, *args, **kwargs):
        # eval_callback = EvalCallback(env, best_model_save_path=eval_dir,
        #                              log_path=eval_dir, eval_freq=500,
        #                              deterministic=True, render=False)
        policy_kwargs = dict(layers=[400, 300, 200, 100])
        model = PPO2(MlpPolicy,
                     self.env,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     tensorboard_log=
                     "/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log")

        model.learn(total_timesteps=int(1e5),
                    log_interval=50,
                    tb_log_name="ppo_Docker_" + self.expt_name)

        model.save(
            "/home/dfki.uni-bremen.de/mpatil/Documents/ppo_stable_baselines_" +
            self.expt_name)

        # del model

        print("Closing environment")
        self.env.close()
Example #7
        def _thunk():
            env = make_mario(env_id)
            env.seed(seed + rank)

            if cut_map:
                env = CutMarioMap(env)

            env = Monitor(env,
                          logger.get_dir()
                          and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets)

            # FIXME do if wrap deepmind, create other methods
            return wrap_deepmind_custom(
                env, **wrapper_kwargs)  # converts to 84*84 bw, keep for now
Example #8
def train(env_name, num_time_steps, eval_ep, eval_freq, ckpt_freq, load_model=None):
    env = gym.make(env_name)
    env_ = gym.make(env_name)
    rank = MPI.COMM_WORLD.Get_rank()
    today = date.today()
    today = str(today).replace('-','_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_PPO1_' + today + current_time
    Path('./run/'+model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)
    if rank == 0:
        env = Monitor(env, filename=path)

    ############################
    #         callback         #
    ############################   
    callbacklist = []
    eval_callback = EvalCallback_wandb(env_, n_eval_episodes=eval_ep, eval_freq=eval_freq, log_path=path)
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq, save_path='./run/' + model_name + '/ckpt', name_prefix='')
    callbacklist.append(eval_callback)
    callbacklist.append(ckpt_callback)
    callback = CallbackList(callbacklist)

    if load_model:
        model = PPO1.load(env=env, load_path=load_model)
    else:
        model = PPO1(MlpPolicy, env, verbose=1, gamma=0.995, clip_param=0.2,
                     entcoeff=1.0, lam=0.95, optim_epochs=20,
                     optim_batchsize=32768, timesteps_per_actorbatch=320000)

    ############################
    #          Logging         #
    ############################
    if rank == 0:
        logger.configure()
        config = {}
        config['load'] = [{'load_model': load_model}]
        config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
        config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
        with open('./run/' + model_name + '/' + model_name + '.txt', 'w+') as outfile:
            json.dump(config, outfile, indent=4)
    else:
        logger.configure(format_strs=[])
    ############################
    #            run           #
    ############################
   
    model.learn(total_timesteps=int(num_time_steps), callback=callback)
    model.save(path+'/'+model_name)
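
Each Monitor above leaves a *.monitor.csv file behind; a small sketch of reading those episode statistics back with the results_plotter helpers shipped with stable-baselines 2.x (the run directory is a placeholder):

import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

df = load_results("./run/my_model")  # placeholder; scans the directory for *.monitor.csv files
x, y = ts2xy(df, "timesteps")        # cumulative timesteps vs. per-episode reward
print("episodes:", len(y), "mean reward over last 100:", np.mean(y[-100:]))
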
Example #9
def train_ppo():

    env = Manipulator2D()
    env = Monitor(env, log_dir)
    # Custom MLP policy of two layers of size 32 each with tanh activation function
    #policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[32, 32])

    # Create the agent
    # env = SubprocVecEnv([make_env( i) for i in range(8)])
    # env = VecMonitor(env, log_dir)
    #model = PPO2(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs,)
    model = PPO2(MlpPolicy, env, verbose=1, nminibatches=32, noptepochs=10, ent_coef=0.0)
    # Train the agent
    model.learn(total_timesteps=20000000, callback=callback)
    # Save the agent
    model.save("ppo2-mani14")
Example #10
def get_arena_envs(use_monitor=True, log_dir=None):
    # num_envs should be set in the file settings.st in the arena2d-sim folder
    num_envs = rospy.get_param(NS_SETTING + "num_envs")
    if log_dir is None:
        logs_file_names = [None] * num_envs
    else:
        logs_file_names = [
            os.path.join(log_dir, f"arena_env_{i}") for i in range(num_envs)
        ]
    if use_monitor:
        return SubprocVecEnv([
            lambda i=i: Monitor(Arena2dEnvWrapper(i), logs_file_names[i])
            for i in range(num_envs)
        ])
    return SubprocVecEnv(
        [lambda i=i: Arena2dEnvWrapper(i) for i in range(num_envs)])
Example #11
def create_env(args, idx):
    """
    Create and return an environment according to args (parsed arguments).
    idx specifies idx of this environment among parallel environments.
    """
    monitor_file = os.path.join(args.output, ("env_%d" % idx))

    # Check for Atari envs
    if "NoFrameskip" in args.env:
        env = make_atari(args.env)
        env = wrap_deepmind(env, frame_stack=True)
    else:
        env = gym.make(args.env)
    env = Monitor(env, monitor_file)

    return env
Example #12
    def load_model(self,
                   symbol='JPM',
                   sd=dt.datetime(2009, 1, 1),
                   ed=dt.datetime(2010, 12, 31),
                   loadpath=None):
        # load data and indicators
        df = self._load_data([symbol], sd, ed)
        df_met = self._get_indicators(symbol, df)
        print(f'min: {df_met.min()} max: {df_met.max()}')

        # set environment
        self.env = Monitor(LoanEnv(df_met),
                           self.log_dir,
                           allow_early_resets=True)

        # load model
        self.model = DQN.load(loadpath, env=self.env)
Example #13
def make_env():
    env = gym_super_mario_bros.make('SuperMarioBros-v3')
    env = JoypadSpace(env, RIGHT_ONLY)
    env = CustomRewardAndDoneEnv(env)  # custom reward and episode-termination handling
    env = StochasticFrameSkip(env, n=4, stickprob=0.25)  # sticky frame skip
    env = Downsample(env, 2)  # downsampling
    env = FrameStack(env, 4)  # frame stacking
    env = ScaledFloatFrame(env)  # normalize observations
    env = Monitor(env, log_dir, allow_early_resets=True)
    env.seed(0)  # set the seed
    set_global_seeds(0)
    env = DummyVecEnv([lambda: env])  # create the vectorized environment

    print('Action space: ', env.action_space)
    print('Observation space: ', env.observation_space)

    return env
Example #14
def main():
    args = mujoco_arg_parser()

    # Create saving trained agent dir
    save_dir = "./trained_agent_dir/" + args.savedir + "/"
    os.makedirs(save_dir, exist_ok=True)

    # Create tensorboard log dir
    tensorboard_log_dir = "./tensorboard_log/"
    os.makedirs(tensorboard_log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(config['env'])
    # env = ChangeJointRangeEnv(env)
    env = Monitor(
        env, log_dir, allow_early_resets=True
    )  # Monitor writes monitor.csv to the log folder; each finished episode adds a row with its reward, episode length, and elapsed time
    env = DummyVecEnv([
        lambda: env
    ])  # Simple vectorized wrapper for multiple environments; steps each env in turn in the current Python process

    # Create the model
    model = PPO2(MlpPolicy,
                 env,
                 verbose=1,
                 tensorboard_log=tensorboard_log_dir,
                 n_steps=config['n_steps'],
                 nminibatches=config['nminibatches'],
                 noptepochs=config['noptepochs'],
                 learning_rate=config['learning_rate'],
                 seed=args.seed)

    model.learn(total_timesteps=config['total_timestep'],
                callback=callback,
                tb_log_name=args.savedir)

    # Save the agent
    model.save(save_dir + "trainedAnt" + "-seed" + str(args.seed))

    # CSV output
    csvdir = "./output/csv"
    os.makedirs(csvdir, exist_ok=True)
    R = np.array(rewardlist)
    np.savetxt(csvdir + '/' + args.savedir + '-' + str(args.seed) + '.csv',
               R,
               delimiter=',')
Example #15
def main():
    """
    Runs the test
    """
    """
    Create an argparse.ArgumentParser for run_mujoco.py.

    :return:  (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False}

    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--play', default=False, action='store_true')
    return parse
    """
    env_id = 'UR5Gripper-v0'
    model_path = '/tmp/gym/trpo_mpi/'
    # args = mujoco_arg_parser().parse_args()
    # train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
    # train(env_id=env_id, num_timesteps=int(1e7), seed=0, model_path=model_path)
    env = gym.make(env_id)
    env = Monitor(env, model_path, allow_early_resets=True)
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=model_path)
    model = model.load(model_path + "trpo.pkl")
    model.learn(total_timesteps=int(1e5), callback=callback)
    model.save(model_path + "trpo.pkl")
    # tf_util.save_state(model_path)

    # Enjoy trained agent
    obs = env.reset()
    for i in range(100):
        obs = env.reset()
        env.render()
        for i in range(200):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
Example #16
def main():

    # Save argument values to yaml file
    args_file_path = os.path.join(args.log_dir, 'args.yaml')
    with open(args_file_path, 'w') as f:
        yaml.dump(vars(args), f, default_flow_style=False)

    # Create and wrap the environment
    env = gym.make(args.env)
    env = Monitor(env, args.log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Add some param noise for exploration
    if args.model == 'DDPG':
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                             desired_action_stddev=0.2)
        model = MODEL_CLASS(MlpPolicy,
                            env,
                            param_noise=param_noise,
                            memory_limit=int(1e6),
                            verbose=0)
    elif args.model == 'SAC':
        # TODO: This doesn't work
        model = MODEL_CLASS(MlpPolicy,
                            env,
                            verbose=1,
                            policy_kwargs={
                                'n_env': 1,
                                'n_steps': 64,
                                'n_batch': 64
                            })
    else:
        model = MODEL_CLASS(MlpPolicy, env, verbose=0)

    # Train the agent
    model.learn(total_timesteps=args.n_steps, callback=callback)

    # Save the final model
    if args.save_model:
        model_file_path = os.path.join(args.log_dir, 'model.pkl')
        model.save(model_file_path)
        print("Best and final models saved in ", os.path.abspath(args.log_dir))

    if args.plots:
        raise NotImplementedError
Example #17
def make_env(env_id,
             rank,
             log_dir=None,
             allow_early_resets=True,
             flatten_dict=False,
             kwargs=None):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param rank: (int) the rank of the environment (used to name the monitor file)
    :param log_dir: (str) directory for Monitor output, or None to disable logging
    :param allow_early_resets: (bool) allows early reset of the environment
    :param flatten_dict: (bool) flatten the dict observation into a single array
    :param kwargs: (dict) keyword arguments forwarded to the registered environment
    :return: (Gym Environment) The mujoco environment
    """
    if env_id in ENTRY_POINT:
        kwargs = {} if kwargs is None else kwargs.copy()
        max_episode_steps = None
        if 'max_episode_steps' in kwargs:
            max_episode_steps = kwargs['max_episode_steps']
            del kwargs['max_episode_steps']
        gym.register(env_id,
                     entry_point=ENTRY_POINT[env_id],
                     max_episode_steps=max_episode_steps,
                     kwargs=kwargs)
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    if flatten_dict:
        env = FlattenDictWrapper(
            env, ['observation', 'achieved_goal', 'desired_goal'])
    if 'FetchStack' in env_id and (
            'Unlimit' not in env_id) and max_episode_steps is None:
        from utils.wrapper import FlexibleTimeLimitWrapper
        env = FlexibleTimeLimitWrapper(env, 100)
    if kwargs.get('reward_type', 'sparse') != 'sparse':
        env = DoneOnSuccessWrapper(env, 0.0)
    else:
        env = DoneOnSuccessWrapper(env)
    if log_dir is not None:
        env = Monitor(env,
                      os.path.join(log_dir,
                                   str(rank) + ".monitor.csv"),
                      allow_early_resets=allow_early_resets,
                      info_keywords=('is_success', ))
    return env
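
The info_keywords=('is_success',) argument asks Monitor to copy that key from the info dict at the end of every episode into the CSV, next to the standard r (return), l (length) and t (time) columns. A short sketch of inspecting such a file afterwards, assuming pandas and a placeholder path:

import pandas as pd

# Placeholder path; monitor files begin with a one-line '#...' JSON header, hence skiprows=1
df = pd.read_csv("/tmp/logs/0.monitor.csv", skiprows=1)
print(df[["r", "l", "is_success"]].tail())
print("success rate:", df["is_success"].mean())
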
Example #18
def main():

    # create Environment
    env = iCubPushGymEnv(urdfRoot=robot_data.getDataPath(),
                         renders=False,
                         useIK=1,
                         isDiscrete=0,
                         rnd_obj_pose=0,
                         maxSteps=2000,
                         reward_type=0)

    # set seed
    seed = 1
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    # set log
    monitor_dir = os.path.join(log_dir, 'log')
    os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, monitor_dir + '/', allow_early_resets=True)

    # create agent model
    nb_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                     sigma=float(0.5373) * np.ones(nb_actions))

    model = DDPG('LnMlpPolicy',
                 env,
                 action_noise=action_noise,
                 gamma=0.99,
                 batch_size=16,
                 normalize_observations=True,
                 normalize_returns=False,
                 memory_limit=100000,
                 verbose=1,
                 tensorboard_log=os.path.join(log_dir, 'tb'),
                 full_tensorboard_log=False)

    #start learning
    model.learn(total_timesteps=500000, seed=seed, callback=callback)

    # save model
    print("Saving model.pkl to ", log_dir)
    model.save(log_dir + "/final_model.pkl")
Example #19
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
  """
  Create a wrapped, monitored gym.Env for MuJoCo.

  :param env_id: (str) the environment ID
  :param seed: (int) the initial seed for RNG
  :param rank: (int) the rank of the environment (for logging)
  :param allow_early_resets: (bool) allows early reset of the environment
  :return: (Gym Environment) The robotic environment
  """
  set_global_seeds(seed)
  env = gym.make(env_id)
  env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
  env = Monitor(
    env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
    info_keywords=('is_success',), allow_early_resets=allow_early_resets)
  env.seed(seed)
  return env
Example #20
 def _solve_domain(self, domain_factory: Callable[[], D]) -> None:
     # TODO: improve code for parallelism
     #  (https://stable-baselines.readthedocs.io/en/master/guide/examples.html
     #  #multiprocessing-unleashing-the-power-of-vectorized-environments)?
     if not hasattr(
             self,
             '_algo'):  # reuse algo if possible (enables further learning)
         domain = domain_factory()
         env = Monitor(AsGymEnv(domain),
                       filename=None,
                       allow_early_resets=True)
         env = DummyVecEnv([
             lambda: env
         ])  # the algorithms require a vectorized environment to run
         self._algo = self._algo_class(self._baselines_policy, env,
                                       **self._algo_kwargs)
         self._init_algo(domain)
     self._algo.learn(**self._learn_config)
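
With filename=None, as above, Monitor writes no CSV but still tracks per-episode statistics: they stay available on the wrapper itself and in the final info dict of each episode. A minimal sketch on a plain gym env (the env ID is a placeholder):

import gym
from stable_baselines.bench import Monitor

env = Monitor(gym.make("CartPole-v1"), filename=None, allow_early_resets=True)  # placeholder env ID
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
# At episode end, Monitor injects an 'episode' dict with return (r), length (l) and time (t)
print(info["episode"])
print(env.get_episode_rewards(), env.get_episode_lengths())
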
Example #21
def start_unity_baselines():
    # Set to FALSE for CIP-Pool execution
    # env = make_unity_env('./envs/worm_dynamic_one_agent/linux/worm_dynamic', 1, False)
    # InitialTrainingExample.start_training(env)
    # env.close()

    unity_env = UnityEnvironment(
        './envs/worm_dynamic_one_agent/linux/worm_dynamic', no_graphics=True)
    env = UnityToGymWrapper(unity_env, uint8_visual=False)
    env = Monitor(env, 'results/')
    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = TD3_Baselines(MlpPolicy, env, action_noise=action_noise, verbose=1)
    model.learn(total_timesteps=int(2e6), log_interval=10)
    model.save("td3_worm")
Example #22
def train_SAC(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = gym.make(env)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Delete keys so the dict can be passed to the model constructor
    # policy = kwargs['policy']
    policy = 'MlpPolicy'
    # n_timesteps = kwargs['n_timesteps']
    n_timesteps = int(1e6)
    noise_type = None
    # Add some param noise for exploration
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                         desired_action_stddev=0.1)

    continue_model = False
    if continue_model is True:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,  # action_noise=param_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
Example #23
    def _init():
        # set_global_seeds(seed + rank)
        env = gym.make(env_id, **env_kwargs)
        env.seed(seed + rank)
        env.action_space.seed(seed + rank)

        if log_dir and evaluation:
            env = ParticleInformationWrapper(env,
                                             path=os.path.join(
                                                 log_dir, str(rank)))

        if wrappers:
            for wrapper in wrappers:
                env = wrapper[0](env=env, **wrapper[1])

        if log_dir:
            env = Monitor(env, filename=None, allow_early_resets=True
                          )  # filename=os.path.join(log_dir, str(rank))
        return env
Example #24
        def _init():
            if isinstance(env_id, str):
                # env = retro.make(env_id, state, scenario=scenario)
                if record:
                    env = make_retro(game=env_id,
                                     state=initial_state,
                                     scenario=scenario,
                                     max_episode_steps=max_episode_steps,
                                     record=record_path)
                else:
                    env = make_retro(game=env_id,
                                     state=initial_state,
                                     scenario=scenario,
                                     max_episode_steps=max_episode_steps)

                if len(env_kwargs) > 0:
                    warnings.warn(
                        "No environment class was passed (only an env ID) so `env_kwargs` will be ignored"
                    )
            else:
                env = env_id(**env_kwargs)
            if seed is not None:
                env.seed(seed + rank)
                env.action_space.seed(seed + rank)
            # Wrap the env in a Monitor wrapper
            # to have additional training information
            monitor_path = os.path.join(
                monitor_dir, str(rank)) if monitor_dir is not None else None
            # Create the monitor folder if needed
            if monitor_path is not None:
                os.makedirs(monitor_dir, exist_ok=True)
            env = Monitor(env, filename=monitor_path)

            # If multiple states are provided, wrap with RandomStateReset
            if isinstance(state, list):
                env = RandomStateReset(env, state, seed=seed)

            # Optionally, wrap the environment with the provided wrapper
            if wrapper_class is not None:
                env = wrapper_class(env)
            return env
Example #25
def main(load_policy=False):

    global log_dir, log_dir_policy
    if load_policy:
        log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    fixed = True
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1500000
    discreteAction = 0
    rend = False
    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
            isDiscrete=discreteAction, action_space = action_space,
            fixedPositionObj = fixed, includeVelObs = True)


    env = Monitor(env, log_dir, allow_early_resets=True)
    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomPolicy, env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy,
                verbose=1,tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND", buffer_size=1000000,batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if (load_policy):
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl", env=env, n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
        buffer_size=1000000,batch_size=256,random_exploration=0.3, action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps, callback = callback )
    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
Example #26
 def _init():
     if isinstance(env_id, str):
         env = gym.make(env_id)
         if len(env_kwargs) > 0:
             warnings.warn("No environment class was passed (only an env ID) so `env_kwargs` will be ignored")
     else:
         env = env_id(**env_kwargs)
     if seed is not None:
         env.seed(seed + rank)
         env.action_space.seed(seed + rank)
     # Wrap the env in a Monitor wrapper
     # to have additional training information
     monitor_path = os.path.join(monitor_dir, str(rank)) if monitor_dir is not None else None
     # Create the monitor folder if needed
     if monitor_path is not None:
         os.makedirs(monitor_dir, exist_ok=True)
     env = Monitor(env, filename=monitor_path)
     # Optionally, wrap the environment with the provided wrapper
     if wrapper_class is not None:
         env = wrapper_class(env)
     return env
Example #27
        def _init():
            env = pacman_env

            if seed is not None:
                env.seed(seed + rank)
                env.action_space.seed(seed + rank)
            # Wrap the env in a Monitor wrapper
            # to have additional training information
            monitor_path = os.path.join(
                monitor_dir, str(rank)) if monitor_dir is not None else None
            # Create the monitor folder if needed
            if monitor_path is not None:
                os.makedirs(monitor_dir, exist_ok=True)
            env = Monitor(env,
                          filename=monitor_path,
                          info_keywords=('score', 'ghosts', 'level', 'win',
                                         'd', 'map'))
            # Optionally, wrap the environment with the provided wrapper
            if wrapper_class is not None:
                env = wrapper_class(env)
            return env
Example #28
 def _init():
     if env_id == "WarehouseEnv":
         #             if map_file is "None" or map_file is None:
         simple_agent = np.zeros((11, 11))
         simple_agent[5, 5] = 1
         #                      [[ 0, 1,  0,  0,  0,  0,  2, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  3,  0,  0, 0, 0]]
         #             simple_agent = \
         #                      [[ 0, 1,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0]]
         simple_world = np.zeros((11, 11))
         #                      [[ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  1,  0,  0, 0, 0],
         #                       [ 0, 1,  0,  0,  0,  1,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  1,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0],
         #                       [ 0, 0,  0,  0,  0,  0,  0, 0, 0]]
         env = WarehouseEnv(agent_map=simple_agent,
                            obstacle_map=simple_world,
                            render_as_observation=render_as_observation,
                            exponential_agent_training_curve=
                            exponential_agent_training_curve)
     else:
         env = gym.make(env_id, level=env_level)
     if frame_stack:
         env = FrameStack(env, 4)
     if useMonitor:
         env = Monitor(env, log_dir + str(rank), allow_early_resets=True)
     return env
Example #29
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    fixed = True
    # 0: completely fixed, 1: slightly random radius, 2: large random radius
    object_position = 1
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=1,
            isDiscrete=discreteAction, action_space = action_space,
            fixedPositionObj = fixed, includeVelObs = True, object_position=object_position)

    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy,
                verbose=1,tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK", buffer_size=1000000,batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if (load_policy):
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl", env=env, n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
        buffer_size=1000000,batch_size=256,random_exploration=0.3, action_noise=action_noise)

    model.learn(timesteps,log_interval=100, callback = callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
Example #30
    def _init():
        set_global_seeds(seed)
        # env = DonkeyVAEEnv(level=level, frame_skip=frame_skip, vae=vae, const_throttle=None, min_throttle=MIN_THROTTLE,
        #                    max_throttle=MAX_THROTTLE, max_cte_error=MAX_CTE_ERROR, n_command_history=N_COMMAND_HISTORY,
        #                    n_stack=n_stack, seed=seed)

        measurements_to_include = set(["steer", "throttle"])
        encode_state_fn = common_carla.create_encode_state_fn(
            vae, measurements_to_include)  # vae encode
        reward_fn = common_carla.reward_fn
        env = CarlaEnv(obs_res=obs_res,
                       action_smoothing=0,
                       encode_state_fn=encode_state_fn,
                       reward_fn=reward_fn,
                       synchronous=True,
                       fps=FPS,
                       host=HOST)  # wyb '10.38.164.121' '127.0.0.1'
        env.seed(0)  # wyb

        if not teleop:
            env = Monitor(env, log_dir, allow_early_resets=True)
        return env
Example #31
    def _get():
        locationX = -playerNumber * 1.5
        locationY = -6 + playerNumber * 1.5
        set_global_seeds(seed + rank)

        env = TrainKick(rank,
                        IP,
                        portj,
                        mportj,
                        teamname,
                        playerNumber,
                        locationX,
                        locationY,
                        sleepTime,
                        max_episode_steps=500,
                        trainType=trainType)

        env.seed(seed + rank)
        logdir = os.path.join(log_dir, str(rank))
        env = Monitor(env, str(logdir), allow_early_resets=True)

        return env