def create_env(env_id, output_path, seed=0):
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(output_path, str(rank)), allow_early_resets=True)
    env.seed(seed)
    return env
def _init():
    set_global_seeds(seed + rank)
    env = gym.make(env_id, **env_kwargs)
    # Dict observation space is currently not supported.
    # https://github.com/hill-a/stable-baselines/issues/321
    # We allow a Gym env wrapper (a subclass of gym.Wrapper)
    if wrapper_class:
        env = wrapper_class(env)
    env.seed(seed + rank)
    log_file = os.path.join(log_dir, str(rank)) if log_dir is not None else None
    env = Monitor(env, log_file)
    return env
def _init():
    set_global_seeds(seed + rank)
    env = gym.make(env_id)
    if len(env_params) > 0:
        env = modify_env_params(env, params_path, **env_params)
    elif len(params_ranges) > 0:
        env = RandomUniformEnvParams(env, params_path, params_ranges, rank=rank)
    env.seed(seed + rank)
    env = Monitor(env, os.path.join(log_dir, str(rank)), allow_early_resets=True)
    return env
def _init():
    set_global_seeds(seed + rank)
    env = gym.make(env_id)
    # Dict observation space is currently not supported.
    # https://github.com/hill-a/stable-baselines/issues/321
    # We allow a Gym env wrapper (a subclass of gym.Wrapper)
    if wrapper_class:
        env = wrapper_class(env)
    env.seed(seed + rank)
    env = Monitor(env, os.path.join(log_dir, str(rank)), allow_early_resets=True)
    return env
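Closures like the `_init` functions above are usually produced by an outer factory and handed to a vectorized environment. Below is a minimal, hedged sketch of that pattern for stable-baselines v2; the names `make_env` and `num_envs` are illustrative, not taken from the snippets.

# A minimal sketch of how `_init`-style closures are typically consumed.
# Assumes stable-baselines v2; `make_env` and `num_envs` are illustrative names.
import os

import gym
from stable_baselines.bench import Monitor
from stable_baselines.common import set_global_seeds
from stable_baselines.common.vec_env import SubprocVecEnv


def make_env(env_id, rank, seed=0, log_dir=None):
    def _init():
        set_global_seeds(seed + rank)
        env = gym.make(env_id)
        env.seed(seed + rank)
        if log_dir is not None:
            # one monitor file per worker, keyed by rank
            env = Monitor(env, os.path.join(log_dir, str(rank)), allow_early_resets=True)
        return env
    return _init


if __name__ == '__main__':
    num_envs = 4
    vec_env = SubprocVecEnv([make_env('CartPole-v1', rank=i) for i in range(num_envs)])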
def _init():
    set_global_seeds(seed)
    env = DonkeyVAEEnv(level=LEVEL, frame_skip=frame_skip, vae=vae, const_throttle=None,
                       min_throttle=MIN_THROTTLE, max_throttle=MAX_THROTTLE,
                       max_cte_error=MAX_CTE_ERROR, n_command_history=N_COMMAND_HISTORY,
                       n_stack=n_stack)
    env.seed(seed)
    if not teleop:
        env = Monitor(env, log_dir, allow_early_resets=True)
    return env
class SbPpo2:
    """stable baselines PPO2"""

    def __init__(self, expt_name):
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/monitor_logs/' + expt_name
        # env = gym.make('LunarLanderContinuous-v2')
        env = gym.make('DeeplengDocking-v2')
        self.expt_name = expt_name
        self.env = Monitor(env, outdir)

    def __call__(self, *args, **kwargs):
        # eval_callback = EvalCallback(env, best_model_save_path=eval_dir,
        #                              log_path=eval_dir, eval_freq=500,
        #                              deterministic=True, render=False)
        policy_kwargs = dict(layers=[400, 300, 200, 100])
        model = PPO2(MlpPolicy, self.env, policy_kwargs=policy_kwargs, verbose=1,
                     tensorboard_log="/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log")
        model.learn(total_timesteps=int(1e5), log_interval=50,
                    tb_log_name="ppo_Docker_" + self.expt_name)
        model.save("/home/dfki.uni-bremen.de/mpatil/Documents/ppo_stable_baselines_"
                   + self.expt_name)
        # del model
        print("Closing environment")
        self.env.close()
def _thunk():
    env = make_mario(env_id)
    env.seed(seed + rank)
    if cut_map:
        env = CutMarioMap(env)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=allow_early_resets)
    # FIXME: make the DeepMind wrapping conditional; create other wrap methods
    return wrap_deepmind_custom(env, **wrapper_kwargs)  # converts to 84x84 grayscale; keep for now
def train(env_name, num_time_steps, eval_ep, eval_freq, ckpt_freq, load_model=None):
    env = gym.make(env_name)
    env_ = gym.make(env_name)
    rank = MPI.COMM_WORLD.Get_rank()

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_PPO1_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)

    if rank == 0:
        env = Monitor(env, filename=path)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    eval_callback = EvalCallback_wandb(env_, n_eval_episodes=eval_ep,
                                       eval_freq=eval_freq, log_path=path)
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name + '/ckpt',
                                       name_prefix='')
    callbacklist.append(eval_callback)
    callbacklist.append(ckpt_callback)
    callback = CallbackList(callbacklist)

    if load_model:
        model = PPO1.load(env=env, load_path=load_model)
    else:
        model = PPO1(MlpPolicy, env, verbose=1, gamma=0.995, clip_param=0.2,
                     entcoeff=1.0, lam=0.95, optim_epochs=20, optim_batchsize=32768,
                     timesteps_per_actorbatch=320000)

    ############################
    #         Logging          #
    ############################
    if rank == 0:
        logger.configure()
        config = {}
        config['load'] = [{'load_model': load_model}]
        config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
        config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
        with open('./run/' + model_name + '/' + model_name + '.txt', 'w+') as outfile:
            json.dump(config, outfile, indent=4)
    else:
        logger.configure(format_strs=[])

    ############################
    #           run            #
    ############################
    model.learn(total_timesteps=int(num_time_steps), callback=callback)
    model.save(path + '/' + model_name)
def train_ppo():
    env = Manipulator2D()
    env = Monitor(env, log_dir)
    # Custom MLP policy of two layers of size 32 each with tanh activation function
    # policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[32, 32])
    # Create the agent
    # env = SubprocVecEnv([make_env(i) for i in range(8)])
    # env = VecMonitor(env, log_dir)
    # model = PPO2(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs)
    model = PPO2(MlpPolicy, env, verbose=1, nminibatches=32, noptepochs=10, ent_coef=0.0)
    # Train the agent
    model.learn(total_timesteps=20000000, callback=callback)
    # Save the agent
    model.save("ppo2-mani14")
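Several of these trainers pass a `callback` that is defined elsewhere in their module. A hedged sketch of the usual stable-baselines v2 pattern follows (best-model saving driven by the Monitor logs); `log_dir` is assumed to be the same directory the Monitor wrapper writes to.

# A minimal sketch of the `callback` these trainers pass to `model.learn`,
# following the stable-baselines v2 docs pattern; `log_dir` is an assumption.
import os

import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)
best_mean_reward, n_steps = -np.inf, 0


def callback(_locals, _globals):
    """Called by the model during training; saves the best model by mean reward."""
    global n_steps, best_mean_reward
    # Evaluate every 1000 calls
    if (n_steps + 1) % 1000 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # _locals['self'] is the model being trained
                _locals['self'].save(os.path.join(log_dir, 'best_model.pkl'))
    n_steps += 1
    return True  # returning False would stop training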
def get_arena_envs(use_monitor=True, log_dir=None):
    # num_envs should be set in the file settings.st under the arena2d-sim folder
    num_envs = rospy.get_param(NS_SETTING + "num_envs")
    if log_dir is None:
        logs_file_names = [None] * num_envs
    else:
        logs_file_names = [os.path.join(log_dir, f"arena_env_{i}") for i in range(num_envs)]
    if use_monitor:
        return SubprocVecEnv([lambda i=i: Monitor(Arena2dEnvWrapper(i), logs_file_names[i])
                              for i in range(num_envs)])
    return SubprocVecEnv([lambda i=i: Arena2dEnvWrapper(i) for i in range(num_envs)])
def create_env(args, idx):
    """
    Create and return an environment according to args (parsed arguments).
    idx specifies the index of this environment among the parallel environments.
    """
    monitor_file = os.path.join(args.output, ("env_%d" % idx))
    # Check for Atari envs
    if "NoFrameskip" in args.env:
        env = make_atari(args.env)
        env = wrap_deepmind(env, frame_stack=True)
    else:
        env = gym.make(args.env)
    env = Monitor(env, monitor_file)
    return env
def load_model(self, symbol='JPM', sd=dt.datetime(2009, 1, 1),
               ed=dt.datetime(2010, 12, 31), loadpath=None):
    # load data and indicators
    df = self._load_data([symbol], sd, ed)
    df_met = self._get_indicators(symbol, df)
    print(f'min: {df_met.min()} max: {df_met.max()}')
    # set environment
    self.env = Monitor(LoanEnv(df_met), self.log_dir, allow_early_resets=True)
    # load model
    self.model = DQN.load(loadpath, env=self.env)
def make_env():
    env = gym_super_mario_bros.make('SuperMarioBros-v3')
    env = JoypadSpace(env, RIGHT_ONLY)
    env = CustomRewardAndDoneEnv(env)                    # modify reward and episode-done conditions
    env = StochasticFrameSkip(env, n=4, stickprob=0.25)  # sticky frame skip
    env = Downsample(env, 2)                             # downsampling
    env = FrameStack(env, 4)                             # frame stacking
    env = ScaledFloatFrame(env)                          # normalize observations
    env = Monitor(env, log_dir, allow_early_resets=True)
    env.seed(0)                                          # set the seed
    set_global_seeds(0)
    env = DummyVecEnv([lambda: env])                     # create the vectorized environment
    print('Action space: ', env.action_space)
    print('Observation space: ', env.observation_space)
    return env
def main():
    args = mujoco_arg_parser()

    # Create the directory for saving the trained agent
    save_dir = "./trained_agent_dir/" + args.savedir + "/"
    os.makedirs(save_dir, exist_ok=True)

    # Create tensorboard log dir
    tensorboard_log_dir = "./tensorboard_log/"
    os.makedirs(tensorboard_log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(config['env'])
    # env = ChangeJointRangeEnv(env)
    # Monitor writes monitor.csv to the log folder, a CSV with three columns:
    # ep_reward_mean (mean reward), ep_len_mean (mean episode length), timestamp (elapsed time)
    env = Monitor(env, log_dir, allow_early_resets=True)
    # Create a simple vectorized wrapper for multiple environments,
    # calling each environment in sequence in the current Python process
    env = DummyVecEnv([lambda: env])

    # Create the model
    model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log_dir,
                 n_steps=config['n_steps'], nminibatches=config['nminibatches'],
                 noptepochs=config['noptepochs'], learning_rate=config['learning_rate'],
                 seed=args.seed)
    model.learn(total_timesteps=config['total_timestep'], callback=callback,
                tb_log_name=args.savedir)

    # Save the agent
    model.save(save_dir + "trainedAnt" + "-seed" + str(args.seed))

    # CSV output
    csvdir = "./output/csv"
    os.makedirs(csvdir, exist_ok=True)
    R = np.array(rewardlist)
    np.savetxt(csvdir + '/' + args.savedir + '-' + str(args.seed) + '.csv', R, delimiter=',')
def main(): """ Runs the test """ """ Create an argparse.ArgumentParser for run_mujoco.py. :return: (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False} parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--num-timesteps', type=int, default=int(1e6)) parser.add_argument('--play', default=False, action='store_true') return parse """ env_id = 'UR5Gripper-v0' model_path = '/tmp/gym/trpo_mpi/' # args = mujoco_arg_parser().parse_args() # train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) # train(env_id=env_id, num_timesteps=int(1e7), seed=0, model_path=model_path) env = gym.make(env_id) env = Monitor(env, model_path, allow_early_resets=True) model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=model_path) model = model.load(model_path + "trpo.pkl") model.learn(total_timesteps=int(1e5), callback=callback) model.save(model_path + "trpo.pkl") # tf_util.save_state(model_path) # Enjoy trained agent obs = env.reset() for i in range(100): obs = env.reset() env.render() for i in range(200): action, _states = model.predict(obs) obs, rewards, dones, info = env.step(action) env.render()
def main():
    # Save argument values to yaml file
    args_file_path = os.path.join(args.log_dir, 'args.yaml')
    with open(args_file_path, 'w') as f:
        yaml.dump(vars(args), f, default_flow_style=False)

    # Create and wrap the environment
    env = gym.make(args.env)
    env = Monitor(env, args.log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    if args.model == 'DDPG':
        # Add some param noise for exploration
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)
        model = MODEL_CLASS(MlpPolicy, env, param_noise=param_noise,
                            memory_limit=int(1e6), verbose=0)
    elif args.model == 'SAC':
        # TODO: This doesn't work
        model = MODEL_CLASS(MlpPolicy, env, verbose=1,
                            policy_kwargs={'n_env': 1, 'n_steps': 64, 'n_batch': 64})
    else:
        model = MODEL_CLASS(MlpPolicy, env, verbose=0)

    # Train the agent
    model.learn(total_timesteps=args.n_steps, callback=callback)

    # Save the final model
    if args.save_model:
        model_file_path = os.path.join(args.log_dir, 'model.pkl')
        model.save(model_file_path)
    print("Best and final models saved in ", os.path.abspath(args.log_dir))

    if args.plots:
        raise NotImplementedError
def make_env(env_id, rank, log_dir=None, allow_early_resets=True,
             flatten_dict=False, kwargs=None):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param rank: (int) the rank of the environment (for logging)
    :param log_dir: (str) directory for the Monitor log files
    :param allow_early_resets: (bool) allows early reset of the environment
    :param flatten_dict: (bool) flatten the dict observation space
    :param kwargs: (dict) extra keyword arguments for environment registration
    :return: (Gym Environment) The mujoco environment
    """
    if env_id in ENTRY_POINT.keys():
        kwargs = {} if kwargs is None else kwargs.copy()
        max_episode_steps = None
        if 'max_episode_steps' in kwargs:
            max_episode_steps = kwargs['max_episode_steps']
            del kwargs['max_episode_steps']
        gym.register(env_id, entry_point=ENTRY_POINT[env_id],
                     max_episode_steps=max_episode_steps, kwargs=kwargs)
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    if flatten_dict:
        env = FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    if 'FetchStack' in env_id and ('Unlimit' not in env_id) and max_episode_steps is None:
        from utils.wrapper import FlexibleTimeLimitWrapper
        env = FlexibleTimeLimitWrapper(env, 100)
    # use .get to avoid a KeyError when reward_type is not supplied
    if kwargs.get('reward_type', 'sparse') != 'sparse':
        env = DoneOnSuccessWrapper(env, 0.0)
    else:
        env = DoneOnSuccessWrapper(env)
    if log_dir is not None:
        env = Monitor(env, os.path.join(log_dir, str(rank) + ".monitor.csv"),
                      allow_early_resets=allow_early_resets,
                      info_keywords=('is_success',))
    return env
def main():
    # create environment
    env = iCubPushGymEnv(urdfRoot=robot_data.getDataPath(), renders=False, useIK=1,
                         isDiscrete=0, rnd_obj_pose=0, maxSteps=2000, reward_type=0)

    # set seed
    seed = 1
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    # set log
    monitor_dir = os.path.join(log_dir, 'log')
    os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, monitor_dir + '/', allow_early_resets=True)

    # create agent model
    nb_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                     sigma=float(0.5373) * np.ones(nb_actions))
    model = DDPG('LnMlpPolicy', env, action_noise=action_noise, gamma=0.99, batch_size=16,
                 normalize_observations=True, normalize_returns=False, memory_limit=100000,
                 verbose=1, tensorboard_log=os.path.join(log_dir, 'tb'),
                 full_tensorboard_log=False)

    # start learning
    model.learn(total_timesteps=500000, seed=seed, callback=callback)

    # save model
    print("Saving model.pkl to ", log_dir)
    model.save(log_dir + "/final_model.pkl")
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The robotic environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success',),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
def _solve_domain(self, domain_factory: Callable[[], D]) -> None:
    # TODO: improve code for parallelism
    # (https://stable-baselines.readthedocs.io/en/master/guide/examples.html
    # #multiprocessing-unleashing-the-power-of-vectorized-environments)?
    if not hasattr(self, '_algo'):  # reuse algo if possible (enables further learning)
        domain = domain_factory()
        env = Monitor(AsGymEnv(domain), filename=None, allow_early_resets=True)
        # the algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: env])
        self._algo = self._algo_class(self._baselines_policy, env, **self._algo_kwargs)
        self._init_algo(domain)
    self._algo.learn(**self._learn_config)
def start_unity_baselines():
    # Set to FALSE for CIP-Pool execution
    # env = make_unity_env('./envs/worm_dynamic_one_agent/linux/worm_dynamic', 1, False)
    # InitialTrainingExample.start_training(env)
    # env.close()
    unity_env = UnityEnvironment('./envs/worm_dynamic_one_agent/linux/worm_dynamic',
                                 no_graphics=True)
    env = UnityToGymWrapper(unity_env, uint8_visual=False)
    env = Monitor(env, 'results/')

    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

    model = TD3_Baselines(MlpPolicy, env, action_noise=action_noise, verbose=1)
    model.learn(total_timesteps=int(2e6), log_interval=10)
    model.save("td3_worm")
def train_SAC(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)

    env = gym.make(env)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Delete keys so the dict can be passed to the model constructor
    # policy = kwargs['policy']
    policy = 'MlpPolicy'
    # n_timesteps = kwargs['n_timesteps']
    n_timesteps = int(1e6)
    noise_type = None

    # Add some param noise for exploration
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)

    continue_model = False
    if continue_model is True:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'), env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'), verbose=1, **kwargs)
    else:
        model = SAC(policy, env,
                    # action_noise=param_noise,
                    verbose=1,
                    tensorboard_log=os.path.join(log_dir, 'tb'),
                    full_tensorboard_log=False,
                    **kwargs)

    model.learn(total_timesteps=n_timesteps, seed=seed, callback=callback, log_interval=10)
    return model
def _init():
    # set_global_seeds(seed + rank)
    env = gym.make(env_id, **env_kwargs)
    env.seed(seed + rank)
    env.action_space.seed(seed + rank)
    if log_dir and evaluation:
        env = ParticleInformationWrapper(env, path=os.path.join(log_dir, str(rank)))
    if wrappers:
        for wrapper in wrappers:
            env = wrapper[0](env=env, **wrapper[1])
    if log_dir:
        # filename=os.path.join(log_dir, str(rank))
        env = Monitor(env, filename=None, allow_early_resets=True)
    return env
def _init():
    if isinstance(env_id, str):
        # env = retro.make(env_id, state, scenario=scenario)
        if record:
            env = make_retro(game=env_id, state=initial_state, scenario=scenario,
                             max_episode_steps=max_episode_steps, record=record_path)
        else:
            env = make_retro(game=env_id, state=initial_state, scenario=scenario,
                             max_episode_steps=max_episode_steps)
        if len(env_kwargs) > 0:
            warnings.warn("No environment class was passed (only an env ID) "
                          "so `env_kwargs` will be ignored")
    else:
        env = env_id(**env_kwargs)

    if seed is not None:
        env.seed(seed + rank)
        env.action_space.seed(seed + rank)

    # Wrap the env in a Monitor wrapper
    # to have additional training information
    monitor_path = os.path.join(monitor_dir, str(rank)) if monitor_dir is not None else None
    # Create the monitor folder if needed
    if monitor_path is not None:
        os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, filename=monitor_path)

    # If multiple states are provided, wrap to reset among them at random
    if isinstance(state, list):
        env = RandomStateReset(env, state, seed=seed)

    # Optionally, wrap the environment with the provided wrapper
    if wrapper_class is not None:
        env = wrapper_class(env)
    return env
def main(load_policy=False):
    global log_dir, log_dir_policy
    if load_policy:
        log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    fixed = True
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1500000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
                                 isDiscrete=discreteAction, action_space=action_space,
                                 fixedPositionObj=fixed, includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomPolicy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
                         buffer_size=1000000, batch_size=256, random_exploration=0.3,
                         action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps, callback=callback)
    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
def _init():
    if isinstance(env_id, str):
        env = gym.make(env_id)
        if len(env_kwargs) > 0:
            warnings.warn("No environment class was passed (only an env ID) "
                          "so `env_kwargs` will be ignored")
    else:
        env = env_id(**env_kwargs)
    if seed is not None:
        env.seed(seed + rank)
        env.action_space.seed(seed + rank)
    # Wrap the env in a Monitor wrapper
    # to have additional training information
    monitor_path = os.path.join(monitor_dir, str(rank)) if monitor_dir is not None else None
    # Create the monitor folder if needed
    if monitor_path is not None:
        os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, filename=monitor_path)
    # Optionally, wrap the environment with the provided wrapper
    if wrapper_class is not None:
        env = wrapper_class(env)
    return env
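Once a Monitor file exists, the logged episodes can be read back with the stable-baselines results utilities. A small sketch, assuming `monitor_dir` is the same directory used in the `_init` above:

# A minimal sketch of consuming the monitor.csv files written by the Monitor
# wrapper; `monitor_dir` is an assumed placeholder path.
import matplotlib.pyplot as plt
from stable_baselines.results_plotter import load_results, ts2xy

monitor_dir = "/tmp/monitor/"
x, y = ts2xy(load_results(monitor_dir), 'timesteps')  # episode rewards vs. timesteps
plt.plot(x, y)
plt.xlabel('Timesteps')
plt.ylabel('Episode reward')
plt.show()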
def _init():
    env = pacman_env
    if seed is not None:
        env.seed(seed + rank)
        env.action_space.seed(seed + rank)
    # Wrap the env in a Monitor wrapper
    # to have additional training information
    monitor_path = os.path.join(monitor_dir, str(rank)) if monitor_dir is not None else None
    # Create the monitor folder if needed
    if monitor_path is not None:
        os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, filename=monitor_path,
                  info_keywords=('score', 'ghosts', 'level', 'win', 'd', 'map'))
    # Optionally, wrap the environment with the provided wrapper
    if wrapper_class is not None:
        env = wrapper_class(env)
    return env
def _init(): if env_id == "WarehouseEnv": # if map_file is "None" or map_file is None: simple_agent = np.zeros((11, 11)) simple_agent[5, 5] = 1 # [[ 0, 1, 0, 0, 0, 0, 2, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 3, 0, 0, 0, 0]] # simple_agent = \ # [[ 0, 1, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0]] simple_world = np.zeros((11, 11)) # [[ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 1, 0, 0, 0, 0], # [ 0, 1, 0, 0, 0, 1, 0, 0, 0], # [ 0, 0, 0, 0, 1, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0], # [ 0, 0, 0, 0, 0, 0, 0, 0, 0]] env = WarehouseEnv(agent_map=simple_agent, obstacle_map=simple_world, render_as_observation=render_as_observation, exponential_agent_training_curve= exponential_agent_training_curve) else: env = gym.make(env_id, level=env_level) if frame_stack: env = FrameStack(env, 4) if useMonitor: env = Monitor(env, log_dir + str(rank), allow_early_resets=True) return env
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    fixed = True
    # 0 completely fixed, 1 slightly random radius, 2 big random radius
    object_position = 1
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=1,
                             isDiscrete=discreteAction, action_space=action_space,
                             fixedPositionObj=fixed, includeVelObs=True,
                             object_position=object_position)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                         buffer_size=1000000, batch_size=256, random_exploration=0.3,
                         action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
def _init():
    set_global_seeds(seed)
    # env = DonkeyVAEEnv(level=level, frame_skip=frame_skip, vae=vae, const_throttle=None,
    #                    min_throttle=MIN_THROTTLE, max_throttle=MAX_THROTTLE,
    #                    max_cte_error=MAX_CTE_ERROR, n_command_history=N_COMMAND_HISTORY,
    #                    n_stack=n_stack, seed=seed)
    measurements_to_include = set(["steer", "throttle"])
    # VAE encode
    encode_state_fn = common_carla.create_encode_state_fn(vae, measurements_to_include)
    reward_fn = common_carla.reward_fn
    env = CarlaEnv(obs_res=obs_res, action_smoothing=0, encode_state_fn=encode_state_fn,
                   reward_fn=reward_fn, synchronous=True, fps=FPS,
                   host=HOST)  # wyb '10.38.164.121' '127.0.0.1'
    env.seed(0)  # wyb
    if not teleop:
        env = Monitor(env, log_dir, allow_early_resets=True)
    return env
def _get():
    locationX = -playerNumber * 1.5
    locationY = -6 + playerNumber * 1.5
    set_global_seeds(seed + rank)
    env = TrainKick(rank, IP, portj, mportj, teamname, playerNumber, locationX, locationY,
                    sleepTime, max_episode_steps=500, trainType=trainType)
    env.seed(seed + rank)
    logdir = os.path.join(log_dir, str(rank))
    env = Monitor(env, str(logdir), allow_early_resets=True)
    return env