def run_agent(envs, parameters):
    """Train an agent."""
    alg = parameters['alg']
    learning_rate = parameters['learning_rate']
    gamma = parameters['gamma']
    model_path = parameters['model_path']
    set_global_seeds(parameters.get('seed'))
    dummy_env = OptVecEnv(envs)
    if alg == 'PPO':
        model = PPO2(MlpPolicy, dummy_env, gamma=gamma, learning_rate=learning_rate,
                     verbose=1, nminibatches=dummy_env.num_envs)
    elif alg == 'A2C':
        model = A2C(MlpPolicy, dummy_env, gamma=gamma, learning_rate=learning_rate, verbose=1)
    else:
        model = DDPG(ddpg.MlpPolicy, dummy_env, gamma=gamma, verbose=1,
                     actor_lr=learning_rate / 10, critic_lr=learning_rate)
    try:
        model.learn(total_timesteps=parameters.get('total_timesteps', 10**6))
    except tf.errors.InvalidArgumentError:
        LOGGER.error('Possible NaN, %s', str((alg, learning_rate, gamma)))
    finally:
        dummy_env.close()
    model.save(str(model_path))

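# Hedged usage sketch for run_agent() above: the dict keys follow the lookups in
# the function body. `make_env` is a hypothetical stand-in for whatever env
# constructors OptVecEnv expects, and the concrete hyperparameter values are
# assumptions, not recommendations.
parameters = {
    'alg': 'PPO',
    'learning_rate': 3e-4,
    'gamma': 0.99,
    'model_path': 'models/ppo_agent',
    'seed': 0,
    'total_timesteps': 100000,
}
run_agent([make_env], parameters)
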
def train_DDPG(self, model_name, model_params=config.DDPG_PARAMS):
    """DDPG model"""
    from stable_baselines import DDPG
    from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise

    env_train = self.env
    n_actions = env_train.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))
    start = time.time()
    model = DDPG('MlpPolicy',
                 env_train,
                 batch_size=model_params['batch_size'],
                 buffer_size=model_params['buffer_size'],
                 param_noise=param_noise,
                 action_noise=action_noise,
                 verbose=model_params['verbose'])
    model.learn(total_timesteps=model_params['timesteps'])
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model

def train_agent_with_ddpg(load):
    from stable_baselines.ddpg.policies import FeedForwardPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines.gail import ExpertDataset
    from stable_baselines import DDPG

    # Create and wrap the environment
    env = gym.make('F16GCAS-v0')
    env = DummyVecEnv([lambda: env])

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.01 * np.ones(n_actions))

    # Custom MLP policy of two hidden layers of size 128 each
    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    model = DDPG(CustomPolicy, env, verbose=1, action_noise=action_noise)
    if not load:
        # Pre-train from the expert (LQR) demonstrations before saving
        exp_data = ExpertDataset("./lqr_export.npz")
        model.pretrain(exp_data, n_epochs=100)
        model.save(ROOT + "/trained_models/TDRL/f16/ddpg/128_128")
    else:
        model = DDPG.load(ROOT + "/trained_models/TDRL/f16/ddpg/128_128",
                          policy=CustomPolicy, env=env)
    return model

def test_ddpg_normalization():
    """
    Test that observation and return normalizations are properly saved and loaded.
    """
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=0.05)
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000,
                 normalize_observations=True, normalize_returns=True,
                 nb_rollout_steps=128, nb_train_steps=1,
                 batch_size=64, param_noise=param_noise)
    model.learn(1000)
    obs_rms_params = model.sess.run(model.obs_rms_params)
    ret_rms_params = model.sess.run(model.ret_rms_params)
    model.save('./test_ddpg.zip')

    loaded_model = DDPG.load('./test_ddpg.zip')
    obs_rms_params_2 = loaded_model.sess.run(loaded_model.obs_rms_params)
    ret_rms_params_2 = loaded_model.sess.run(loaded_model.ret_rms_params)

    for param, param_loaded in zip(obs_rms_params + ret_rms_params,
                                   obs_rms_params_2 + ret_rms_params_2):
        assert np.allclose(param, param_loaded)

    del model, loaded_model

    if os.path.exists("./test_ddpg.zip"):
        os.remove("./test_ddpg.zip")

def main(env: PSMCartesianDDPGEnv):
    # The noise objects for DDPG
    n_actions = env.action.action_space.shape[0]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))

    model = DDPG(MlpPolicy,
                 env,
                 gamma=0.95,
                 verbose=1,
                 nb_train_steps=300,
                 nb_rollout_steps=150,
                 param_noise=param_noise,
                 batch_size=128,
                 action_noise=action_noise,
                 random_exploration=0.05,
                 normalize_observations=True,
                 tensorboard_log="./ddpg_dvrk_tensorboard/",
                 observation_range=(-1.5, 1.5),
                 critic_l2_reg=0.01)
    model.learn(total_timesteps=4000000,
                log_interval=100,
                callback=CheckpointCallback(save_freq=100000,
                                            save_path="./ddpg_dvrk_tensorboard/"))
    model.save("./ddpg_robot_env")

def main(output_folder_path: Path):
    # Set up the gym-carla environment
    agent_config = AgentConfig.parse_file(Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(Path("configurations/carla_configuration.json"))
    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLPIDAgent,
        "max_collision": 5,
    }
    env = gym.make('roar-pid-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "tensorboard_log": (output_folder_path / "tensorboard").as_posix(),
    }
    latest_model_path = find_latest_model(output_folder_path)
    if latest_model_path is None:
        # Full tensorboard logs can take up space quickly
        model = DDPG(LnMlpPolicy, env=env, **model_params)
    else:
        model = DDPG.load(latest_model_path, env=env, **model_params)
        model.render = True
        model.tensorboard_log = (output_folder_path / "tensorboard").as_posix()

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000, verbose=2,
                                             save_path=(output_folder_path / "checkpoints").as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList([checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10), callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"pid_ddpg_{datetime.now()}")

def main():
    # Unpause the simulation so that the robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()

    # Create the node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v1')

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))

    model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=200000)

    print("Saving model to pickbot_model_ddpg_continuous_" + timestamp + ".pkl")
    model.save("pickbot_model_ddpg_continuous_" + timestamp)

def explore(app, emulator, appium, timesteps, timer, save_policy, policy_dir, cycle,
            nb_train_steps=10, random_exploration=0.7):
    try:
        env = TimeFeatureWrapper(app)
        model = DDPG(MlpPolicy, env, verbose=1,
                     random_exploration=random_exploration,
                     nb_train_steps=nb_train_steps)
        callback = TimerCallback(timer=timer)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception:
        # Recover the test infrastructure before signalling failure
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False

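# Hedged usage sketch for explore() above: run training cycles and retry after an
# infrastructure restart. `app`, `emulator`, and `appium` come from the caller's
# test harness and are assumptions here, as are the concrete argument values.
for cycle in range(5):
    succeeded = explore(app, emulator, appium,
                        timesteps=10000, timer=60,
                        save_policy=True, policy_dir='./policies', cycle=cycle)
    if not succeeded:
        print(f'cycle {cycle} failed; tooling restarted, moving on')
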
def main(env):
    n_actions = env.action_space.shape[0]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))

    # Using only one expert trajectory;
    # you can specify `traj_limitation=-1` to use the whole dataset
    file_dir = "/home/vignesh/Thesis_Suture_data/trial2/ambf_data/"
    dataset = ExpertDataset(expert_path=file_dir + 'expert_psm_data.npz',
                            traj_limitation=1, batch_size=32)
    model = DDPG(MlpPolicy,
                 env,
                 gamma=0.95,
                 verbose=1,
                 nb_train_steps=300,
                 nb_rollout_steps=150,
                 param_noise=param_noise,
                 batch_size=128,
                 action_noise=action_noise,
                 random_exploration=0.05,
                 normalize_observations=True,
                 tensorboard_log="./ddpg_dvrk_tensorboard/",
                 observation_range=(-1.5, 1.5))
    model.pretrain(dataset, n_epochs=1000)
    model.save("./gail_robot_env")

def train_DDPG(env_train, model_name, timesteps=50000):
    """DDPG model"""
    start = time.time()
    model = DDPG('MlpPolicy', env_train)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model

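# Hedged usage sketch for train_DDPG() above: wrap a single training environment
# and train for the default budget. `StockTradingEnv` and `train_df` are
# hypothetical stand-ins for the caller's trading environment and data frame.
from stable_baselines.common.vec_env import DummyVecEnv

env_train = DummyVecEnv([lambda: StockTradingEnv(train_df)])
model = train_DDPG(env_train, model_name="DDPG_50k", timesteps=50000)
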
def DDPGAgent(multi_stock_env, num_episodes):
    models_folder = 'saved_models'
    rewards_folder = 'saved_rewards'

    env = DummyVecEnv([lambda: multi_stock_env])

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))

    # Hyperparameters
    GAMMA = 0.99
    TAU = 0.001
    BATCH_SIZE = 16
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_LEARNING_RATE = 0.001
    BUFFER_SIZE = 500

    print("\nRunning DDPG Agent...\n")
    model = DDPG(MlpPolicy, env,
                 gamma=GAMMA,
                 tau=TAU,
                 batch_size=BATCH_SIZE,
                 actor_lr=ACTOR_LEARNING_RATE,
                 critic_lr=CRITIC_LEARNING_RATE,
                 buffer_size=BUFFER_SIZE,
                 verbose=1,
                 param_noise=param_noise,
                 action_noise=action_noise)
    model.learn(total_timesteps=50000)
    model.save(f'{models_folder}/rl/ddpg.h5')

    del model
    model = DDPG.load(f'{models_folder}/rl/ddpg.h5')

    obs = env.reset()
    portfolio_value = []
    for e in range(num_episodes):
        action, _states = model.predict(obs)
        next_state, reward, done, info = env.step(action)
        print(f"episode: {e + 1}/{num_episodes}, "
              f"episode end value: {info[0]['cur_val']:.2f}")
        portfolio_value.append(round(info[0]['cur_val'], 3))

    # Save the portfolio value for each episode
    np.save(f'{rewards_folder}/rl/ddpg.npy', portfolio_value)
    print("\nDDPG Agent run complete and saved!")

    a = np.load(f'{rewards_folder}/rl/ddpg.npy')
    print(f"\nCumulative Portfolio Value Average reward: {a.mean():.2f}, "
          f"Min: {a.min():.2f}, Max: {a.max():.2f}")

    plt.plot(a)
    plt.title("Portfolio Value Per Episode (DDPG)")
    plt.ylabel("Portfolio Value")
    plt.xlabel("Episodes")
    plt.show()

def train_ddpg():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    eval_env = gimbal(5, 500)
    eval_env = DummyVecEnv([lambda: eval_env])

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = None

    model = DDPG(policy=MlpPolicy,
                 env=env,
                 gamma=0.99,
                 memory_policy=None,
                 eval_env=eval_env,
                 nb_train_steps=500,
                 nb_rollout_steps=500,
                 nb_eval_steps=500,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 normalize_observations=False,
                 tau=0.001,
                 batch_size=128,
                 param_noise_adaption_interval=50,
                 normalize_returns=False,
                 enable_popart=False,
                 observation_range=(-5000.0, 5000.0),
                 critic_l2_reg=0.0,
                 return_range=(-np.inf, np.inf),
                 actor_lr=0.0001,
                 critic_lr=0.001,
                 clip_norm=None,
                 reward_scale=1.0,
                 render=False,
                 render_eval=False,
                 memory_limit=50000,
                 verbose=1,
                 tensorboard_log="./logs",
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False)

    # model = DDPG.load("./models/baseline_ddpg_t2")
    # model.set_env(env)

    model.learn(total_timesteps=1000000, callback=None, seed=None,
                log_interval=100, tb_log_name='DDPG', reset_num_timesteps=True)
    model.save("./models/baseline_ddpg_t2")

def main(output_folder_path: Path):
    # Set up the gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))
    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLLocalPlannerAgent,
        "max_collision": 5,
    }
    env = gym.make('roar-local-planner-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "env": env,
        "n_cpu_tf_sess": None,
        "buffer_size": 1000,
        "nb_train_steps": 50,
        "nb_rollout_steps": 100,
        # "nb_eval_steps": 50,
        "batch_size": 32,
    }
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        model = DDPG(CnnPolicy, **model_params)
    else:
        model = DDPG.load(latest_model_path, **model_params)

    tensorboard_dir = output_folder_path / "tensorboard"
    ckpt_dir = output_folder_path / "checkpoints"
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    model.tensorboard_log = tensorboard_dir.as_posix()
    model.render = True

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000, verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList([checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10), callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"local_planner_ddpg_{datetime.now()}")

def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model"""
    # The noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))

    start = time.time()
    model = DDPG('MlpPolicy', env_train, param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model

def main():
    env1 = KukaDiverseObjectEnv(renders=True, isDiscrete=False)
    model = DDPG(MlpPolicy, env1, verbose=1)
    model.learn(total_timesteps=500000)
    print("Saving model to kukadiversecont_model.pkl")
    model.save("kukadiversecont_model.pkl")


main()

def main(output_folder_path: Path):
    # Set up the gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))
    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLLocalPlannerAgent,
        "max_collision": 5,
    }
    env = gym.make('roar-local-planner-v1', params=params)
    env.reset()

    tensorboard_dir, ckpt_dir = prep_dir(output_folder_path)
    model_params: dict = {
        "verbose": 1,
        "render": True,
        "env": env,
        "n_cpu_tf_sess": 2,
        "buffer_size": 10,
        "random_exploration": 0.1,
        "tensorboard_log": tensorboard_dir.as_posix(),
    }
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        # Full tensorboard logs can take up space quickly
        model = DDPG(LnMlpPolicy, **model_params)
    else:
        model = DDPG.load(latest_model_path, **model_params)

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000, verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList([checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10), callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"local_planner_v1_ddpg_{datetime.now()}")

def ppo1_nmileg_pool(sensory_value):
    RL_method = "PPO1"
    # total_MC_runs = 50
    experiment_ID = "handtest_rot_pool_with_MC_C_task0/"
    save_name_extension = RL_method
    total_timesteps = 500000
    sensory_info = "sensory_{}".format(sensory_value)
    current_mc_run_num = 22  # starts from 0

    for mc_cntr in range(current_mc_run_num, current_mc_run_num + 1):
        log_dir = "./logs/{}/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sensory_info)

        # Defining the environment
        env = gym.make('HandManipulate-v1{}'.format(sensory_value))
        # Setting the Monitor
        env = gym.wrappers.Monitor(env, log_dir + "Monitor/", video_callable=False,
                                   force=True, uid="Monitor_info")

        # Defining the initial model
        if RL_method == "PPO1":
            model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        elif RL_method == "PPO2":
            env = DummyVecEnv([lambda: env])
            model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        elif RL_method == "DDPG":
            env = DummyVecEnv([lambda: env])
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=0.5 * 5 * np.ones(n_actions))
            model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise,
                         action_noise=action_noise, tensorboard_log=log_dir)
        else:
            raise ValueError("Invalid RL method")

        # Setting the random seed for the random instances
        random_seed = mc_cntr
        random.seed(random_seed)
        env.seed(random_seed)
        env.action_space.seed(random_seed)
        np.random.seed(random_seed)
        tf.random.set_random_seed(random_seed)

        # Training the model
        model.learn(total_timesteps=total_timesteps)
        # Saving the trained model
        model.save(log_dir + "/model")
    return None

def main():
    # Create the environment
    env = iCubPushGymEnv(renders=False, use_IK=1, obj_pose_rnd_std=0,
                         max_steps=2000, reward_type=0)

    # Set the seed
    seed = 1
    tf.reset_default_graph()
    set_global_seed(seed)
    env.seed(seed)

    # Set up logging
    monitor_dir = os.path.join(log_dir, 'log')
    os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, monitor_dir + '/', allow_early_resets=True)

    # Create the agent model
    nb_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                     sigma=0.5373 * np.ones(nb_actions))
    model = DDPG('LnMlpPolicy', env,
                 action_noise=action_noise,
                 gamma=0.99,
                 batch_size=16,
                 normalize_observations=True,
                 normalize_returns=False,
                 memory_limit=100000,
                 verbose=1,
                 tensorboard_log=os.path.join(log_dir, 'tb'),
                 full_tensorboard_log=False)

    # Start learning
    model.learn(total_timesteps=500000, seed=seed, callback=callback)

    # Save the model
    print("Saving model.pkl to ", log_dir)
    model.save(log_dir + "/final_model.pkl")

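# `callback` above is not defined in this snippet; a minimal sketch of a legacy
# stable-baselines callback (called at each step with the local and global
# namespaces; returning False stops training). The `total_steps` key and the
# 2000-step print interval are assumptions and may vary by library version.
def callback(locals_, globals_):
    step = locals_.get('total_steps', 0)
    if step > 0 and step % 2000 == 0:
        print('steps so far:', step)
    return True  # keep training
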
def train_DDPG(self, model_name, ddpg_params=config.DDPG_PARAMS):
    """DDPG model"""
    from stable_baselines import DDPG

    env_train = self.env
    start = time.time()
    model = DDPG('MlpPolicy',
                 env_train,
                 batch_size=ddpg_params['batch_size'],
                 buffer_size=ddpg_params['buffer_size'],
                 verbose=ddpg_params['verbose'])
    model.learn(total_timesteps=ddpg_params['timesteps'])
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model

def run_baseline_ddpg(env_name, train=True):
    import numpy as np
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    env = gym.make(env_name)
    env = DummyVecEnv([lambda: env])

    if train:
        from stable_baselines.ddpg.policies import FeedForwardPolicy

        # Custom MLP policy: three hidden layers of 64 units with layer norm
        class CustomPolicy(FeedForwardPolicy):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   layers=[64, 64, 64],
                                                   layer_norm=True,
                                                   feature_extraction="mlp")

        # The noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions) + 0.15,
                                                    sigma=0.3 * np.ones(n_actions))
        model = DDPG(CustomPolicy, env, verbose=1,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     tau=0.01,
                     observation_range=(env.observation_space.low, env.observation_space.high),
                     critic_l2_reg=0,
                     actor_lr=1e-3,
                     critic_lr=1e-3,
                     memory_limit=100000)
        model.learn(total_timesteps=int(1e5))
        model.save("checkpoints/ddpg_" + env_name)
    else:
        model = DDPG.load("checkpoints/ddpg_" + env_name)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        print("state: ", obs, " reward: ", rewards, " done: ", dones, " info: ", info)

def main(args):
    # Start the timer to record the training time
    start = time.time()

    env_id = 'fwmav_hover-v0'
    # Create a vector of size 1 that only contains this environment
    env = DummyVecEnv([make_env(env_id, 0)])
    # env = SubprocVecEnv([make_env(env_id, i) for i in range(args.n_cpu)])

    # Index -1 picks the last dimension of the action-space shape
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))

    model = DDPG(
        policy=MyDDPGPolicy,
        env=env,
        gamma=1.0,
        nb_train_steps=5000,
        nb_rollout_steps=10000,
        nb_eval_steps=10000,
        param_noise=param_noise,
        action_noise=action_noise,
        tau=0.003,
        batch_size=256,
        observation_range=(-np.inf, np.inf),
        actor_lr=0.0001,
        critic_lr=0.001,
        reward_scale=0.05,
        memory_limit=10000000,
        verbose=1,
    )
    model.learn(total_timesteps=args.time_step)
    model.save(args.model_path)

    # Stop the timer
    end = time.time()
    print("Time used: ", end - start)

def training(env):
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))
    model = DDPG(MlpPolicy, env, verbose=1,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 render=True,
                 return_range=(-1.0, 1.0),
                 observation_range=(-2.0, 2.0))
    model.learn(total_timesteps=40000)

    time = datetime.now().strftime("%m%d_%H%M%S")
    model.save("models\\ddpg_sbl_" + time)
    del model  # remove to demonstrate saving and loading
    testing(env, time)

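# `testing` is not defined in this snippet; a minimal sketch of what it might do,
# reloading the model just saved and rolling it out. The path layout mirrors the
# save call above; the episode loop and its length are assumptions.
def testing(env, time):
    model = DDPG.load("models\\ddpg_sbl_" + time)
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()
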
def ppo1_nmileg_pool(stiffness_value):
    RL_method = "PPO1"
    experiment_ID = "experiment_4_pool_A/mc_1/"
    save_name_extension = RL_method
    total_timesteps = 500000
    stiffness_value_str = "stiffness_{}".format(stiffness_value)
    log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method, stiffness_value_str)

    # Defining the environment
    env = gym.make('TSNMILeg{}-v1'.format(stiffness_value))

    # Defining the initial model
    if RL_method == "PPO1":
        model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "PPO2":
        env = DummyVecEnv([lambda: env])
        model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "DDPG":
        env = DummyVecEnv([lambda: env])
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=0.5 * 5 * np.ones(n_actions))
        model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, tensorboard_log=log_dir)
    else:
        raise ValueError("Invalid RL method")

    # Training the model
    model.learn(total_timesteps=total_timesteps)
    # Saving the trained model
    model.save(log_dir + "/model")
    return None

def main(args):
    start = time.time()

    env_id = 'fwmav_maneuver-v0'
    env = DummyVecEnv([make_env(env_id, 0)])
    # env = SubprocVecEnv([make_env(env_id, i) for i in range(args.n_cpu)])

    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))

    model = DDPG(
        policy=MyDDPGPolicy,
        env=env,
        gamma=1.0,
        nb_train_steps=5000,
        nb_rollout_steps=10000,
        nb_eval_steps=10000,
        param_noise=param_noise,
        action_noise=action_noise,
        tau=0.003,
        batch_size=256,
        observation_range=(-np.inf, np.inf),
        actor_lr=0.0001,
        critic_lr=0.001,
        reward_scale=0.05,
        memory_limit=10000000,
        verbose=1,
    )
    model.learn(total_timesteps=args.time_step)
    model.save(args.model_path)

    end = time.time()
    print("Time used: ", end - start)

def main():
    param_noise = None
    env1 = tm700GymEnv2(renders=False, isDiscrete=False)
    model = DDPG(MlpPolicy, env1, verbose=1, param_noise=param_noise, random_exploration=0.1)

    start = time.time()
    model.learn(total_timesteps=1000000)

    print("Saving model")
    model.save("tm_test_model_randomblocksrotated.pkl")
    print('total time', time.time() - start)

def __call__(self):
    policy_kwargs = dict(layers=[400, 300, 200, 100])
    n_actions = self.env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.1 * np.ones(n_actions))
    # check_env(self.env)

    model = DDPG(MlpPolicy, self.env,
                 policy_kwargs=policy_kwargs,
                 action_noise=action_noise,
                 memory_limit=50000,
                 tensorboard_log="/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                 verbose=1)

    time_steps = 3e4
    model.learn(total_timesteps=int(time_steps), log_interval=50,
                tb_log_name="ddpg_Docker_" + self.expt_name)
    model.save("/home/dfki.uni-bremen.de/mpatil/Documents/ddpg_stable_baselines_" + self.expt_name)

    print("Closing environment")
    self.env.close()

def get_model(load=True):
    # Create and wrap the environment
    env = gym.make('F16GCAS-v0')
    env = DummyVecEnv([lambda: env])

    # Custom MLP policy of two hidden layers of size 128 each
    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    model = DDPG(CustomPolicy, env, verbose=1)

    # Train the agent first if no saved model is requested
    if not load:
        model.learn(total_timesteps=1000000)
        model.save(ROOT + "/trained_models/TDRL/f16/ddpg/128_128")
    model = DDPG.load(ROOT + "/trained_models/TDRL/f16/ddpg/128_128.pkl", policy=CustomPolicy)
    return model

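# Hedged usage sketch for get_model() above: load the pretrained GCAS controller
# and roll it out for one episode. The fresh evaluation env mirrors the one built
# inside get_model(); the rollout loop itself is an assumption.
model = get_model(load=True)
eval_env = DummyVecEnv([lambda: gym.make('F16GCAS-v0')])
obs = eval_env.reset()
done = False
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = eval_env.step(action)
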
expDir = '/home/shivanik/lab/pointExp/state/'
verbose = 1
nIter = 5e6

name = 'ddpg_1_%s' % np.format_float_scientific(nIter)
logger = osp.join(expDir, name, 'logs')
video_folder = osp.join(logger, 'videos')

env = make_vec_env('PointMassDense-1-v1', 4, wrapper_class=FlattenDictWrapper,
                   wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
env = VecVideoRecorder(env, video_folder,
                       record_video_trigger=lambda x: x % 100000 == 0,
                       video_length=400, name_prefix="Video-{}")

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.2 * np.ones(n_actions))

model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
model.learn(total_timesteps=int(nIter))
model.save(osp.join(expDir, name, 'model'))

# del model  # remove to demonstrate saving and loading
# model = DDPG.load("ddpg_mountain")
# obs = env.reset()
# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     env.render()

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.5 * np.ones(n_actions))

timesteps = 100000
name = "TWODNOREPS_franka_continuous_ddpg_learning_rate_" + "_timesteps_" + str(timesteps)

model = DDPG(MlpPolicy, env, verbose=1,
             param_noise=param_noise,
             action_noise=action_noise,
             tensorboard_log="/home/ryuga/Documents/TensorBoardLogs")
model.learn(total_timesteps=timesteps, tb_log_name=name)
model.save(name)  # + str(my_learning_rate))

# f = open("envparameters_" + name, "x")
# f.write(str([my_signal_rate, my_signal_repetitions, my_step_limit]))
# f.close()

while True:
    obs = env.reset()
    for i in range(500):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

if not os.path.exists(log_dir):
    os.makedirs(log_dir)
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

tstart = time.time()

env = ToyEnv(train=True, log_dir=log_dir)
env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.5 * np.ones(n_actions))

model = DDPG(env=env, policy=FeedForwardCust3Policy, verbose=1,
             param_noise=param_noise, action_noise=action_noise)
model.learn(total_timesteps=int(5e6), callback=callback)
model.save(log_dir + "last_model")

print('Time taken: {:.2f}'.format(time.time() - tstart))

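# Hedged follow-up sketch: reload the checkpoint saved above for evaluation.
# The `train=False` flag mirrors the `train=True` construction above and is an
# assumption about ToyEnv's interface, as is the fixed-length rollout loop.
eval_env = DummyVecEnv([lambda: ToyEnv(train=False, log_dir=log_dir)])
model = DDPG.load(log_dir + "last_model", env=eval_env)
obs = eval_env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = eval_env.step(action)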