def main(): parser = argparse.ArgumentParser("Insertion, Manual mode") parser.add_argument('--host', default="127.0.0.1", type=str, help='IP of the server') parser.add_argument( '--port', default=9081, type=int, help='Port that should be used to connect to the server') parser.add_argument('--save', action="store_true", help=('Saves checkpoints')) parser.add_argument( '--use_coord', action="store_true", help=('If set, the environment\'s observation space will be' 'coordinates instead of images')) args = parser.parse_args() os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' env = gym.make('insertion-v0', kwargs={ 'host': args.host, "port": args.port, "use_coord": args.use_coord }) # check_env(env, warn=True) print(f"Observation space: {env.observation_space}") print(f"Action space: {env.action_space}") # print(env.action_space.sample()) # Save a checkpoint every 50000 steps ckpt = CkptCallback(save_freq=50000, save_path='../checkpoints/', name_prefix='rl_insertion') if args.save else None if args.use_coord: model = SAC('MlpPolicy', env, verbose=1, tensorboard_log="../insertion_tensorboard/") else: model = SAC('CnnPolicy', env, verbose=1, tensorboard_log="../insertion_tensorboard/") model.learn(50001, callback=ckpt)
def main(): parser = argparse.ArgumentParser("Insertion, Manual mode") parser.add_argument('checkpoint_path', type=str, help='Path to checkpoint') parser.add_argument('--host', default="192.168.2.121", type=str, help='IP of the server (default is a Windows#2)') parser.add_argument( '--port', default=9090, type=int, help='Port that should be used to connect to the server') parser.add_argument( '--use_coord', action="store_true", help=('If set, the environment\'s observation space will be' 'coordinates instead of images')) args = parser.parse_args() os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' env = gym.make('insertion-v0', kwargs={ 'host': args.host, "port": args.port, "use_coord": args.use_coord }) print(f"Observation space: {env.observation_space}") print(f"Action space: {env.action_space}") if args.use_coord: model = SAC('MlpPolicy', env, verbose=1, tensorboard_log="../insertion_tensorboard/") else: model = SAC('CnnPolicy', env, verbose=1, tensorboard_log="../insertion_tensorboard/") model.load(args.checkpoint_path, env=env) obs = env.reset() for i in range(10000): action, _states = model.predict(obs) obs, rewards, dones, info = env.step(action) env.render()
def main(argv):
    fixed = True
    policy_name = "sac_reaching_policy"
    obj_pose_rnd_std = 0 if fixed else 0.05

    pandaenv = pandaReachGymEnv(renders=True,
                                use_IK=0,
                                numControlledJoints=7,
                                obj_pose_rnd_std=obj_pose_rnd_std,
                                includeVelObs=True)
    n_actions = pandaenv.action_space.shape[-1]
    pandaenv = DummyVecEnv([lambda: pandaenv])

    model = SAC(MlpPolicy,
                pandaenv,
                gamma=0.9,
                batch_size=16,
                verbose=1,
                tensorboard_log="../pybullet_logs/pandareach_sac/")
    model.learn(total_timesteps=1000000)
    model.save("../pybullet_logs/pandareach_sac/" + policy_name)

    del model  # remove to demonstrate saving and loading

def func_run(env, logger, lr, action_noise, file):
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1
    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 5e7
    save_video_length = 200
    save_video_interval = 1000000

    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=lr,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)

    exp_name = expDir + "/%s/%s_%s" % (name,
                                       np.format_float_scientific(nIter),
                                       np.format_float_scientific(lr))
    model.save(exp_name)
    file.write(exp_name + '\n')
    env.close()
    return True

def train_sac(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env])
    data = []  # nothing is accumulated here; kept so the final return is defined

    if isinstance(training_tag, float):
        model = SAC(sac_MlpPolicy, env, ent_coef=training_tag,
                    verbose=1, policy_kwargs=POLICY_KWARGS)

        for step in range(TRAINING_STEPS):
            env.reset()
            # SAC.learn returns the model itself, not a (model, results) tuple
            model = model.learn(total_timesteps=TRAINING_TIMESTEPS,
                                log_interval=100)
            file_tag = str(training_tag).replace(".", "p")
            if SAVE_AGENTS:
                model.save("nchain/models/SAC_" + ENVIRONMENT_NAME +
                           "_s" + str(step) + "_t" + str(file_tag) +
                           "_i" + str(CURRENT_ITERATION) +
                           "_ts" + str(TRAINING_TIMESTEPS))

        if SAVE_FINAL_AGENT:
            model.save("nchain/models/SAC_" + ENVIRONMENT_NAME +
                       "_t" + str(file_tag) +
                       "_i" + str(CURRENT_ITERATION) +
                       "_ts" + str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    return data

def explore(app, emulator, appium, timesteps, timer, save_policy,
            policy_dir, cycle, train_freq=5, target_update_interval=10):
    try:
        env = TimeFeatureWrapper(app)
        model = SAC(MlpPolicy, env, verbose=1, train_freq=train_freq,
                    target_update_interval=target_update_interval)
        callback = TimerCallback(timer=timer, app=app)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception as e:
        print(e)
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False

def run_experiment(verbose, tensorboard_log, learning_rate):
    pdb.set_trace()
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" % (name,
                                       np.format_float_scientific(nIter),
                                       np.format_float_scientific(learning_rate)))
    env.close()

def model_training_learning(env_train, model_name, timesteps=100000):
    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")

    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)

    print("Learning ", model_name, " time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print(model_name, " learning completed: ")  # was hard-coded to "TD3"

    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("- ", model_name, " save finish :")
    print("Training time ", model_name, " : ", (end - start) / 60, " minutes")

    os.chdir("./..")
    os.chdir("./..")
    return model

def train_SAC(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)

    env = make_mujoco_env(env, 0)
    env = Monitor(env, log_dir + "/")

    continue_train = False
    if continue_train:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1, **kwargs)
    else:
        # `policy`, `n_timesteps` and `callback` are expected to be defined
        # at module level
        model = SAC(
            policy,
            env,
            #action_noise=action_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps, seed=seed,
                callback=callback, log_interval=10)
    return model

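# A minimal sketch of the module-level `callback` that train_SAC above relies
# on. In stable-baselines 2, a plain-function callback receives the training
# loop's locals and globals and keeps training while it returns True; the
# save interval and file name below are assumptions, not from the original.
def callback(locals_, globals_):
    model = locals_['self']
    if model.num_timesteps % 10000 == 0:
        # `output_dir` is the module-level global set by train_SAC
        model.save(os.path.join(output_dir, 'final_model.pkl'))
    return True
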
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num,
                      max_episode_length=maximum_episode_length)
    set_global_seeds(seed_num)
    policy_kwargs = dict(layers=[256, 256])
    checkpoint_callback = CheckpointCallback(
        save_freq=int(validate_every_timesteps / num_of_envs),
        save_path=log_relative_path,
        name_prefix='model')
    model = SAC(MlpPolicy,
                env,
                verbose=1,
                policy_kwargs=policy_kwargs,
                **sac_config,
                seed=seed_num)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="sac",
                callback=checkpoint_callback)
    return

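# `sac_config` above is unpacked straight into the SAC constructor, so it is
# assumed to be a dict of SAC hyperparameters. A plausible example; the
# values are illustrative, not taken from the original code:
sac_config_example = {
    "gamma": 0.98,
    "tau": 0.01,
    "learning_rate": 3e-4,
    "buffer_size": 1000000,
    "batch_size": 256,
    "train_freq": 1,
    "gradient_steps": 1,
}
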
def train(env_name, num_time_steps, policy_kwargs, eval_ep, eval_freq,
          ckpt_freq, load_model=None):
    env = gym.make(env_name)
    # env.render()
    env_ = gym.make(env_name)

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_SAC_' + today + current_time

    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)
    env = Monitor(env, filename=path)

    ############################
    #         Logging          #
    ############################
    logger.configure(path)
    config = {}
    config['load'] = [{'load_model': load_model}]
    config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
    config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
    config['policy'] = [{'policy_network': policy_kwargs}]
    with open('./run/' + model_name + '/' + model_name + '.txt',
              'w+') as outfile:
        json.dump(config, outfile, indent=4)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name + '/ckpt',
                                       name_prefix='')
    eval_callback = EvalCallback_wandb_SAC(env_,
                                           n_eval_episodes=eval_ep,
                                           eval_freq=eval_freq,
                                           log_path=path)
    callbacklist.append(ckpt_callback)
    callbacklist.append(eval_callback)
    callback = CallbackList(callbacklist)

    ############################
    #           run            #
    ############################
    # policy_kwargs = dict(net_arch=[128, dict(vf=[256], pi=[16])])
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=int(num_time_steps),
                log_interval=20,
                callback=callback)
    # The original used path + "SAC_Walker2d", which glued the file name onto
    # the directory name without a separator.
    model.save(os.path.join(path, "SAC_Walker2d"))

def train(learning_rate, time_steps, env, model_path):
    # Reset the default graph to avoid conflicts with existing parameters
    # (not recommended if you want to reuse parameters).
    tf.reset_default_graph()

    # The default policy is MlpPolicy
    model = SAC(CustomSACPolicy, env, verbose=1, seed=10, n_cpu_tf_sess=16)
    model.learn(total_timesteps=int(time_steps), log_interval=1000,
                callback=callback)
    model.save(model_path)

def train_SAC(env_train, model_name, timesteps=50000):
    start = time.time()
    model = SAC('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (SAC): ', (end - start) / 60, ' minutes')
    return model

def train():
    machine = StateMachine()
    machine.initialize(headless=True)
    camera = Camera(machine)
    env = CustomEnv(machine, camera, state="vision")
    model = SAC(CnnPolicy, env, verbose=1, learning_starts=32, batch_size=32,
                target_update_interval=32, tensorboard_log=dir_path + '/Logs/')
    model.learn(total_timesteps=2000, log_interval=1000000)
    model.save("Grasp_Model_Full_Pose")

def test_predict_SAC():
    '''Visualize predictions from a random policy.'''
    env = gym.make('KukaMujocoSAC-v0')
    model = SAC(SAC_MlpPolicy, env)
    obs = env.reset()
    while True:
        action, _ = model.predict(obs)
        obs, rew, done, info = env.step(action, render=True)

def sac(env_id, timesteps, policy="MlpPolicy", log_interval=None, tensorboard_log=None, seed=None): env = gym.make(env_id) model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log) model.learn(total_timesteps=timesteps, log_interval=log_interval) save_model_weights(model, "sac", env_id, policy, seed)
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1
    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8
    save_video_length = 200
    save_video_interval = 1000000

    file = open('sac_done.txt', 'w+')
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]
    stddev = 0.2

    pool = multiprocessing.Pool(processes=4)
    for lr in [1e-5]:  # , 5e-4, 1e-5
        logger = osp.join(
            expDir, name,
            'logs%s_%s' % (np.format_float_scientific(nIter),
                           np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env,
            osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy,
            env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
        env.close()

    file.close()
    pool.close()
    pool.join()

def sac(env, seed):
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.1) * np.ones(n_actions))
    return SAC('MlpPolicy',
               env,
               learning_rate=0.001,
               action_noise=action_noise,
               verbose=1,
               tensorboard_log="./data/runs",
               seed=seed)

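# Example use of the sac() factory above, assuming a continuous-control Gym
# environment such as Pendulum-v0 (the env id and step count are illustrative):
env = gym.make('Pendulum-v0')
model = sac(env, seed=0)
model.learn(total_timesteps=100000)
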
def run(self):
    self._init()
    env = self.env
    model = self.model
    objective = self.objective

    if objective == "infogain":
        wenv = InfogainEnv(env, model)
    elif objective == "prederr":
        wenv = PrederrEnv(env, model)
    else:
        raise AttributeError(
            "Objective '{}' is unknown. Needs to be 'infogain' or 'prederr'"
            .format(objective))
    wenv.max_episode_len = self.horizon
    wenv.end_episode_callback = self._end_episode
    dvenv = DummyVecEnv([lambda: wenv])

    if self.rl_algo == "ddpg":
        self.logger.info("Setting up DDPG as model-free RL algorithm.")
        pn = AdaptiveParamNoiseSpec()
        an = NormalActionNoise(np.array([0]), np.array([1]))
        rl_model = DDPG(DDPGMlpPolicy,
                        dvenv,
                        verbose=1,
                        render=False,
                        action_noise=an,
                        param_noise=pn,
                        nb_rollout_steps=self.horizon,
                        nb_train_steps=self.horizon)
    elif self.rl_algo == "sac":
        self.logger.info("Setting up SAC as model-free RL algorithm.")
        rl_model = SAC(SACMlpPolicy,
                       dvenv,
                       verbose=1,
                       learning_starts=self.horizon)
    else:
        raise AttributeError(
            "Model-free RL algorithm '{}' is unknown.".format(self.rl_algo))

    # Train the agent
    max_steps_total = self.horizon * self.n_episodes * 100
    try:
        self.logger.info("Start the agent")
        rl_model.learn(total_timesteps=max_steps_total, seed=self.seed)
    except MaxEpisodesReachedException:
        print("Exploration finished.")

def main(): env = gym.make("teaching-env-v0", teacher_path=os.path.join(os.getcwd(), "../saved_models", sys.argv[1]), validation_path=DATA_PATH, max_queries=config.MAX_QUERIES) agent_model = SAC(MlpPolicy, env, train_freq=1, batch_size=64, learning_rate=3e-4, learning_starts=0, buffer_size=1000, random_exploration=config.EPSILON_EXPLORATION, gamma=config.GAMMA, verbose=1) #agent_model.learn(total_timesteps=config.MAX_QUERIES * config.NUM_TRAIN_EPISODES) #agent_model.save('test_SAC') agent_model.load('test_SAC', env=env) obs = env.reset() total_reward = float('-inf') prog = tqdm(range(config.MAX_QUERIES), postfix={'Reward': total_reward}) actions = [] # For visualization total_reward = 0.0 for i in prog: action = select_action(agent_model, obs, epsilon=config.EPSILON_EXPLORATION) #action, _states = agent_model.predict(obs, deterministic=False) obs, reward, done, info = env.step(action) total_reward += reward prog.set_postfix({'Reward': total_reward}) actions.append(np.asscalar(action)) plt.hist(actions, bins=config.NUM_BINS, range=(-5, 5), density=True) plt.savefig('./visualizations/histograms/SAC') plt.clf() # Plot student's predicted function inputs = np.linspace(-5, 5, num=1000) outputs = env.student_model(inputs.reshape(-1, 1)) plt.scatter(inputs, outputs, s=0.1, label='SAC') plt.title("SAC Student's Approximation") plt.ylim((-60, 100)) plt.savefig('./visualizations/functions/SAC') plt.clf()
def train_GAIL(env_train, model_name, timesteps=1000):
    """GAIL Model"""
    # from stable_baselines.gail import ExpertDataset, generate_expert_traj
    start = time.time()

    # Generate expert trajectories ('MlpPolicy' was misspelled 'MLpPolicy')
    model = SAC('MlpPolicy', env_train, verbose=1)
    generate_expert_traj(model, 'expert_model_gail', n_timesteps=100,
                         n_episodes=10)

    # Load the dataset
    dataset = ExpertDataset(expert_path='expert_model_gail.npz',
                            traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env_train, dataset, verbose=1)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (GAIL): ', (end - start) / 60, ' minutes')
    return model

def train_SAC(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)

    env = gym.make(env)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Delete keys so the dict can be passed to the model constructor
    # policy = kwargs['policy']
    policy = 'MlpPolicy'
    # n_timesteps = kwargs['n_timesteps']
    n_timesteps = int(1e6)
    noise_type = None

    # Add some param noise for exploration
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                         desired_action_stddev=0.1)

    continue_model = False
    if continue_model is True:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1, **kwargs)
    else:
        model = SAC(
            policy,
            env,
            # action_noise=param_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    # `callback` is expected to be defined at module level
    model.learn(total_timesteps=n_timesteps, seed=seed,
                callback=callback, log_interval=10)
    return model

def sac(env_id, timesteps, policy="MlpPolicy", log_interval=None, tensorboard_log=None, seed=None, load_weights=None): env = gym.make(env_id) if load_weights is not None: model = SAC.load(load_weights, env, verbose=0) else: model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log) callback = WandbRenderEnvCallback(model_name="sac", env_name=env_id) model.learn(total_timesteps=timesteps, log_interval=log_interval, callback=callback)
def train_SAC(env_train, model_name, timesteps=100000):
    # train SAC model
    os.chdir("./model_saved/")
    start = time.time()
    print("Train SAC Model with MlpPolicy: ")

    model = SAC('MlpPolicy', env_train, verbose=0)
    print("SAC Learning time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print("SAC Model learning completed: ")

    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("SAC Model save finish :")
    print('Training time SAC: ', (end - start) / 60, ' minutes')

    os.chdir("./..")
    return model

def run(env_name, algorithm, seed):
    env_name_map = {
        'halfcheetah': 'HalfCheetah-v2',
        'hopper': 'Hopper-v2',
        'ant': 'Ant-v2',
        'walker': 'Walker2d-v2',
    }
    env = DummyVecEnv([lambda: gym.make(env_name_map[env_name])])

    if algorithm == 'ppo':
        model = PPO2('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    elif algorithm == 'trpo':
        model = TRPO('MlpPolicy', env, max_kl=0.01, verbose=1)
    elif algorithm == 'sac':
        model = SAC('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    else:
        raise NotImplementedError()

    filepath = '%s_%s_%d.pkl' % (env_name, algorithm, seed)
    model.learn(total_timesteps=100000, seed=seed)
    model.save(filepath)

def train(self, num_e=1, n_timesteps=10000000, save_fraction=0.1,
          save='saves/m1'):
    env_id = "default"
    num_e = 32  # Number of processes to use (overrides the `num_e` argument)

    # Create the vectorized environment
    # env = DummyVecEnv([lambda: env])  # Ramona
    # self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
    env = Template_Gym()
    self.env = DummyVecEnv([lambda: env])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)

    # self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-5, tensorboard_log="./test6")
    self.model = SAC(CustomPolicy_sac, self.env, verbose=1,
                     learning_rate=1e-5, tensorboard_log="./m1lstm1")
    # self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/")

    n_timesteps = int(n_timesteps * save_fraction)
    training_loop = int(1 / save_fraction)
    for i in range(training_loop):
        self.model.learn(n_timesteps)
        self.model.save(save + str(i))

def get_SAC_model(model_settings, model_path, ckpt_path, ckpt_step, tb_path):
    policy_kwargs = dict(layers=model_settings['NET_LAYERS'])
    env = get_single_process_env(model_settings, model_path, ckpt_step)

    if ckpt_path is not None:
        print("Loading model from checkpoint '{}'".format(ckpt_path))
        model = SAC.load(ckpt_path,
                         env=env,
                         _init_setup_model=True,
                         policy_kwargs=policy_kwargs,
                         **model_settings['train_configs'],
                         verbose=1,
                         tensorboard_log=tb_path)
        model.num_timesteps = ckpt_step
    else:
        model = SAC(SACMlpPolicy,
                    env,
                    _init_setup_model=True,
                    policy_kwargs=policy_kwargs,
                    **model_settings['train_configs'],
                    verbose=1,
                    tensorboard_log=tb_path)
    return model, env

def __init__(self):
    logger.info(os.getcwd())
    # self._interpreter = Interpreter("./converted_model.tflite")
    # self._interpreter.allocate_tensors()
    # print(self._interpreter.get_input_details())
    # print(self._interpreter.get_output_details())
    # _, self._input_height, self._input_width, _ = \
    #     self._interpreter.get_input_details()[0]['shape']

    self.env = AutoDriftEnv(const_throttle=0.3)
    # self.model = SacModel(policy=CnnPolicy, env=self.env)
    self.model = SAC(policy=CnnPolicy, env=self.env)

    # self._input_height = IMAGE_HEIGHT
    # self._input_width = IMAGE_WIDTH
    # print(self._input_height)
    # print(self._input_width)

    # self._socket = socket.socket()
    # socket_addr = ('127.0.0.1', 8888)
    # UNCOMMENT THIS
    # self._socket.connect(socket_addr)

    self.main()

def train_SAC(self, model_name, model_params=config.SAC_PARAMS):
    """SAC model"""  # the original docstring said "TD3 model"
    from stable_baselines import SAC
    env_train = self.env

    start = time.time()
    model = SAC('MlpPolicy',
                env_train,
                batch_size=model_params['batch_size'],
                buffer_size=model_params['buffer_size'],
                learning_rate=model_params['learning_rate'],
                learning_starts=model_params['learning_starts'],
                ent_coef=model_params['ent_coef'],
                verbose=model_params['verbose'],
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}")
    model.learn(total_timesteps=model_params['timesteps'],
                tb_log_name="SAC_run")
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (SAC): ', (end - start) / 60, ' minutes')
    return model

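# config.SAC_PARAMS is indexed with the keys used above, so it is assumed to
# look roughly like this (the values are illustrative, not from the original):
SAC_PARAMS_EXAMPLE = {
    "batch_size": 64,
    "buffer_size": 100000,
    "learning_rate": 3e-4,
    "learning_starts": 100,
    "ent_coef": "auto",
    "verbose": 0,
    "timesteps": 50000,
}
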
def get_sac(env, **kwargs):
    env_id = env.unwrapped.spec.id
    if (env_id.startswith("Ant") or env_id.startswith("HalfCheetah")
            or env_id.startswith("Swimmer") or env_id.startswith("Fetch")):
        sac_kwargs = {
            "verbose": 1,
            "learning_rate": 3e-4,
            "gamma": 0.98,
            "tau": 0.01,
            "ent_coef": "auto",
            "buffer_size": 1000000,
            "batch_size": 256,
            "learning_starts": 10000,
            "train_freq": 1,
            "gradient_steps": 1,
        }
        policy = CustomSACPolicy
    elif env_id.startswith("Hopper"):
        sac_kwargs = {
            "verbose": 1,
            "learning_rate": 3e-4,
            "ent_coef": 0.01,
            "buffer_size": 1000000,
            "batch_size": 256,
            "learning_starts": 1000,
            "train_freq": 1,
            "gradient_steps": 1,
        }
        policy = CustomSACPolicy
    else:
        sac_kwargs = {"verbose": 1, "learning_starts": 1000}
        policy = MlpPolicySac

    # Caller-supplied kwargs override the per-environment defaults
    sac_kwargs.update(kwargs)

    solver = SAC(policy, env, **sac_kwargs)
    return solver
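
# Example: override the per-environment defaults chosen by get_sac(), here
# with a smaller replay buffer for a quick experiment (env id and values are
# illustrative):
env = gym.make('Hopper-v2')
solver = get_sac(env, buffer_size=100000, verbose=0)
solver.learn(total_timesteps=10000)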