# Assumes module-level SAVE_PATH and EXP_NAME constants, the Sawyer environments,
# and the Spinning Up algorithm imports (ppo, ddpg, trpo, td3, sac) are defined elsewhere.
import os
import tensorflow as tf


def train(alg, task):
    if task == 'reach':
        env_fn = lambda: SawyerReachEnv(n_substeps=25, reward_type='dense')
    elif task == 'grasp':
        env_fn = lambda: SawyerGraspEnv(n_substeps=5, reward_type='dense')

    ac_kwargs = dict(hidden_sizes=[64, 64], activation=tf.nn.relu)  # only passed to trpo below
    save_path = os.path.join(SAVE_PATH, task, alg)

    if alg == 'ppo':
        # mpi_fork(2)
        logger_kwargs = dict(output_dir=save_path, exp_name=EXP_NAME)
        ppo(env_fn=env_fn, steps_per_epoch=4000, epochs=20000,
            logger_kwargs=logger_kwargs, max_ep_len=1000)
    elif alg == 'ddpg':
        logger_kwargs = dict(output_dir=SAVE_PATH + '/ddpg_suite', exp_name=EXP_NAME)
        ddpg(env_fn=env_fn, steps_per_epoch=5000, batch_size=256, epochs=2000,
             logger_kwargs=logger_kwargs, max_ep_len=200)
    elif alg == 'trpo':
        logger_kwargs = dict(output_dir=SAVE_PATH + '/trpo_suite', exp_name=EXP_NAME)
        trpo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=2000,
             logger_kwargs=logger_kwargs, max_ep_len=200)
    elif alg == 'td3':
        logger_kwargs = dict(output_dir=save_path, exp_name=EXP_NAME)
        td3(env_fn=env_fn, start_steps=100000, steps_per_epoch=5000, epochs=2000,
            logger_kwargs=logger_kwargs, max_ep_len=1000)
    elif alg == 'sac':
        logger_kwargs = dict(output_dir=save_path, exp_name=EXP_NAME)
        sac(env_fn=env_fn, start_steps=100000, steps_per_epoch=5000, epochs=2000,
            logger_kwargs=logger_kwargs, max_ep_len=200)
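# train() above is never invoked in this snippet. A minimal sketch of a
# command-line entry point (the argument names here are hypothetical):
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--alg', choices=['ppo', 'ddpg', 'trpo', 'td3', 'sac'], default='td3')
    parser.add_argument('--task', choices=['reach', 'grasp'], default='reach')
    args = parser.parse_args()
    train(args.alg, args.task)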
def __call__(self, *args, **kwargs):
    # Four-layer MLP actor-critic; self.env, self.outdir, and self.expt_name are set on the instance.
    ac_kwargs = dict(hidden_sizes=[400, 300, 200, 100], activation=torch.nn.ReLU)
    logger_kwargs = dict(output_dir=self.outdir, exp_name=self.expt_name)
    td3(env_fn=self.env, ac_kwargs=ac_kwargs, steps_per_epoch=250, epochs=400,
        logger_kwargs=logger_kwargs)
from spinup import td3
import tensorflow as tf
import gym

env_fn = lambda: gym.make('Fish-v2')
# ac_kwargs = dict(hidden_sizes=[64, 64], activation=tf.nn.relu)
logger_kwargs = dict(output_dir='.', exp_name='exp_1')

td3(env_fn=env_fn, steps_per_epoch=5000, epochs=600, act_noise=0.1,
    logger_kwargs=logger_kwargs)
task_kwargs = {  # assumed name: only the tail of this dict survived; earlier entries (and robot_kwargs) were truncated
    # ...
    'episodes_per_goal': 1,              # Number of episodes before generating another random goal
    'goal_buffer_size': 1,               # Number of goals to store in buffer to be reused later 50
    'goal_from_buffer_prob': 0.0000000,  # Probability of selecting a random goal from the goal buffer, value between 0 and 1
    'num_adjacent_goals': 0,             # Number of nearby goals to be generated for each randomly generated goal
    'random_goal_seed': 2,               # Seed used to generate the random goals 10 18
    # 'is_validation': False,            # Set to True when testing a policy; a validation run prints which points failed and how many were reached
    'normalise_reward': True,            # Perform reward normalisation; happens before reward bonus and penalties
    'continuous_run': False,             # Continuously run the simulation, even after it reaches the destination
    'reward_noise_mu': 0.0,              # Reward noise mean (reward noise follows a Gaussian distribution)
    'reward_noise_sigma': 0.0,           # Reward noise standard deviation, recommended 0.5
    'reward_noise_decay': 0.0,           # Constant for exponential reward noise decay (recommended 0.31073, decays to 0.002 in 20 steps)
    # 'exp_rew_scaling': 8.0,            # Constant for exponential reward scaling (None by default, recommended 5.0, cumulative exp_reward = 29.48)
}

env_fn = lambda: gym.make('LobotArmContinuous-v6', task_kwargs=task_kwargs, robot_kwargs=robot_kwargs)
ac_kwargs = dict(hidden_sizes=[256, 256])  # 64/32
logger_kwargs = dict(output_dir='data/rosarmtesting', exp_name='TD3')

from spinup import td3_pytorch as td3

td3(env_fn=env_fn, steps_per_epoch=5000, epochs=2000, ac_kwargs=ac_kwargs,
    save_freq=20, act_noise=0.1, logger_kwargs=logger_kwargs, seed=1)

'''
fpath = 'data/td3_0_1accepted_randomgaol_Truenormalized_exp5_256256_continus_training_seed_none/'  # saved model folder containing tf1_save
env, get_action = load_policy_and_env(fpath, itr='last', deterministic=False)
env = gym.make('LobotArmContinuous-v0', task_kwargs=task_kwargs, robot_kwargs=robot_kwargs)
run_policy(env, get_action, max_ep_len=None, num_episodes=500, render=True)
'''
task_kwargs = {  # assumed name: only the tail of this dict survived; earlier entries (and robot_kwargs) were truncated
    # ...
    # 'is_validation': False,   # Set to True when testing a policy; a validation run prints which points failed and how many were reached
    'norm_rew_scaling': 300     # Perform reward normalisation; happens before reward bonus and penalties
    # 'continuous_run': False,  # Continuously run the simulation, even after it reaches the destination
    # 'exp_rew_scaling': 7.0,   # Constant for exponential reward scaling (None by default, recommended 5.0, cumulative exp_reward = 29.48)
}

env_fn = lambda: gym.make('HyQ-v0', task_kwargs=task_kwargs, robot_kwargs=robot_kwargs)
ac_kwargs = dict(hidden_sizes=[256, 256])  # 64/32
logger_kwargs = dict(output_dir='data/HyQv0testjo_b8_corrected', exp_name='TD3')

from spinup import td3_pytorch as td3
from spinup.utils.test_policy import load_policy_and_env, run_policy

td3(
    env_fn=env_fn,
    start_steps=10000,
    max_ep_len=500,
    steps_per_epoch=5000,
    epochs=3000,
    ac_kwargs=ac_kwargs,
    save_freq=20,
    act_noise=0.1,
    logger_kwargs=logger_kwargs
)

# fpath = 'data/HyQv0testjo_b8_corrected/'  # saved model folder containing tf1_save
# env, get_action = load_policy_and_env(fpath, itr='last', deterministic=False)
# run_policy(env, get_action, max_ep_len=None, num_episodes=500, render=True)
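# A minimal evaluation sketch based on the commented-out lines above, assuming a
# completed run has saved checkpoints under the logger output_dir used for training.
# load_policy_and_env and run_policy are Spinning Up's test utilities; note that
# run_policy expects an env instance, not an env constructor.
fpath = 'data/HyQv0testjo_b8_corrected/'  # logger output_dir from the run above
_, get_action = load_policy_and_env(fpath, itr='last', deterministic=False)
env = env_fn()  # rebuild the env explicitly in case the saved env cannot be restored
run_policy(env, get_action, max_ep_len=500, num_episodes=10, render=True)  # fewer episodes for a quick check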
from spinup import td3
import tensorflow as tf
import gym

env_fn = lambda: gym.make('InvertedPendulum-v2')
# ac_kwargs = dict(hidden_sizes=[64, 64], activation=tf.nn.relu)
logger_kwargs = dict(output_dir='.', exp_name='exp_1')

td3(env_fn=env_fn, steps_per_epoch=5000, epochs=200, logger_kwargs=logger_kwargs)
def main():
    robot_kwargs = {
        'use_gui': False,
        'rtf': 7.0
    }  # disabled: 'state_noise_mu': 0, 'state_noise_sigma': 0.075, 'random_init_pos': False
    task_kwargs = {
        'max_time_step': 500,                    # Maximum time step before stopping the episode
        'accepted_dist_to_bounds': 0.0010000,    # Allowable distance to joint limits (radians) 0.002 1400epoch 90%++
        'accepted_error': 0.0010000,             # Allowable distance from target coordinates (metres)
        'reach_target_bonus_reward': 0.0000000,  # Bonus reward upon reaching target
        'timeout_penalty': 0,                    # Reward penalty on timeout
        'reach_bounds_penalty': 0.0000000,       # Reward penalty when reaching joint limit 38/18
        'contact_penalty': 0.0000000,            # Reward penalty for collision 38/18
        'episodes_per_goal': 1,                  # Number of episodes before generating another random goal
        'goal_buffer_size': 1,                   # Number of goals to store in buffer to be reused later 50
        'goal_from_buffer_prob': 0.0000000,      # Probability of selecting a random goal from the goal buffer, value between 0 and 1
        'num_adjacent_goals': 0,                 # Number of nearby goals to be generated for each randomly generated goal
        'random_goal_seed': None,                # Seed used to generate the random goals 10 18
        'is_validation': False,                  # Set to True when testing a policy; a validation run prints which points failed and how many were reached
        'normalise_reward': True,                # Perform reward normalisation; happens before reward bonus and penalties
        'continuous_run': False,                 # Continuously run the simulation, even after it reaches the destination
        'reward_noise_mu': None,                 # Reward noise mean (reward noise follows a Gaussian distribution)
        'reward_noise_sigma': None,              # Reward noise standard deviation, recommended 0.5
        'reward_noise_decay': None,              # Constant for exponential reward noise decay (recommended 0.31073, decays to 0.002 in 20 steps)
        'exp_rew_scaling': 5.0                   # Constant for exponential reward scaling (None by default, recommended 5.0, cumulative exp_reward = 29.48)
    }
    env_fn = lambda: gym.make('LobotArmContinuous-v1', task_kwargs=task_kwargs, robot_kwargs=robot_kwargs)
    ac_kwargs = dict(hidden_sizes=[256, 256])  # 64/32
    logger_kwargs = dict(
        output_dir='data/td3_0_1accept_Truenormalized_exp5_256256_continus_seed_None_noneset1_000_XXrework_negativereward_run2',
        exp_name='TD3')
    td3(
        env_fn=env_fn,
        steps_per_epoch=5000,
        epochs=5000,
        ac_kwargs=ac_kwargs,
        save_freq=20,
        act_noise=0.1,
        logger_kwargs=logger_kwargs
    )
import unittest
from functools import partial

import gym
import tensorflow as tf

from spinup import td3

env_fn = partial(gym.make, 'Pendulum-v0')

with tf.Graph().as_default():
    td3(env_fn)
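# The snippet above imports unittest but defines no test case. A minimal smoke-test
# sketch (hypothetical test names) that checks a short td3 run completes:
class TestTD3(unittest.TestCase):
    def test_td3_smoke(self):
        # One short epoch on Pendulum-v0 to confirm the graph builds and training runs.
        with tf.Graph().as_default():
            td3(env_fn, steps_per_epoch=1000, epochs=1)

if __name__ == '__main__':
    unittest.main()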