def train(alg, task):
    if task == 'reach':
        env_fn = lambda: SawyerReachEnv(n_substeps=25, reward_type='dense')
    elif task == 'grasp':
        env_fn = lambda: SawyerGraspEnv(n_substeps=5, reward_type='dense')
    ac_kwargs = dict(hidden_sizes=[64, 64], activation=tf.nn.relu)
    save_path = os.path.join(SAVE_PATH, task, alg)
    if alg == 'ppo':
        # mpi_fork(2)
        logger_kwargs = dict(output_dir=save_path, exp_name=EXP_NAME)
        ppo(env_fn=env_fn, steps_per_epoch=4000, epochs=20000,
            logger_kwargs=logger_kwargs, max_ep_len=1000)
    elif alg == 'ddpg':
        logger_kwargs = dict(output_dir=SAVE_PATH + '/ddpg_suite', exp_name=EXP_NAME)
        ddpg(env_fn=env_fn, steps_per_epoch=5000, batch_size=256, epochs=2000,
             logger_kwargs=logger_kwargs, max_ep_len=200)
    elif alg == 'trpo':
        logger_kwargs = dict(output_dir=SAVE_PATH + '/trpo_suite', exp_name=EXP_NAME)
        trpo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=2000,
             logger_kwargs=logger_kwargs, max_ep_len=200)
    elif alg == 'td3':
        logger_kwargs = dict(output_dir=save_path, exp_name=EXP_NAME)
        td3(env_fn=env_fn, start_steps=100000, steps_per_epoch=5000, epochs=2000,
            logger_kwargs=logger_kwargs, max_ep_len=1000)
    elif alg == 'sac':
        logger_kwargs = dict(output_dir=save_path, exp_name=EXP_NAME)
        sac(env_fn=env_fn, start_steps=100000, steps_per_epoch=5000, epochs=2000,
            logger_kwargs=logger_kwargs, max_ep_len=200)
def test_cartpole(self):
    """ Test training a small agent in a simple environment """
    env_fn = partial(gym.make, 'CartPole-v1')
    ac_kwargs = dict(hidden_sizes=(32,))
    with tf.Graph().as_default():
        ppo(env_fn, steps_per_epoch=100, epochs=10, ac_kwargs=ac_kwargs)
def main():
    import tensorflow as tf
    env_fn = lambda: Doubling()
    ac_kwargs = dict(hidden_sizes=[50, 50], activation=tf.nn.relu)
    logger_kwargs = dict(output_dir='output_dir3', exp_name='training_64x64relu')
    ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000,
        epochs=25000000000, logger_kwargs=logger_kwargs, save_freq=1)
def __call__(self, *args, **kwargs):
    ac_kwargs = dict(hidden_sizes=[400, 300, 200, 100], activation=torch.nn.ReLU)
    logger_kwargs = dict(output_dir=self.outdir, exp_name=self.expt_name)
    ppo(
        env_fn=self.env,
        ac_kwargs=ac_kwargs,
        # steps_per_epoch=250,
        steps_per_epoch=1000,
        epochs=400,
        logger_kwargs=logger_kwargs)
def main():
    args = {
        "forest_data_path": "/Users/anmartin/Projects/summer_project/hl_planner/forest_data.tiff",
        "num_measurements": 6,
        "max_forest_heights": [60, 90, 45, 38, 30, 76],
        "orbit_altitude": 757000,
    }
    env_fn = lambda: gym.make('gym_orekit:online-orekit-v0', **args)
    ac_kwargs = dict(hidden_sizes=[64, 64], activation=tf.nn.relu)
    logger_kwargs = dict(output_dir='./output', exp_name='test1')
    ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=20160, epochs=10,
        max_ep_len=20160, save_freq=2, logger_kwargs=logger_kwargs)
def main():
    env_fn = lambda: gym.make('LunarLander-v2')
    ac_kwargs = dict(hidden_sizes=[64, 64], activation=tf.nn.relu)
    logger_kwargs = dict(output_dir='path/to/output_dir', exp_name='experiment_name')
    ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=250,
        logger_kwargs=logger_kwargs)
    # pass


######------play movie -----------------
# movie = retro.Movie('SuperMarioBros-Nes-Level1-1-000000.bk2')
# movie.step()
# env = retro.make(
#     game=movie.get_game(),
#     state=None,
#     # bk2s can contain any button presses, so allow everything
#     use_restricted_actions=retro.Actions.ALL,
#     players=movie.players,
# )
# env.initial_state = movie.get_state()
# env.reset()
# while movie.step():
#     keys = []
#     for p in range(movie.players):
#         for i in range(env.num_buttons):
#             keys.append(movie.get_key(i, p))
#     env.step(keys)
#     env.render()

#########---------main RL program--------------
env = retro.make(game='SuperMarioBros-Nes', record='.')
obv = env.reset()
for i in range(10000):
    obs, rew, done, info = env.step(env.action_space.sample())
    env.render()
env.close()
def run_tests(model_name=None):
    global epochs
    global env
    global ac
    env = gym.make('StrategyEnv-v0')
    test_tf = '20200201-20200207'
    # test_tf = '20200401-'
    env.set_timeframe(test_tf)
    env.full_reset()

    def make_env():
        env = gym.make('StrategyEnv-v0')
        env.set_timeframe('20191110-20200131')
        env.randomize_timeframe(True)
        env.set_ac(True)
        env.full_reset()
        return env

    # ask joey to run normal BT to compare
    run_test_with_strat(env)
    env.run_normal_bt()
    if model_name:
        torch.manual_seed(10000)
        np.random.seed(10000)
        ac = models.load_model(model_name, env.observation_space, env.action_space)
    else:
        # ac = ppo(make_env, epochs=epochs, target_kl=0.001, steps_per_epoch=7200, max_ep_len=100000)
        ac = ppo(make_env, epochs=epochs, steps_per_epoch=7200, max_ep_len=100000)
        model_name = models.save_model(ac)
    run_model_test(env, ac, model_name)
from spinup import ppo_pytorch as ppo
from spinup.exercises.common import print_result
from functools import partial
import gym
import os
import numpy as np
import pandas as pd
import psutil
import time

logdir = "/tmp/experiments/%i" % int(time.time())
ActorCritic = partial(exercise1_2_auxiliary.ExerciseActorCritic, actor=MLPGaussianActor)
ppo(env_fn=lambda: gym.make('MountainCarContinuous-v0'),
    actor_critic=ActorCritic,
    ac_kwargs=dict(hidden_sizes=(64,)),
    steps_per_epoch=4000, epochs=20,
    logger_kwargs=dict(output_dir=logdir))

# Get scores from last five epochs to evaluate success.
data = pd.read_table(os.path.join(logdir, 'progress.txt'))
last_scores = data['AverageEpRet'][-5:]

# Your implementation is probably correct if the agent has a score >500,
# or if it reaches the top possible score of 1000, in the last five epochs.
correct = np.mean(last_scores) > 500 or np.max(last_scores) == 1e3
print_result(correct)
        lam=float(lam))

# train with PPO
if algorithm == 'ppo':
    clip_ratio = sys.argv[2]
    target_kl = sys.argv[3]
    exp_name = 'll_ppo_seed' + str(seed) + '_epochs' + str(epochs)
    exp_name += '_cr' + clip_ratio + '_tk' + target_kl
    logger_kwargs = dict(output_dir='data_spinning_up/' + exp_name + '/', exp_name=exp_name)
    ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, max_ep_len=1000, gamma=0.99,
        seed=seed, steps_per_epoch=steps_per_epoch, pi_lr=0.005, vf_lr=0.005,
        epochs=epochs, logger_kwargs=logger_kwargs,
        clip_ratio=float(clip_ratio), target_kl=float(target_kl))

# train with TRPO
if algorithm == 'trpo':
    delta = sys.argv[2]
    backtrack_coef = sys.argv[3]
    exp_name = 'll_trpo_seed' + str(seed) + '_epochs' + str(epochs)
    exp_name += '_delta' + delta + '_bc' + backtrack_coef
    logger_kwargs = dict(output_dir='data_spinning_up/' + exp_name + '/', exp_name=exp_name)
    trpo(env_fn=env_fn,
from spinup import ppo_tf1 as ppo
import tensorflow as tf
import gym

env_fn = lambda: gym.make('CartPole-v0')
ac_kwargs = dict(hidden_sizes=[8, 16, 8], activation=tf.nn.relu)
ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=300, epochs=250)
""" Run this file to verify your solution. """ from spinup import ppo from spinup.exercises.common import print_result import gym import os import pandas as pd import psutil import time import pybullet_envs logdir = "/tmp/experiments/%i"%int(time.time()) tf_hidden_sizes = (64,) keras_hidden_sizes = (64,64) ppo(env_fn = lambda : gym.make('InvertedPendulumBulletEnv-v0'), ac_kwargs=dict(policy=mlp_gaussian_policy, hidden_sizes=keras_hidden_sizes), steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir)) # Get scores from last five epochs to evaluate success. data = pd.read_table(os.path.join(logdir,'progress.txt')) last_scores = data['AverageEpRet'][-5:] # Your implementation is probably correct if the agent has a score >500, # or if it reaches the top possible score of 1000, in the last five epochs. correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3 print_result(correct)
def test_cartpole(self):
    ''' Test training a small agent in a simple environment '''
    env_fn = partial(gym.make, 'CartPole-v1')
    ac_kwargs = dict(hidden_sizes=(32,))
    ppo(env_fn, steps_per_epoch=100, epochs=10, ac_kwargs=ac_kwargs)
                              noise=noise)
    dirOut = dirO + 'TFIM' + "P" + str(Nt) + '_N' + str(Ns) + '_rw' + rtype
elif model == 'RandomTFIM':
    J_couplings = set_couplings(Ns, seed)
    env_fn = lambda: qenv.RandomTFIM(Ns, J_couplings, Nt, rtype, dt, actType,
                                     measured_obs=measured_obs, g_target=hfield,
                                     noise=noise)
    dirOut = dirO + 'RandomIsing' + "P" + str(Nt) + '_N' + str(Ns) + '_rw' + rtype
else:
    raise ValueError(f'Invalid model:{model}')

dirOut += '/' + measured_obs + '/network' + str(layers[0]) + 'x' + str(layers[1])
ac_kwargs = dict(hidden_sizes=layers, activation=tf.nn.relu)
logger_kwargs = dict(output_dir=dirOut, exp_name='RL_first_try')
ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=nstep, epochs=epochs,
    logger_kwargs=logger_kwargs, gamma=1.0, target_kl=0.01, save_freq=128)
from spinup import ppo_pytorch as ppo
from spinup.exercises.common import print_result
from functools import partial
from spinup.utils.run_utils import set_mujoco

set_mujoco()

import gym
import os
import numpy as np
import pandas as pd
import psutil
import time

logdir = "/tmp/experiments/%i" % int(time.time())
ActorCritic = partial(exercise1_2_auxiliary.ExerciseActorCritic, actor=MLPGaussianActor)
ppo(env_fn=lambda: gym.make('InvertedPendulum-v2'),
    actor_critic=ActorCritic,
    ac_kwargs=dict(hidden_sizes=(64,)),
    steps_per_epoch=4000, epochs=20,
    logger_kwargs=dict(output_dir=logdir))

# Get scores from last five epochs to evaluate success.
data = pd.read_table(os.path.join(logdir, 'progress.txt'))
last_scores = data['AverageEpRet'][-5:]

# Your implementation is probably correct if the agent has a score >500,
# or if it reaches the top possible score of 1000, in the last five epochs.
correct = np.mean(last_scores) > 500 or np.max(last_scores) == 1e3
print_result(correct)
from spinup import ppo_tf1 as ppo
import tensorflow as tf
import gym
from gym.wrappers import FlattenObservation
import panda_gym

EPOCHS = 100
STEPS_PER_EPOCH = 4000
ENV = 'PandaPush-v1'


def env_fn():
    env = gym.make(ENV)
    print(env.observation_space)
    env = FlattenObservation(env)
    return env


ac_kwargs = dict(
    hidden_sizes=[64, 64],
    activation=tf.nn.sigmoid,
)
logger_kwargs = dict(output_dir=f'logs/sigmoid_{ENV}', exp_name=f'exp_sigmoid_{ENV}')
ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS, logger_kwargs=logger_kwargs)
#vpg
# spinup.vpg(
#     env,
#     ac_kwargs={"hidden_sizes":(64,2)},
#     seed = np.random.randint(100),
#     steps_per_epoch=1250,
#     epochs=2500,
#     pi_lr=3e-4,
#     logger_kwargs = {"output_dir" : "logs/vpgrandomtest"}
# )

#ppo
spinup.ppo(env,
           ac_kwargs={"hidden_sizes": (64, 2)},
           seed=np.random.randint(100),
           steps_per_epoch=1250,
           pi_lr=3e-4,
           epochs=2500,
           logger_kwargs={"output_dir": "logs/ppo-v3-0-rerun2"})

#polynomials
# spinup.vpgpolynomial(
#     env,
#     ac_kwargs={"order":3},
#     seed = np.random.randint(100),
#     steps_per_epoch=1250,
#     epochs=2500,
#     pi_lr=2e-5,
#     l1_scaling=0.001,
#     logger_kwargs = {"output_dir" : "logs/polyrandomtest"}
# )
from spinup import ppo_pytorch as ppo
from spinup import ddpg_pytorch as ddpg
from spinup import sac_pytorch as sac
import tensorflow as tf
import gym
import torch

"""
env_fn = lambda : gym.make('Walker2d-v2')
ac_kwargs = dict(hidden_sizes=[64,64])

logger_kwargs = dict(output_dir='baseline_data/walker/ppo', exp_name='walker_ppo')
ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=200, logger_kwargs=logger_kwargs)

#env_fn = lambda : gym.make('Walker2d-v2')
#ac_kwargs = dict(hidden_sizes=[64,64], activation=tf.nn.relu)
logger_kwargs = dict(output_dir='baseline_data/walker/ddpg', exp_name='walker_ddpg')
ddpg(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=200, logger_kwargs=logger_kwargs)

logger_kwargs = dict(output_dir='baseline_data/walker/sac', exp_name='walker_sac')
sac(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=200, logger_kwargs=logger_kwargs)

env_fn = lambda : gym.make('Hopper-v2')

logger_kwargs = dict(output_dir='baseline_data/hopper/ppo', exp_name='hopper_ppo')
ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=200, logger_kwargs=logger_kwargs)

# env_fn = lambda : gym.make('Walker2d-v2')
# ac_kwargs = dict(hidden_sizes=[64,64], activation=tf.nn.relu)
logger_kwargs = dict(output_dir='baseline_data/hopper/ddpg', exp_name='hopper_ddpg')
ddpg(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=200, logger_kwargs=logger_kwargs)

logger_kwargs = dict(output_dir='baseline_data/hopper/sac', exp_name='hopper_sac')
sac(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=200, logger_kwargs=logger_kwargs)
# Check this, it may not work
def envFunc():
    env = gym.make('airsim_gym-v0')
    return env


# Setup the environment function and hyperparameters
env_fn = envFunc
ac_kwargs = dict(hidden_sizes=[64, 64])

logger_kwargs = dict(
    output_dir='/home/isra/Documents/airsim_exp_results_random_seed_0',
    exp_name='random_goals')
ppo(env_fn=envFunc, ac_kwargs=ac_kwargs, seed=0, max_ep_len=500,
    steps_per_epoch=4000, epochs=250, logger_kwargs=logger_kwargs)

logger_kwargs = dict(
    output_dir='/home/isra/Documents/airsim_exp_results_random_seed_5',
    exp_name='random_goals')
ppo(env_fn=envFunc, ac_kwargs=ac_kwargs, seed=5, max_ep_len=500,
    steps_per_epoch=4000, epochs=250, logger_kwargs=logger_kwargs)

logger_kwargs = dict(
    output_dir='/home/isra/Documents/airsim_exp_results_random_seed_10',
""" from spinup import ppo from spinup.exercises.common import print_result from spinup.user_config import INVERTEDPENDULUM_ENV, IMPORT_USER_MODULES import gym import os import pandas as pd import psutil import time import importlib for module in IMPORT_USER_MODULES: importlib.import_module(module) logdir = "/tmp/experiments/%i" % int(time.time()) ppo(env_fn=lambda: gym.make(INVERTEDPENDULUM_ENV), ac_kwargs=dict(policy=mlp_gaussian_policy, hidden_sizes=(64, )), steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir)) # Get scores from last five epochs to evaluate success. data = pd.read_table(os.path.join(logdir, 'progress.txt')) last_scores = data['AverageEpRet'][-5:] # Your implementation is probably correct if the agent has a score >500, # or if it reaches the top possible score of 1000, in the last five epochs. correct = np.mean(last_scores) > 500 or np.max(last_scores) == 1e3 print_result(correct)
import gym
import gym_geofriend2
from MapGenerators.Basic import Basic
from MapGenerators.Pyramid import Pyramid
from MapGenerators.HighPlatform import HighPlatform
from MapGenerators.TwoHighTowers import TwoHighTowers
from Player.Player import Player
from spinup import ppo
import tensorflow as tf

env_fn = lambda: gym.make("geofriend2-v0")
# ac_kwargs = dict(hidden_sizes=[64,64], activation=tf.nn.relu)
logger_kwargs = dict(output_dir='spinupPpo', exp_name='experiment')
ppo(env_fn=env_fn, steps_per_epoch=5000, epochs=500, logger_kwargs=logger_kwargs)
from spinup.utils.run_utils import ExperimentGrid
from spinup import ppo
import tensorflow as tf
import gym

# todo: define env_fn in terms of some gym environment
env_fn = lambda: gym.make('LunarLanderContinuous-v2')
ac_kwargs = dict(hidden_sizes=[64, 64], activation=tf.nn.relu)
logger_kwargs = dict(output_dir='data/test1', exp_name='test1')
ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, max_ep_len=1000,
    epochs=500, logger_kwargs=logger_kwargs)
import gym
import gym_env
from spinup import ppo_pytorch as ppo
from gym_env.wrapper import PendulumCostWrapper

env = gym.make('Pendulum-v0')
env._max_episode_steps = 100
env = PendulumCostWrapper(env)
ppo(env_fn=lambda: env, ac_kwargs=dict(hidden_sizes=[16] * 2), gamma=0.99,
    max_ep_len=1000, lam=0.95, epochs=100000, seed=1)
import torch
import gym
import numpy as np
from spinup import ppo_pytorch as ppo


def env_fn():
    import vortex_cartpole
    # We can pass a dictionary of arguments to the environment using kwargs
    # headless : True or False, selects whether or not to use graphics rendering
    # random_reset : if True, a random state is induced when the environment is reset
    kwargs = {"headless": False, "random_reset": True}
    env = gym.make('VortexCartPole-v0', **kwargs)
    return env


# Test training an agent using pytorch PPO
ac_kwargs = dict(hidden_sizes=[32, 32], activation=torch.nn.ReLU)
ppo(env_fn, steps_per_epoch=1000, epochs=50, gamma=0.99, pi_lr=1e-3, vf_lr=1e-3,
    ac_kwargs=ac_kwargs)
def test_atari_env(self):
    ''' Test training a small agent in a simple environment '''
    env_fn = partial(gym.make, 'CarRacing-v0')
    ac_kwargs = dict(hidden_sizes=(32,))
    with tf.Graph().as_default():
        ppo(env_fn, steps_per_epoch=100, epochs=10, ac_kwargs=ac_kwargs)
    return pi, logp, logp_pi


if __name__ == '__main__':
    """
    Run this file to verify your solution.
    """
    from spinup import ppo_tf1 as ppo
    from spinup.exercises.common import print_result
    import gym
    import os
    import numpy as np
    import pandas as pd
    import psutil
    import time

    logdir = "/tmp/experiments/%i" % int(time.time())
    ppo(env_fn=lambda: gym.make('InvertedPendulum-v2'),
        ac_kwargs=dict(policy=mlp_gaussian_policy, hidden_sizes=(64,)),
        steps_per_epoch=4000, epochs=1,
        logger_kwargs=dict(output_dir=logdir))

    # Get scores from last five epochs to evaluate success.
    data = pd.read_table(os.path.join(logdir, 'progress.txt'))
    last_scores = data['AverageEpRet'][-5:]

    # Your implementation is probably correct if the agent has a score >500,
    # or if it reaches the top possible score of 1000, in the last five epochs.
    correct = np.mean(last_scores) > 500 or np.max(last_scores) == 1e3
    print_result(correct)
def ppo_test():
    # ac_kwargs = dict(hidden_sizes=[64,64])
    # logger_kwargs = dict(output_dir='path/to/output_dir', exp_name='experiment_name')
    ppo(env_fn=env, steps_per_epoch=50, epochs=10)
#vpg
# spinup.vpg(
#     env,
#     ac_kwargs={"hidden_sizes":(64,2)},
#     seed = np.random.randint(100),
#     steps_per_epoch=1250,
#     epochs=2500,
#     pi_lr=3e-4,
#     logger_kwargs = {"output_dir" : "logs/vpgrandomtest"}
# )

#ppo
spinup.ppo(env,
           ac_kwargs={"hidden_sizes": (64, 2)},
           seed=np.random.randint(100),
           steps_per_epoch=1250,
           pi_lr=3e-3,
           epochs=2500,
           logger_kwargs={"output_dir": "logs/ppo-dptest-uscaling1-lr3e3"})

#polynomials
# spinup.vpgpolynomial(
#     env,
#     ac_kwargs={"order":3},
#     seed = np.random.randint(100),
#     steps_per_epoch=1250,
#     epochs=2500,
#     pi_lr=2e-5,
#     l1_scaling=0.001,
#     logger_kwargs = {"output_dir" : "logs/polyrandomtest"}
# )
        running = not done
        count += 1
        if count > 100:
            break

    save_gif(color_images, path=color_output)
    save_gif(object_images, path=object_output)

    print("____________________________")
    print("Target: {}".format(env.target))
    print("Reward: {}".format(reward))
    print("____________________________")


def save_gif(images, path="example.gif"):
    with imageio.get_writer(path, mode='I') as writer:
        for image in images:
            writer.append_data(image)


if __name__ == '__main__':
    """
    Run the code to verify the solution
    """
    if to_train:
        logdir = "data/experiments/%i" % int(time.time())
        ppo(env_fn=GoalGridWorld, actor_critic=mlp_actor_critic,
            steps_per_epoch=100000, epochs=100,
            logger_kwargs=dict(output_dir=logdir))
    else:
        logdir = "data/experiments/%i/simple_save/" % int(exp_id)
        simulate(path=logdir)
from spinup import ppo
import tensorflow as tf
import gym
import paint_svg
from paint_svg.algos.ppo.ppo import ppo

env_fn = lambda: gym.make('PaintSvg-v0')
ac_kwargs = dict(hidden_sizes=[64, 64], activation=tf.nn.relu)
logger_kwargs = dict(output_dir='test', exp_name='paint_svg')
ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=250,
    logger_kwargs=logger_kwargs)

# env = gym.make('PaintSvg-v0')
# env.reset()
        super(MoveTowardZ, self).__init__(env)

    def action(self, action):
        action[2] = -.3
        return action


env = gym.make('panda-v0')
env = ProcessFrame84(env)
env = ImageToPyTorch(env)
env = MoveTowardZ(env)
image = env.reset()

plt.figure()
plt.imshow(image.squeeze(), cmap='gray')
plt.title('Example extracted screen')
plt.show()

env_fn = lambda: env
ac_kwargs = dict(hidden_sizes=[18, 64, 64], activation=nn.ReLU)
logger_kwargs = dict(output_dir='spinup', exp_name='panda_ppo')
# ppo(env_fn=env_fn, actor_critic=core.CNNActorCritic, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=250, logger_kwargs=logger_kwargs)
ppo(env_fn=env_fn, actor_critic=core.CNNActorCritic, ac_kwargs=ac_kwargs,
    steps_per_epoch=2, epochs=1, logger_kwargs=logger_kwargs)