log_path = "{}/{}/".format(args.log_folder, args.algo)
save_path = os.path.join(
    log_path, "{}_{}{}".format(env_id, get_latest_run_id(log_path, env_id) + 1, uuid_str))
params_path = "{}/{}".format(save_path, env_id)
os.makedirs(params_path, exist_ok=True)

callbacks = []
if args.save_freq > 0:
    # Account for the number of parallel environments
    args.save_freq = max(args.save_freq // n_envs, 1)
    callbacks.append(
        CheckpointCallback(save_freq=args.save_freq,
                           save_path=save_path,
                           name_prefix='rl_model',
                           verbose=1))


def create_env(n_envs, eval_env=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :return: (Union[gym.Env, VecEnv]) The created environment
    """
    global hyperparams

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env else save_path
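CheckpointCallback writes its snapshots under save_path as rl_model_<num_timesteps>_steps.zip. A minimal sketch of resuming training from the newest snapshot, assuming PPO2 was the algorithm selected via args.algo and that env is the vectorized training environment built by create_env:

import glob
import os

from stable_baselines import PPO2

# Pick the newest checkpoint produced by CheckpointCallback above.
checkpoints = glob.glob(os.path.join(save_path, "rl_model_*_steps.zip"))
latest_checkpoint = max(checkpoints, key=os.path.getmtime)

model = PPO2.load(latest_checkpoint, env=env)  # re-attach the training environment
model.learn(total_timesteps=100000)            # continue training from the snapshot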
e_c = 0.0001  # entropy coefficient
feedback = 'Bayes'  # 'Markov' or 'Bayes'
steady = True  # if True, always reset with steady-state conditions
N = 8  # number of parallel workers
LRo = 2.5e-4  # learning rate
# uact = True  # if we want to use u as action (Bayesian only)
TIMESTEPS = int(50e6)  # training steps
sched_LR = LinearSchedule(1, LRo, 0)  # learning rate schedule
LR = sched_LR.value
qs = 0  # no feedback cost

dirname = 'Fisher_tests_{}RK4_cirand'.format(feedback)  # directory name
title = 'feed{}_steady{}_lro{}_ts{}M_N{}_ec{}_0.49_3e4_theta0.1_Mlp_1e-3_RK4_bothrand_s2'.format(
    feedback, steady, LRo, TIMESTEPS / 1e6, N, e_c)

# make checkpoint callback
checkpoint_callback = CheckpointCallback(
    save_freq=int(100000 / N),
    save_path='./Fisher_nocost_checkpoint/{}/{}_q{}'.format(dirname, title, qs))
callback = checkpoint_callback

# set parameters and start training
params = {
    'k': 1,
    'eta': 1,
    'X_kunit': 0.49,
    'theta': 0.1
}  # if a parameter is set to None it will be sampled from a uniform distribution at every reset
args = {
    'feedback': feedback,
    'params': params
}  # the default parameters are: rewfunc=Tools.purity_like_rew, q=1e-4, dt=1e-3, plot=False, pow=0.5

# instantiate environment
env = make_vec_env(FisherEnv, n_envs=N, env_kwargs=args)
steady = True  # if True, always reset with steady-state conditions
plot = False  # if True, always reset to fixed out-of-equilibrium conditions
N = 1  # number of parallel workers
LRo = 2e-4  # learning rate
TIMESTEPS = int(6e6)  # training steps
sched_LR = LinearSchedule(1, LRo, 0)  # schedule for learning rate reduction
LR = sched_LR.value
clip = LinearSchedule(1, 0.2, 0).value  # schedule for the PPO clipping parameter (optional)

title = 'feed{}_steady{}_lro{}_ts{}M_N{}_ec{}_{}_{}_{}_partial{}_fbound{}_tanh0.01_pur0.5_hurwseedr0_1e5'.format(
    feedback, steady, LRo, TIMESTEPS / 1e6, N, e_c, k, mirr, g, partial, fbound)

# make checkpoint callback
checkpoint_callback = CheckpointCallback(
    save_freq=int(1000000 / N),
    save_path='/home/fallani/prova/New/Optomech_checkpoint/{}/{}_q{}'.format(
        dirname, title, qs))
callback = checkpoint_callback

# set F matrix
zero = np.zeros((2, 2))
if fbound:
    F = np.block([[zero, zero], [zero, np.identity(2)]])  # custom F matrix
else:
    F = np.identity(4)
P = np.block([[np.identity(2), zero], [zero, zero]])

# set parameters and start training
params = par.parameters(k=k, mirr=mirr, g=g)
# Alternative parameters from Hammerer, usually not needed:
# params = {'wm': 1, 'k': 0.5, 'y': 2e-7, 'eta': 1, 'g': 0.3, 'detuning': -1, 'ne': 3.5e5, 'na': 0, 'phi': math.pi/2}
# params = {'wm': 1, 'k': 5, 'y': 1.14e-4, 'eta': 1, 'g': 0.095, 'detuning': 0, 'ne': 2, 'na': 0, 'phi': math.pi*0.25}
# if a parameter is set to None it will be sampled from a uniform distribution at every reset
args = {
    'feedback': feedback,
import warnings

import numpy as np
from stable_baselines import PPO2
from stable_baselines.common import make_vec_env
from stable_baselines.common.callbacks import (CallbackList, CheckpointCallback,
                                               EvalCallback,
                                               StopTrainingOnRewardThreshold)

warnings.filterwarnings('ignore')

envArgsDict = {
    'resizeCamImagePct': 50,
    'ledHSVLower': np.array([0, 0, 252]),
    'ledHSVHigher': np.array([31, 9, 255]),
    'rPiIP': '192.168.0.183',
    'rPiPort': 50000,
    'episodeLength': 100,
    'bullseye': 10
}

env = make_vec_env(RPiLEDEnv, n_envs=1, env_kwargs=envArgsDict)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-500, verbose=1)
eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=500,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)
# Added a checkpoint because I lost model data after a crash when the webcam
# shut down (the screen went to sleep).
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='ppo2_model')
cb = CallbackList([checkpoint_callback, eval_callback])

policy_kwargs = {'layers': [128, 128]}
model = PPO2.load('/Users/guillaumevandecasteele/PycharmProjects/robotics/ppo1_rpi_led_nn128.zip',
                  verbose=1,
                  policy_kwargs=policy_kwargs,
                  tensorboard_log='./logs/')
model.set_env(env)
model.learn(total_timesteps=20000, callback=cb)
model.save("ppo2_rpi_led_pargs")
def run(alg, alg_kwargs, task, task_kwargs, wrappers_kwargs, expl_params,
        rollout, num_trials, folder, n_thrds, n_lstm, rerun=False,
        test_kwargs={}, num_retrains=10, seed=0, train_mode=None,
        sl_kwargs=None):
    train_mode = train_mode or 'RL'
    env = test_env(task, kwargs=task_kwargs, num_steps=1000)
    num_timesteps = int(1000 * num_trials / (env.num_tr))
    files = glob.glob(folder + '/*model*')
    vars_ = {
        'alg': alg,
        'alg_kwargs': alg_kwargs,
        'task': task,
        'task_kwargs': task_kwargs,
        'wrappers_kwargs': wrappers_kwargs,
        'expl_params': expl_params,
        'rollout': rollout,
        'folder': folder,
        'num_trials': num_trials,
        'n_thrds': n_thrds,
        'n_lstm': n_lstm
    }
    np.savez(folder + '/params.npz', **vars_)
    if len(files) == 0 or rerun:
        if train_mode == 'RL':
            if alg == "A2C":
                from stable_baselines import A2C as algo
            elif alg == "ACER":
                from stable_baselines import ACER as algo
            elif alg == "ACKTR":
                from stable_baselines import ACKTR as algo
            elif alg == "PPO2":
                from stable_baselines import PPO2 as algo
            env = SubprocVecEnv([
                make_env(env_id=task, rank=i, seed=seed,
                         wrapps=wrappers_kwargs, **task_kwargs)
                for i in range(n_thrds)
            ])
            model = algo(LstmPolicy, env, verbose=0, n_steps=rollout,
                         n_cpu_tf_sess=n_thrds, tensorboard_log=None,
                         policy_kwargs={"feature_extraction": "mlp",
                                        "n_lstm": n_lstm},
                         **alg_kwargs)
            # this assumes 1 trial ~ 10 steps
            sv_freq = 5 * wrappers_kwargs['MonitorExtended-v0']['sv_per']
            chckpnt_cllbck = CheckpointCallback(save_freq=sv_freq,
                                                save_path=folder,
                                                name_prefix='model')
            model.learn(total_timesteps=num_timesteps, callback=chckpnt_cllbck)
            model.save(f"{folder}/model_{num_timesteps}_steps.zip")
            plotting.plot_rew_across_training(folder=folder)
        elif train_mode == 'SL':
            stps_ep = sl_kwargs['steps_per_epoch']
            wraps_sl = deepc(wrappers_kwargs)
            del wraps_sl['PassAction-v0']
            del wraps_sl['PassReward-v0']
            del wraps_sl['MonitorExtended-v0']
            env = make_env(env_id=task, rank=0, seed=seed, wrapps=wraps_sl,
                           **task_kwargs)()
            dataset = ngym.Dataset(env, batch_size=sl_kwargs['btch_s'],
                                   seq_len=rollout, batch_first=True)
            obs_size = env.observation_space.shape[0]
            act_size = env.action_space.n
            model = define_model(seq_len=rollout, num_h=n_lstm,
                                 obs_size=obs_size, act_size=act_size,
                                 batch_size=sl_kwargs['btch_s'],
                                 stateful=sl_kwargs['stateful'],
                                 loss=sl_kwargs['loss'])
            # Train network
            data_generator = (dataset() for i in range(stps_ep))
            model.fit(data_generator, verbose=1, steps_per_epoch=stps_ep)
            model.save(f"{folder}/model_{stps_ep}_steps")
    if len(test_kwargs) != 0:
        for key in test_kwargs.keys():
            sv_folder = folder + key
            test_kwargs[key]['seed'] = seed
            if train_mode == 'RL':
                ga.get_activity(folder, alg, sv_folder, **test_kwargs[key])
            elif train_mode == 'SL':
                stps_ep = sl_kwargs['steps_per_epoch']
                wraps_sl = deepc(wrappers_kwargs)
                wraps_sl.update(test_kwargs[key]['wrappers'])
                del wraps_sl['PassAction-v0']
                del wraps_sl['PassReward-v0']
                env = make_env(env_id=task, rank=0, seed=seed,
                               wrapps=wraps_sl, **task_kwargs)()
                obs_size = env.observation_space.shape[0]
                act_size = env.action_space.n
                model_test = define_model(seq_len=1, batch_size=1,
                                          obs_size=obs_size,
                                          act_size=act_size,
                                          stateful=sl_kwargs['stateful'],
                                          num_h=n_lstm,
                                          loss=sl_kwargs['loss'])
                # apply replace() to the whole path, not only to the '_steps' literal
                ld_f = (folder + 'model_' + str(stps_ep) + '_steps').replace('//', '/')
                model_test.load_weights(ld_f)
                env.reset()
                for ind_stp in range(sl_kwargs['test_steps']):
                    obs = env.ob_now
                    obs = obs[np.newaxis]
                    obs = obs[np.newaxis]
                    action = model_test.predict(obs)
                    action = np.argmax(action, axis=-1)[0]
                    _, _, _, _ = env.step(action)
def main():
    """ Prepare for training """
    log_dir, model_dir = prepare_dirs()

    model_name = model_dir + '/' + MODEL_NAME
    print(f'model will be saved as {model_name}')

    log_dir = log_dir + '/' + MODEL_NAME

    """ Generate & check environment """
    env_name = ENV_NAME
    env = gym.make(env_name)
    # print(f'Observation space: {env.observation_space}')
    # print(f'Action space: {env.action_space}')
    # env = Monitor(env, log_dir, allow_early_resets=True)
    # check_env(env)

    """ Save config as pickle file """
    config = summarize_config(env)
    save_config(log_dir, config)

    """ Vectorize environment """
    num_envs = NUM_ENVS
    # For training (note: every worker closes over the same env instance)
    env = DummyVecEnv([lambda: env for _ in range(num_envs)])
    eval_env = DummyVecEnv([lambda: gym.make(env_name)])  # For evaluation

    """ Define checkpoint callback """
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=model_name,
                                             name_prefix=MODEL_NAME)

    """ Use deterministic actions for evaluation callback """
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=model_name,
                                 log_path=log_dir,
                                 eval_freq=EVAL_FREQ,
                                 deterministic=True,
                                 render=False,
                                 n_eval_episodes=N_EVAL_EPISODES)

    print(f'Algorithm: {ALGORITHM}\n')

    if not CONTINUAL_LEARNING:
        """ Define model """
        model = define_model(env, log_dir)
    else:
        model = load_model(env, model_dir, log_dir)

    """ Evaluate model before training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                           env=eval_env,
    #                                           n_eval_episodes=N_EVAL_EPISODES)
    # print(f'Before training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

    """ Train model """
    model.learn(total_timesteps=MAX_STEPS,
                callback=[checkpoint_callback, eval_callback])

    """ Evaluate model after training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                           env=eval_env,
    #                                           n_eval_episodes=N_EVAL_EPISODES)
    # print(f'After training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

    """ Save trained model """
    model.save(model_name)

    """ Test trained model """
    obs = eval_env.reset()
    for i in range(N_EVAL_EPISODES):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()

    env.close()
    eval_env.close()
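EvalCallback above stores its best candidate as best_model.zip inside best_model_save_path (here model_name). A minimal sketch of reloading and re-scoring that model after training, assuming the algorithm behind define_model is PPO2 (the helper itself is not shown in this snippet):

import os

from stable_baselines import PPO2
from stable_baselines.common.evaluation import evaluate_policy

# best_model.zip is the file name EvalCallback uses for its best checkpoint
best_model = PPO2.load(os.path.join(model_name, 'best_model.zip'))
mean_reward, std_reward = evaluate_policy(best_model, eval_env,
                                          n_eval_episodes=N_EVAL_EPISODES)
print(f'best model: mean reward {mean_reward:.2f} +/- {std_reward:.2f}')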
import numpy as np
import gym
from stable_baselines import PPO2
from stable_baselines.common.policies import CnnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, VecFrameStack
from stable_baselines.common.callbacks import CheckpointCallback

from utils import *

gamename = "MortalKombat3-Genesis"

if __name__ == "__main__":
    n_cpu = 16
    env = SubprocVecEnv([make_env] * n_cpu)
    env = VecFrameStack(env, n_stack=4)

    model = PPO2(CnnLstmPolicy, env, n_steps=128, verbose=1,
                 tensorboard_log="./tboard_log")

    # Use this if you want to continue training a saved model
    # model = PPO2.load("training_checkpoints/your_model.zip", tensorboard_log="./tboard_log")
    # model.set_env(env)

    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             save_path='./training_checkpoints',
                                             name_prefix='subzero-ppo2')

    model.learn(total_timesteps=20000000, callback=checkpoint_callback)
    model.save('subzero-ppo2')
    env.close()
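The make_env factory comes from the project's utils module and is not shown here. Purely as an illustration of the kind of zero-argument factory SubprocVecEnv expects, a gym-retro based version might look like the sketch below; the WarpFrame preprocessing is an assumption, not the project's actual wrapper stack.

import retro
from stable_baselines.common.atari_wrappers import WarpFrame


def make_env():
    # Hypothetical stand-in for utils.make_env: create the Genesis environment
    # and shrink frames to 84x84 grayscale so the CNN policy gets image input.
    env = retro.make(game=gamename)
    return WarpFrame(env)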
def main(logdir):
    # params
    SLEEP_RATE = 100  # 1, 2, 10, 50 or 100 Hz
    EPISODE_TIME = 30  # 30 or 120 sec
    USE_MPC = False
    N_EPISODE = 1000000
    Action_Choice = np.array([1, 1, 1, 1, 0, 0, 0, 0])

    EPISODE_LENGTH = SLEEP_RATE * EPISODE_TIME
    TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

    # logdir
    logdir = os.path.join(logdir, strftime("%Y-%m-%d--%H:%M:%S", localtime()))
    os.makedirs(logdir)
    checkpoint_path = os.path.join(logdir, 'checkpoint')
    callback_path = logdir
    final_model_path = logdir + '/final_model'

    # env
    env = BlimpEnv(SLEEP_RATE, EPISODE_TIME, USE_MPC, Action_Choice)
    env = Monitor(env, logdir)
    # env = make_vec_env(lambda: env, n_envs=1, monitor_dir=logdir)
    print("Observation space:", env.observation_space)
    print("Shape:", env.observation_space.shape)
    print("Action space:", env.action_space)

    # callback
    SAVE_FREQ = EPISODE_LENGTH * 100  # save the model every 100 episodes
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=checkpoint_path,
                                             name_prefix='sac_callback_model')
    save_on_best_training_reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=SAVE_FREQ, log_dir=callback_path)
    callback = CallbackList(
        [checkpoint_callback, save_on_best_training_reward_callback])

    # agent
    model = SAC(MlpPolicy, env,
                gamma=0.98,
                learning_rate=0.0003,
                buffer_size=1000000,
                learning_starts=EPISODE_LENGTH * 20,
                train_freq=1,
                batch_size=256,
                tau=0.01,
                ent_coef='auto',
                target_update_interval=1,
                gradient_steps=1,
                target_entropy='auto',
                action_noise=None,
                verbose=1,
                tensorboard_log=logdir,
                full_tensorboard_log=True,
                _init_setup_model=True)

    print("---------- Start Learning -----------")
    model.learn(total_timesteps=TOTAL_TIMESTEPS,
                log_interval=SAVE_FREQ,
                callback=callback)
    print("---------- Finish Learning ----------")

    model.save(final_model_path)
    del model  # remove to demonstrate saving and loading

    model = SAC.load(final_model_path)
    results_plotter.plot_results([logdir], TOTAL_TIMESTEPS,
                                 results_plotter.X_TIMESTEPS, "SAC BLIMP")
    plt.show()
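SaveOnBestTrainingRewardCallback is not defined in this snippet. A minimal version in the spirit of the Stable Baselines custom-callback example could look like the following; it relies on the Monitor wrapper used above to log episode rewards into log_dir.

import os

import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy


class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Save the model whenever the mean reward over the last 100 episodes
    (read from the Monitor logs) reaches a new best."""

    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Read episode rewards from the Monitor log files
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    if self.verbose > 0:
                        print("Saving new best model to {}".format(self.save_path))
                    self.model.save(self.save_path)
        return True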
pathlib.Path("./models").mkdir(exist_ok=True)
pathlib.Path("./models/checkpoints").mkdir(exist_ok=True)

env = WarehouseEnv('7x7_4bins_2items_2binslots_1agentslots')

model = DQN(CustomDQNPolicy, env,
            verbose=1,
            exploration_fraction=0.95,
            exploration_initial_eps=1,
            exploration_final_eps=0.05,
            batch_size=32,
            buffer_size=50000)

checkpoint_callback = CheckpointCallback(save_freq=50000,
                                         save_path='./models/checkpoints/',
                                         name_prefix=prefix)

# episode_plot_freq = n: update the plots every n time steps.
# update_stats_every = m: update the stats used in the plots every m episodes.
# Note: update_stats_every > 1 loses some information in the plots (not in the
# training process) but improves performance during training.
plt_callback = plotcallback(episode_plot_freq=10000,
                            update_stats_every=1,
                            average_size=100,
                            verbose=1,
                            plot_prefix=prefix,
                            plot_dir="./Plots")

callbacks = CallbackList([checkpoint_callback, plt_callback])

model.learn(total_timesteps=total_timesteps, callback=callbacks)
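CustomDQNPolicy is defined elsewhere in the project. A plausible definition following the Stable Baselines custom-policy pattern is sketched below; the layer sizes are an assumption.

from stable_baselines.deepq.policies import FeedForwardPolicy


class CustomDQNPolicy(FeedForwardPolicy):
    # Hypothetical architecture: a two-layer MLP Q-network without layer normalization.
    def __init__(self, *args, **kwargs):
        super(CustomDQNPolicy, self).__init__(*args, **kwargs,
                                              layers=[64, 64],
                                              layer_norm=False,
                                              feature_extraction="mlp")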