def main(): env = DummyVecEnv([lambda: WeightEnv()]) env.env_method("seed", 0) model = PPO2(MlpPolicy, env, tensorboard_log="/tmp/foo") model.learn(total_timesteps=1000000) obs = env.reset() print('position velocity accel jerk reward') for i in range(2000): action, _states = model.predict(obs) obs, rewards, done, info = env.step(action) if done: printit(info[0]['terminal_observation'], rewards[0]) print("") printit(obs[0], rewards[0])
def test_agent(agent_step): for coef_index in range(len(CLAC_COEFS)): mut_coef = CLAC_COEFS[coef_index] if (agent_step == 1): print(mut_coef, " ", NUM_TRAINING_STEPS, " ", ENVIRONMENT_NAME, " ", FOLDER) features = pd.DataFrame() mirl_env = gym.make(ENVIRONMENT_NAME) mirl_env = DummyVecEnv([lambda: mirl_env]) mirl_model = CLAC(CLAC_MlpPolicy, mirl_env, mut_inf_coef=mut_coef, coef_schedule=3.3e-4, verbose=1) (mirl_model, learning_results) = mirl_model.learn( total_timesteps=NUM_TRAINING_STEPS, log_interval=10) learning_results['AgentID'] = agent_step learning_results.to_pickle(FOLDER + "/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") for resample_step in range(1, NUM_RESAMPLES): # Set both environments to the same resampled values if (RANDOMIZATION_LEVEL == "Normal"): mirl_env.env_method("randomize", 0) elif (RANDOMIZATION_LEVEL == "Extreme"): mirl_env.env_method("randomize", 1) elif (RANDOMIZATION_LEVEL == "Test"): mirl_env.env_method("randomize", -1) else: print("Error resampling unknown value: ", RANDOMIZATION_LEVEL) continue if (agent_step == 1): print(mut_coef, " ", NUM_TRAINING_STEPS, " ", ENVIRONMENT_NAME, " ", FOLDER, " resample step ", resample_step) (mirl_model, learning_results) = mirl_model.learn( total_timesteps=NUM_TRAINING_STEPS, reset_num_timesteps=False, log_interval=10) learning_results.to_pickle(FOLDER + "/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") mirl_model.save(FOLDER + "/models/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0") del mirl_model del mirl_env
def main(): env = DummyVecEnv([lambda: EngineEnv()]) #env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.) #env = VecNormalize(env) env.env_method("seed", 0) # more value function loss model = PPO2(MlpPolicy, env, vf_coef=10.0, tensorboard_log="/tmp/foo") #model = A2C(MlpPolicy, env, tensorboard_log="/tmp/foo") #model = SAC(MlpPolicy, env, tensorboard_log="/tmp/foo") #model = SAC(CustomSACPolicy, env, tensorboard_log="/tmp/foo") model.learn(total_timesteps=1000000) #model.learn(total_timesteps=400000) #model.learn(total_timesteps=500000) obs = env.reset() print('map far_err afr reward') # 1234567890 1234567890 1234567890 1 for i in range(2000): action, _states = model.predict(obs) obs, rewards, done, info = env.step(action) #if done: # printit(info[0]['terminal_observation'], rewards[0]) # print("") printit(obs[0], rewards[0])
checkpoint = 'fresh' elif args.checkpoint in ["BEST", "best", "Best"]: checkpoint = bestperformingcheckpoint( os.path.join("./saves/", args.tag)) else: checkpoint = os.path.join("./saves/", args.tag, args.checkpoint.split('/')[-1]) if not args.test: env = DummyVecEnv([ lambda: env_generator(ep_len=args.episode_length, total_sweeps=args.total_sweeps) ]) #env = VecNormalize(env, norm_obs=False, norm_reward=True, training=True) env.env_method('set_experiment_tag', indices=[0], tag=args.tag) env.env_method('init_HamiltonianGetter', indices=[0], phase='TRAIN') env.env_method('set_max_ep_length', indices=[0], max_ep_length=args.episode_length) print("Attempting to restore model") try: print("Loading model") model = PPO2.load(checkpoint, env=env, **model_args) except Exception as EEE: print(EEE) print("ERROR restoring model. Starting from scratch") print("Initializing model") model = PPO2( env=env,
def validation(checkpoint_name, num_hamiltonians=20, num_trials=10, mode='validation'): tf.config.set_soft_device_placement(True) with tf.device("/gpu:1"): env = DummyVecEnv([ lambda: env_generator(ep_len=args.episode_length, total_sweeps=args.total_sweeps) ]) env.env_method('set_experiment_tag', indices=[0], tag=args.tag) env.env_method('set_max_ep_length', indices=[0], max_ep_length=args.episode_length) if mode == 'test': env.env_method("toggle_datadump_on", indices=[0]) env.env_method('init_HamiltonianGetter', indices=[0], phase='TEST', directory=args.hamiltonian_directory) model = PPO2.load(checkpoint_name, env=env, **model_args) env = model.get_env() env.env_method("init_HamiltonianSuccessRecorder", indices=[0], num_hamiltonians=num_hamiltonians, num_trials=num_trials) env.env_method("set_static_Hamiltonian_by_ID", indices=[0], ID=0) if args.destructive: env.env_method('set_destructive_observation_on', indices=[0]) obs = env.reset() test_ep = -1 inftime = 0 envtime = 0 count = 0 for ham in range(num_hamiltonians): env.env_method("set_static_Hamiltonian_by_ID", indices=[0], ID=ham) for trial in range(num_trials): test_ep += 1 state = None done = [False for _ in range(env.num_envs)] step = -1 while True: step += 1 tick = time.time() action, state = model.predict(obs, state=state, mask=done, deterministic=True) tock = time.time() if step > 3 and step < 35: inftime += tock - tick tick = time.time() obs, reward, d, _ = env.step(action) tock = time.time() if step > 3 and step < 35: envtime += tock - tick count += 1 # if test_ep==10000: # env.env_method("toggle_datadump_off", indices=[0]) if d: break if mode == 'test': env.env_method("hsr_write") print(f"Total inference time: {inftime}s") print(f"Total environment time: {envtime}s") print(f"Total count: {count}") print(f"Time per inference call: {inftime/count}") print(f"Time time in environment: {envtime/count}") if mode == 'validation': #only want to log if in validation mode (i.e. validation during testing) p = env.env_method('get_hamiltonian_success_probability', indices=[0])[0] if args.wandb_project != "disable": wandb.log({"Probability of success": p}) archive = checkpoint_name.replace( "saved_model", f"archived_p{p:06.3f}_{uuid4()}") print( f"Archiving checkpoint. Copying {checkpoint_name} to {archive}" ) shutil.copy(checkpoint_name + ".zip", archive + ".zip")
rewards_time_list_fixed_2 = [] avg_rewards_time_list_fixed_2 = [] rewards_bak_list_fixed_2 = [] avg_rewards_bak_list_fixed_2 = [] rewards_bat_list_fixed_2 = [] avg_rewards_bat_list_fixed_2 = [] avg_rewards_energy_list_fixed_2 = [] fixed_2_data = [] s = 1 t_range = 100 set_seed(rand_seed) obs = env.reset() for i in range(t_range): action = env.env_method('myopic_action_cal') obs, rewards, dones, info = env.step(action) rewards_list_myopic.append(1 / rewards / s) avg_rewards_myopic.append(np.mean(rewards_list_myopic[:])) t, bak, bat = env.render() rewards_time_list_myopic.append(t / s) avg_rewards_time_list_myopic.append(np.mean(rewards_time_list_myopic[:])) rewards_bak_list_myopic.append(bak / s) avg_rewards_bak_list_myopic.append(np.mean(rewards_bak_list_myopic[:])) rewards_bat_list_myopic.append(bat / s) avg_rewards_bat_list_myopic.append(np.mean(rewards_bat_list_myopic[:])) avg_rewards_energy_list_myopic.append(avg_rewards_bak_list_myopic[-1] + avg_rewards_bat_list_myopic[-1]) myopic_data.append([ avg_rewards_time_list_myopic[-1], avg_rewards_bak_list_myopic[-1], avg_rewards_bat_list_myopic[-1]
def solve(supply_distribution: Tuple[dict, list], demand_distribution: Tuple[dict, list], model_name: str, export_model: str, max_age: int, demand: int, doi: int, n_warm_start_days: int, n_days: int, obs_method: int, state_type: str) -> dict: """ :param demand_distribution: Tuple[dict, list] containing a dict with {blood_group : distribution}, list of included antigens :param supply_distribution: Tuple[dict, list] containing a dict with {blood_group : distribution}, list of included antigens :param model_name: str, name of the model that is used to store the results :param export_model: str, name of hte model that is trained :param max_age: int, max age of the RBCs :param demand: int, number of demand / supply per day :param doi: days of inventory, the number of days the inventory is filled before first supply :param n_warm_start_days: int, number of days of warm start :param n_days: int, number of days for evaluation :param obs_method: int, 1 or 2: item requested one-hot-encoded (1) or binary (2) :param state_type: type of state that is used 'custom category' :return: """ # Get model ready env = environment.Env(supply_distribution[0], demand_distribution[0], max_age, demand, doi, obs_method=obs_method, state_type=state_type, file_name=model_name) env = DummyVecEnv([lambda: env]) model = PPO2.load(export_model, env=env) # Run model obs = env.reset() # Warm start print('warm start - started') env.env_method('set_days', n_warm_start_days) done = False while not done: action, _states = model.predict(obs, deterministic=True) obs_next, rewards, done, info = env.step(action) obs = obs_next print('warm start - ended') # Testing print('Testing - started') env.env_method('set_days', n_days) env.env_method('change_eval_boolean', True) done = False while not done: action, _states = model.predict(obs, deterministic=True) obs_next, rewards, done, info = env.step(action) obs = obs_next results = env.env_method('render_blood_specific') # get evaluation metrics print('Testing - ended') return results
def test_agent(agent_step): for coef_index in range(len(CLAC_COEFS)): mut_coef = CLAC_COEFS[coef_index] ent_coef = SAC_COEFS[coef_index] if (agent_step == 1): print(mut_coef, " ", ent_coef, " ", NUM_TRAINING_STEPS, " ", ENVIRONMENT_NAME, " ", FOLDER) features = pd.DataFrame() clac_env = gym.make(ENVIRONMENT_NAME) clac_env = DummyVecEnv([lambda: clac_env]) clac_model = CLAC(CLAC_MlpPolicy, clac_env, mut_inf_coef=mut_coef, verbose=1) sac_env = gym.make(ENVIRONMENT_NAME) sac_env = DummyVecEnv([lambda: sac_env]) sac_model = SAC(MlpPolicy, sac_env, ent_coef=ent_coef, verbose=1) mirl_env = gym.make(ENVIRONMENT_NAME) mirl_env = DummyVecEnv([lambda: mirl_env]) mirl_model = CLAC(CLAC_MlpPolicy, mirl_env, mut_inf_coef=mut_coef, coef_schedule=3.3e-4, verbose=1) (clac_model, learning_results) = clac_model.learn( total_timesteps=NUM_TRAINING_STEPS, log_interval=1000) learning_results['AgentID'] = agent_step learning_results.to_pickle(FOLDER + "/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") (sac_model, learning_results) = sac_model.learn( total_timesteps=NUM_TRAINING_STEPS, log_interval=1000) learning_results['AgentID'] = agent_step learning_results.to_pickle(FOLDER + "/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") (mirl_model, learning_results) = mirl_model.learn( total_timesteps=NUM_TRAINING_STEPS, log_interval=1000) learning_results['AgentID'] = agent_step learning_results.to_pickle(FOLDER + "/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") for resample_step in range(1, NUM_RESAMPLES): # Set both environments to the same resampled values if (RANDOMIZATION_LEVEL == "Normal"): clac_env.env_method("randomize", 0) elif (RANDOMIZATION_LEVEL == "Random"): clac_env.env_method("randomize", 1) elif (RANDOMIZATION_LEVEL == "Extreme"): clac_env.env_method("randomize", 2) elif (RANDOMIZATION_LEVEL == "Test"): clac_env.env_method("randomize", -1) else: print("Error resampling unknown value: ", RANDOMIZATION_LEVEL) continue env_features = clac_env.env_method("get_features")[0] sac_env.env_method("set_features", env_features) mirl_env.env_method("set_features", env_features) if (agent_step == 1): print(env_features) Power = env_features[0] Density = env_features[1] Friction = env_features[2] Gravity = env_features[3] d = { "Mut Coefficient": mut_coef, "Ent Coefficient": ent_coef, "Resample Step": resample_step, "Power": Power, "Density": Density, "Friction": Friction, "Gravity": Gravity } features = features.append(d, ignore_index=True) (clac_model, learning_results) = clac_model.learn( total_timesteps=NUM_TRAINING_STEPS, reset_num_timesteps=False, log_interval=1000) learning_results.to_pickle(FOLDER + "/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") (sac_model, learning_results) = sac_model.learn( total_timesteps=NUM_TRAINING_STEPS, reset_num_timesteps=False, log_interval=1000) learning_results.to_pickle(FOLDER + "/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") (mirl_model, learning_results) = mirl_model.learn( total_timesteps=NUM_TRAINING_STEPS, reset_num_timesteps=False, log_interval=1000) learning_results.to_pickle(FOLDER + "/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") clac_model.save(FOLDER + "/models/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0") sac_model.save(FOLDER + "/models/SAC_" 
+ str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_0") mirl_model.save(FOLDER + "/models/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0") features.to_pickle(FOLDER + "/features/features_" + str(agent_step) + "_" + str(mut_coef) + "_" + str(ent_coef) + ".pkl") #print(features) del sac_model del sac_env del clac_model del clac_env del mirl_model del mirl_env
parser.add_argument("-b","--betainit", help="Initial inverse temperature", default="") args = parser.parse_args() silent = True from train import episode_length experiment_name=sys.argv[1] experiment_description="""Reward is the negative of the minimum energy at episode termination, with no episode termination if negative beta encountered""" beta_init = float(args.betainit) env = DummyVecEnv([lambda: env_generator(ep_len=episode_length, total_sweeps=episode_length*100, beta_init_function=lambda: beta_init )]) env = VecNormalize(env, norm_obs=False, norm_reward=False, training=False) env.env_method('set_experiment_tag', indices=[0], tag=args.tag) env.env_method('set_max_ep_length', indices=[0], max_ep_length=episode_length) env.env_method("toggle_datadump_on", indices=[0]) #env.env_method('init_HamiltonianGetter', indices=[0], phase='TEST', directory=args.hamiltonian_directory ) env.env_method('init_HamiltonianGetter', indices=[0], phase='WSC', directory=args.hamiltonian_directory ) #Attempting to restore most recently saved model print("!!!!!!!!!!!!!!!!!!!!!!!") max_path = mostrecentmodification(os.path.join("./saves", 'WSC'))#args.tag)) print(f" Attempting to restore model {max_path}") model = PPO2.load(max_path, env=env, **model_args) print("!!!!!!!!!!!!!!!!!!!!!!!") env = model.get_env()
delayPerTask = delayPerTask / len(info[0]['info']) ep_instr.append(totalInstr) ep_power.append(info[0]['power_consumed']) ep_power_per_instr.append(ep_power[-1] / ep_instr[-1]) ep_delay_per_task.append(delayPerTask) wandb.log({ 'ep_instr': ep_instr[-1], 'ep_power': ep_power[-1], 'ep_power_per_instr': ep_power_per_instr[-1], 'ep_delay_per_task': ep_delay_per_task[-1] }) cumInstructions += totalInstr avgDelayPerTask += delayPerTask if num_test == 1: writeOutputFile('out.csv', info[0]['info']) env.env_method('graphShow', 'power') env.env_method('graphShow', 'temp') avgPowerPerInstr = avgPowerPerInstr / (num_test - skip) cumInstructions = cumInstructions / (num_test - skip) avgDelayPerTask = avgDelayPerTask / (num_test - skip) print("Mean Instruction Count per episode = \t" + str(cumInstructions)) print("avgPowerPerInstr = \t\t" + str(avgPowerPerInstr)) # print("cumInstructions = " + str(cumInstructions)) print("avgDelayPerTask = \t\t" + str(avgDelayPerTask)) print("skips = \t\t" + str(skip)) wandb.config.mean_ep_instr = np.mean(ep_instr) wandb.config.mean_power = np.mean(ep_power) wandb.config.mean_power_per_instr = np.mean(ep_power_per_instr) wandb.config.mean_delay_per_task = np.mean(ep_delay_per_task)
from stable_baselines import PPO2 from stable_baselines.common.policies import MlpPolicy from stable_baselines.common.vec_env import DummyVecEnv import argparse from datetime import datetime parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("configPath", help="Enter config path") args = parser.parse_args() path = args.configPath env = DummyVecEnv([lambda: TradingEnv(path)]) config = configparser.ConfigParser() config.read(path) model = PPO2.load(config['MAIN']['Model']) obs = env.reset() for i in range(int(config['MAIN']['TestSteps'])): print(i) action, _states = model.predict(obs) obs, rewards, done, info = env.step(action) env.render() env.env_method("save_results")
def test_agent(agent_step): now = time.time() for coef_index in range(len(CLAC_COEFS)): mut_coef = CLAC_COEFS[coef_index] ent_coef = SAC_COEFS[coef_index] training_timestep = 0 if(agent_step == 1): print(mut_coef, " ", ent_coef, " ", NUM_TRAINING_STEPS, " ", ENVIRONMENT_NAME, " ", FOLDER) features = pd.DataFrame() clac_env = gym.make(ENVIRONMENT_NAME) clac_env = DummyVecEnv([lambda: clac_env]) clac_model = CLAC(CLAC_MlpPolicy, clac_env, mut_inf_coef=mut_coef, verbose=1) sac_env = gym.make(ENVIRONMENT_NAME) sac_env = DummyVecEnv([lambda: sac_env]) sac_model = SAC(MlpPolicy, sac_env, ent_coef=ent_coef, verbose=1) mirl_env = gym.make(ENVIRONMENT_NAME) mirl_env = DummyVecEnv([lambda: mirl_env]) mirl_model = CLAC(CLAC_MlpPolicy, mirl_env, mut_inf_coef=mut_coef, coef_schedule=3.3e-3, verbose=1) (clac_model, learning_results) = clac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000) learning_results['AgentID'] = agent_step learning_results.to_pickle(FOLDER + "/Training/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") (sac_model, learning_results) = sac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000) learning_results['AgentID'] = agent_step learning_results.to_pickle(FOLDER + "/Training/results/SAC_"+ str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") (mirl_model, learning_results) = mirl_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000) learning_results['AgentID'] = agent_step learning_results.to_pickle(FOLDER + "/Training/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") training_timestep += NUM_TRAINING_STEPS sac_env.env_method("set_features", env_features) mirl_env.env_method("set_features", env_features) #if(agent_step == 0): # print(env_features) Power = env_features[0] Density = env_features[1] Friction = env_features[2] Gravity = env_features[3] d = {"Mut Coefficient": mut_coef, "Ent Coefficient": ent_coef, "Resample Step":resample_step, "Power": Power, "Density": Density, "Friction": Friction, "Gravity": Gravity} #d = {"Mut Coefficient": mut_coef, "Resample Step":resample_step, "Power": Power, "Density": Density, "Friction": Friction, "Gravity": Gravity} features = features.append(d, ignore_index = True) # Train generalization eval_results = eval_model(clac_model, env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, 0) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Generalization/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") eval_results = eval_model(sac_model, env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, 0) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Generalization/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") eval_results = eval_model(mirl_model, env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, 0) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Generalization/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") clac_model.save(FOLDER + "/Training/models/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0") sac_model.save(FOLDER + "/Training/models/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_0") mirl_model.save(FOLDER + "/Training/models/MIRL_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_0") #features.to_pickle(FOLDER + "/features/features_" + str(agent_step) + "_" + 
str(mut_coef) + "_" + str(ent_coef) + ".pkl") for resample_step in range(1, NUM_RESAMPLES): if(agent_step == 1): print(mut_coef, " ", ent_coef, " ", NUM_TRAINING_STEPS, " ", ENVIRONMENT_NAME, " ", FOLDER, " resample step ", resample_step) (clac_model, learning_results) = clac_model.learn(total_timesteps=NUM_TRAINING_STEPS, reset_num_timesteps=False, log_interval=1000) learning_results.to_pickle(FOLDER + "/Training/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") (sac_model, learning_results) = sac_model.learn(total_timesteps=NUM_TRAINING_STEPS, reset_num_timesteps=False, log_interval=1000) learning_results.to_pickle(FOLDER + "/Training/results/SAC_"+ str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") (mirl_model, learning_results) = mirl_model.learn(total_timesteps=NUM_TRAINING_STEPS, reset_num_timesteps=False, log_interval=1000) learning_results.to_pickle(FOLDER + "/Training/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") training_timestep += NUM_TRAINING_STEPS clac_model.save(FOLDER + "/Training/models/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step)) sac_model.save(FOLDER + "/Training/models/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step)) mirl_model.save(FOLDER + "/Training/models/MIRL_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step)) #print(features) del sac_model del sac_env del clac_model del clac_env del mirl_model del mirl_env later = time.time() difference = int(later - now) print("Tested Agent Time: ", difference)
parser.add_argument("-d","--hamiltonian_directory", help="Hamiltonian directory", default="") parser.add_argument("--destructive", default=False, action='store_true', help='Whether or not to use destructive observation.') args = parser.parse_args() silent = True from train import episode_length experiment_name=sys.argv[1] experiment_description="""Reward is the negative of the minimum energy at episode termination, with no episode termination if negative beta encountered""" env = DummyVecEnv([lambda: env_generator(ep_len=episode_length, total_sweeps=episode_length*1, beta_init_function=lambda: 0.3)]) env = VecNormalize(env, norm_obs=False, norm_reward=False, training=False) env.env_method('set_experiment_tag', indices=[0], tag=args.tag) env.env_method('set_max_ep_length', indices=[0], max_ep_length=episode_length) env.env_method("toggle_datadump_on", indices=[0]) env.env_method('init_HamiltonianGetter', indices=[0], phase='TEST', directory=args.hamiltonian_directory ) #Attempting to restore most recently saved model print("!!!!!!!!!!!!!!!!!!!!!!!") max_path = mostrecentmodification(os.path.join("./saves", args.tag)) print(f" Attempting to restore model {max_path}") model = PPO2.load(max_path, env=env, **model_args) print("!!!!!!!!!!!!!!!!!!!!!!!") env = model.get_env()
class Training: def __init__(self): self.best_mean_reward = 0 self.n_steps =0 self.stats = {"rewards": []} self.i = 0 def process_end_of_actor_activation(self): """ Applies runtime patches to the Stable Baselines source code in order to set the End of Actor Activation """ supported_values = ["tanh", "cbv"] if self.args.activation_end_of_actor not in supported_values: raise RuntimeError(f"End of Actor Activation {self.args.activation_end_of_actor} not supported") if self.args.activation_end_of_actor == "cbv": apply_tanh_patch() def f_clw_set_interval(self, x): """ Sets the interval related to which the checkpoints are saved """ logging.debug(f"Operation: SET, Key: self.interval, Value: {x}") self.interval = x def f_clr_get_interval(self): """ Gets the interval related to which the checkpoints are saved """ return self.interval def f_clw_set_model(self, x): """ Sets the model that is used for the training """ logging.debug(f"Operation: SET, Key: self.model, Value: {x['model_name']}") self.model = x['model'] self.model_name = x['model_name'] def f_clr_get_model(self): """ Gets the model that is used for the training """ logging.debug(f"Operation: GET, Key: self.model, Value: {self.model_name}") return self.model def f_clr_get_feed_dict(self, model): feed_dict = {model.actions: model.stats_sample['actions']} for placeholder in [model.action_train_ph, model.action_target, model.action_adapt_noise, model.action_noise_ph]: if placeholder is not None: feed_dict[placeholder] = model.stats_sample['actions'] for placeholder in [model.obs_train, model.obs_target, model.obs_adapt_noise, model.obs_noise]: if placeholder is not None: feed_dict[placeholder] = model.stats_sample['obs'] return feed_dict def f_cb_check_switch(self): if self.sp_desc.get_is_switch_active() and not self.has_switched_training_mode and (self.n_steps / self.args.n_steps) > self.sp_desc.get_time_perc(): if self.sp_desc.get_is_continuous(): temp = "Continuous" for x in self.__envs_training: x.set_continuous(quadcopter=Quadcopter(T=self.tp_desc.qg_continuous.get_T_episode(), dt_commands=self.tp_desc.qg_continuous.get_dt_command(), dt=self.tp_desc.qg_continuous.get_dt())) else: temp = "Episodic" for x in self.__envs_training: x.set_episodic(quadcopter=Quadcopter(T=self.tp_desc.qg_episodic.get_T_episode(), dt_commands=self.tp_desc.qg_episodic.get_dt_command(), dt=self.tp_desc.qg_episodic.get_dt())) logging.info(f"QUERY MODE GENERATION SWITCH HAPPENED, now it is {temp}") self.has_switched_training_mode = True def callback(self, _locals, _globals): self._debug_callback(model=_locals['self'], sim_time=self.i) self._callback_tf_log() if (self.n_steps + 1) % self.f_clr_get_interval() == 0: self.f_cb_check_switch() self.i += 1 full_checkpoint_id = int(self.model_desc.get_checkpoint_id())+int(self.i) logging.info(f"Checkpoint ID: Internal={self.i}, Full={full_checkpoint_id}, n_timesteps: {self.n_steps}") temp=self._save_model_stable_baselines(model=_locals['self'], cp_id=full_checkpoint_id) self._save_model_sherlock(temp) if self.train_saver is not None: self.train_saver.save(sess=self.model.sess, save_path=f"{self.args.log_dir_tensorboard}/cp", global_step=self.i) if(self.args.save_as_tf): path_save_cp = os.path.join(self.args.log_dir_tensorboard, f"cp-{self.i}") print(f"Saving Tensorflow Checkpoint in {path_save_cp}") self._save_model(path_save_cp) evaluation = f_model_2_evaluation(model=_locals['self'], env=self.env_test) quadcopter = self.__envs_training[0].quadcopter temp_plot_fn = f_iofsw_eval_2_plot( evaluation=evaluation, 
checkpoint_id=full_checkpoint_id, iteration_time=0, plots_dir=self.args.plots_dir, saturated=quadcopter.saturated, not_saturated=quadcopter.not_saturated) self.stats['rewards'].append(evaluation['re']) self.n_steps += 1 # Returning False will stop training early return True def _debug_callback(self, model, sim_time): if(self.args.debug_is_active): if(self.args.debug_model_describe): print(self._describe_model()) if(self.args.debug_try_save_all_vars): tf_path = f"{self.args.models_dir}/tf_quadcopter-{self.i}-desc" if not os.path.exists(tf_path): os.mkdir(tf_path) tf_testname_model = "debug_vars_all.json" tf_full_path = tf_path + "/" + tf_testname_model res = "" for v in tf.get_default_graph().as_graph_def().node: res += f"{v.name}\n" print(f"Trying to save debug data in {tf_full_path}") with open(tf_full_path, "w") as f: f.write(self._describe_model()) if(self.args.debug_try_save_trainable_vars): tf_path = f"{self.args.models_dir}/tf_quadcopter-{self.i}-desc" if not os.path.exists(tf_path): os.mkdir(tf_path) tf_testname_model = "debug_vars_trainable.json" tf_full_path = tf_path + "/" + tf_testname_model res = "" for v in tf.trainable_variables(): res += f"{v.name}\n" print(f"Trying to save debug data in {tf_full_path}") with open(tf_full_path, "w") as f: f.write(self._describe_model()) if(self.args.debug_try_save_graph): tf_path = f"{self.args.models_dir}/tf_quadcopter-{self.i}-desc" if not os.path.exists(tf_path): os.mkdir(tf_path) tf_testname_model = "debug_graph.json" tf_full_path = tf_path + "/" + tf_testname_model graph = tf.get_default_graph().as_graph_def() json_graph = json_format.MessageToJson(graph) print(f"Trying to save debug data in {tf_full_path}") with open(tf_full_path, "w") as f: f.write(json_graph) if(self.args.debug_try_save_weights): tf_path = f"{self.args.models_dir}/tf_quadcopter-{self.i}-desc" if not os.path.exists(tf_path): os.mkdir(tf_path) tf_testname_model = "debug_weights.json" tf_full_path = tf_path + "/" + tf_testname_model weights = tf.trainable_variables() weights_vals = tf.get_default_session().run(weights) print(dir(tf.get_default_session().graph)) print(f"Trying to save debug data in {tf_full_path}") with open(tf_full_path, "w") as f: f.write(str(weights_vals)) if self.args.debug_show_tensors_active: ops = [] for e in self.args.debug_show_tensors_list: temp = getattr(model, e) ops.append(temp) values = model.sess.run(ops, feed_dict=f_fwtf_get_feed_dict(model)) for v in values: print(f"v.shape = {v.shape}\nv.value={v}\n\n") def _save_model(self, export_dir): builder = tf.saved_model.builder.SavedModelBuilder(export_dir) builder.add_meta_graph_and_variables(self.model.sess, [tf.saved_model.tag_constants.TRAINING]) builder.save() def _save_model_stable_baselines(self, model, cp_id): # Evaluate policy training performance path = f"{self.args.models_dir}/quadcopter-{cp_id}{self.args.suffix}" logging.info(f"SAVING CURRENT MODEL, Model SAVED at {path}") model.save(path) return path + '.pkl' def _save_model_sherlock(self, filename): output_filename = filename + '.sherlock' params = get_stable_baseline_file_params(filename) print(f"Saving Sherlock Format File {output_filename}") with open( output_filename, 'w' ) as file_ : file_.write(architectures.export.get_sherlock_format(model_desc=self.model_desc, params=params)) def _describe_model(self): res = f"Model.Graph Type={type(self.model.graph)}\nContent={dir(self.model.graph)}\n\n\n" res += f"Analysing {len(tf.get_default_graph().as_graph_def().node)} nodes \n" res += f"Graph Def = 
{tf.get_default_graph().as_graph_def()}\n" res += f"---------\n" for v in tf.get_default_graph().as_graph_def().node: res += f"{v.name}\n" res += f"-----------\n" return res def _get_action_noise(self, noise_dict, n_actions): if noise_dict['name'] == 'OrnsteinUhlenbeck': return OrnsteinUhlenbeckActionNoise(mean=float(noise_dict['mu'])*np.ones(n_actions), sigma=float(noise_dict['sigma']) * np.ones(n_actions)) else: raise RuntimeError(f"Unrecognized Noise Model {noise_dict['name']}") def _args2str(self,a): return f"step={a.step}\n" \ f"env={a.env}\n" \ f"verbose={str(a.verbose)}\n" \ f"save_plots={str(a.save_plots)}\n" \ f"suffix={a.suffix}\n" \ f"model={json.dumps(a.model)}\n" \ f"activation={a.activation}\n" \ f"action_noise={json.dumps(a.action_noise)}\n" \ f"n_steps={a.n_steps}\n" \ f"model_dir={a.models_dir}\n" \ f"plots_dir={a.plots_dir}\n" def _get_plot_rewards(self): fig=plt.figure("Rewards") plt.plot(self.stats["rewards"]) fig.suptitle('Reward') plt.xlabel('time') plt.ylabel('reward') return plt def _write_graph_def_for_tb(self, graph_def, LOGDIR): """ TODO: Remove """ train_writer = tf.summary.FileWriter(LOGDIR) train_writer.add_graph(graph_def) train_writer.flush() train_writer.close() @property def sb_tb_log_active(self): """ Returns if native Stable Baseline Logging is active """ return self.args.logging['tensorflow']['stable_baselines_native']['active'] @property def sb_tb_log_dir(self): """ Returns the Stable Baseline TF Log Dir """ return self.args.log_dir_tensorboard if self.sb_tb_log_active else None def f_clr_instantiate_model(self, m): res_model = None model_name = m.get_model_name() if m.get_actor_feature_extractor_type() == 'standard': pk = dict(act_fun=activations[self.args.activation]) else: pk = dict(act_fun=activations[self.args.activation], layers=m.get_actor_feature_extractor_architecture()) model_params = { 'policy': MlpPolicy, 'env': self.env, 'verbose': int(self.args.verbose), 'policy_kwargs': pk, 'tensorboard_log': self.sb_tb_log_dir, 'full_tensorboard_log': self.sb_tb_log_active } if m.get_actor_feature_extractor_name() != 'mlp': raise NotImplementedError(f"Exporting Policy Type {model_desc.get_actor_feature_extractor_name()} is unsupported at the moment") if model_name == 'ddpg': algo = DDPG model_params['param_noise'] = self.param_noise model_params['action_noise'] = self.action_noise model_params['render_eval'] = True model_params['policy'] = ddpg_policies.MlpPolicy elif model_name == 'trpo': algo = TRPO model_params['policy'] = common.MlpPolicy elif model_name == 'ppo': algo = PPO2 model_params['policy'] = common.MlpPolicy elif model_name == 'td3': algo = TD3 model_params['policy'] = td3_MlpPolicy elif model_name == 'sac': algo = SAC model_params['policy'] = sac_MlpPolicy model = algo(**model_params) # Tensorboard # tf.io.write_graph(model.graph, self.args.log_dir_tensorboard, "model.pbtxt") if self.train_writer is not None: self.train_writer.add_graph(model.graph) if self.train_writer is not None: self.train_writer.flush() logging.info(f"Instantiated Model Name={res_model}, policy={type(model_params['policy'])}, pk={pk}") return {"model": model, "model_name": model_name.upper()} def f_clw_instantiate_envs(self): """ Instantiate both the Training and Test Gym Env - They provide the same dynamical model and the same reward """ temp = 'gym_quadcopter:quadcopter-v' + str(self.env_desc.get_env_id()) # TODO FIXME: Some models cannot handle multiple envs. 
N = self.env_desc.get_n_envs() if N < 1: raise RuntimeError(f"Got NumEnvs needs to be >=1 but got NumEnvs={N}") logging.info(f"[SETUP] Creating {N} Training Environments - START") # Instantiating all the Envs and storing them into a private var self.__envs_training = [f_fwgym_get_env( env_id=temp, used_states=self.used_states, instance_index=i, query_classes=self.query_classes, query_class=self.query_class, params=self.args.training_params ) for i in range(N)] # Passing references to previously created envs self.env = DummyVecEnv([lambda: self.__envs_training[i] for i in range(N)]) logging.info(f"[SETUP] Creating {N} Training Environments - DONE") logging.info(f"[SETUP] Creating 1 Test Environments - START") self.env_test = f_fwgym_get_env( env_id=temp, used_states=self.used_states, instance_index=0, query_classes=self.query_classes, query_class=self.query_class, params=self.args.testing_params ) logging.info(f"[SETUP] Creating 1 Test Environments - DONE") def f_clw_args_2_state(self, args): """Initialize internal instance state """ self.model_desc = ModelDict(model_dict=self.args.model) self.env_desc = EnvDict(env_dict=self.args.env) self.tp_desc = TrainingParamsDict(tp_dict=self.args.training_params) self.sp_desc = SwitchParamsDict(self.tp_desc.get_switch_params()) self.query_classes = self.args.query_classes self.query_class = self.args.query_class self.used_states = self.args.used_states self.train_writer = None self.param_noise = None self.f_clw_instantiate_envs() self.n_actions = self.env.action_space.shape[-1] self.action_noise = f_fwgym_get_action_noise(noise_dict=self.args.action_noise, n_actions=self.n_actions) self.has_switched_training_mode = False def f_fwtfw_init(self): """Initialize TF Environment """ tfl.set_verbosity(tfl.ERROR) os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' def get_global_summary(self): return {"ModelName": self.model_desc.get_model_name(), "Continuous": str(self.tp_desc.get_is_continuous()), "Total_Training_Iterations": self.args.n_steps, "Iterations_Per_Checkpoint": self.args.iterations_checkpoint, "Env" : { "ID" : self.env_desc.get_env_id(), "Num_Envs" : self.env_desc.get_n_envs() }} def _add_tf_logs(self): """Adds the additional Tensorflow Logs to the standard Stable Baselines ones """ with self.model.graph.as_default(): # Conditional Logging for Summary if self.args.logging['tensorflow']['summary']['active']: tf.summary.text('Env Summary', tf.convert_to_tensor(str(self.env))) # Conditional Logging for the Stable Baselines Tensors specified in the list if self.args.logging['tensorflow']['stable_baselines_tensors']['active']: for e in self.args.logging['tensorflow']['stable_baselines_tensors']['list']: tf.summary.scalar(f"Custom_SB_Log/{e}", tf.reduce_mean(getattr(self.model, e))) # Conditional Logging for the Tensorflow Tensors specified in the list if self.args.logging['tensorflow']['tensorflow_tensors']['active']: for e in self.args.logging['tensorflow']['tensorflow_tensors']['list']: tf.summary.histogram(f"Custom_TF_Log/{e}", tf.get_default_graph().get_tensor_by_name(e)) # Conditional Logging for Quadcopter Framework Events if self.args.logging['tensorflow']['events']['active']: if 'on_step' in self.args.logging['tensorflow']['events']['list']: tf.summary.text(f'EnvStep{self.n_steps}', tf.convert_to_tensor(self.env.env_method('get_on_step_log'))) # Merge all of the added summaries self.model.summary = tf.summary.merge_all() def _callback_tf_log(self): with self.model.graph.as_default(): if self.args.logging['tensorflow']['events']['active']: if 'on_step' 
in self.args.logging['tensorflow']['events']['list']: tf.summary.text('EnvStep', tf.convert_to_tensor(self.env.env_method('get_on_step_log'))) self.model.summary = tf.summary.merge_all() def run_training(self, args): """ Training Function """ # Use standard log just for the initial setup # Set the log used during training self.args = args self.process_end_of_actor_activation() self.f_clw_args_2_state(args) logging.info(f"Train Arguments\n{self._args2str(self.args)}") logging.info(f"Writing Tensorboard Log to {self.args.log_dir_tensorboard}") logging.info(f"Start training at {dt.now().strftime('%Y%m%d_%H%M')}") self.f_fwtfw_init() if self.model_desc.get_is_load(): # TODO: Fix this part path = self.model_desc.get_checkpoint_path() model_name = self.model_desc.get_model_name() logging.info(f"LOADING MODEL at {path}") if model_name == "ddpg": self.model = DDPG.load(path, self.env) elif model_name == "ppo": self.model = PPO2.load(path, self.env) elif model_name == "trpo": self.model = TRPO.load(path, self.env) elif model_name == "td3": self.model = TD3.load(path, self.env) elif model_name == 'sac': self.model = SAC.load(path, self.env) else: # the noise objects for DDPG self.f_clw_set_model(self.f_clr_instantiate_model(m=self.model_desc)) self.f_clw_set_interval(self.args.iterations_checkpoint) if self.args.save_tf_checkpoint: with self.model.graph.as_default(): self.train_saver = tf.compat.v1.train.Saver() else: self.train_saver = None self.i = 0 # Implemented in # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/ddpg/ddpg.py#L807 logging.info(f"GLOBAL SUMMARY: {self.get_global_summary()}") self._add_tf_logs() self.model.learn(total_timesteps=int(self.args.n_steps), callback=self.callback) logging.info(f"Training Finished after {self.n_steps} iterations saving {self.i} intermediate checkpoints") logging.info(f"Saving Final Model in Stable Baseline Checkpoint") temp=self._save_model_stable_baselines(model=self.model, cp_id="final") print(f"Exporting Actor from Final Model in Stable Baseline Checkpoint as Sherlock Format") self._save_model_sherlock(temp) if self.train_writer is not None: self.train_writer.close() plt = self._get_plot_rewards() now = dt.now() plt.savefig(f"{self.args.plots_dir}/reward_{now.strftime('%Y%m%d_%H%M%S')}.png") return True
learning_results.to_pickle(FOLDER + "/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") (sac_model, learning_results) = sac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000) learning_results['AgentID'] = agent_step learning_results.to_pickle(FOLDER + "/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_0.pkl") for resample_step in range(1, NUM_RESAMPLES): # Set both environments to the same resampled values if (RANDOMIZATION_LEVEL == "Normal"): clac_env.env_method("randomize", 0) elif (RANDOMIZATION_LEVEL == "Extreme"): clac_env.env_method("randomize", 1) elif (RANDOMIZATION_LEVEL == "Test"): clac_env.env_method("randomize", -1) else: print("Error resampling unknown value: ", RANDOMIZATION_LEVEL) continue if (agent_step == 1): print(mut_coef, " ", ent_coef, " ", NUM_TRAINING_STEPS, " ", ENVIRONMENT_NAME, " ", FOLDER, " resample step ", resample_step) env_features = clac_env.env_method("get_features")[0] sac_env.env_method("set_features", env_features)
x = np.array(x_range) y = eval(formula) plt.plot(x, y) plt.show() # The algorithms require a vectorized environment to run env = DummyVecEnv([lambda: ProcessorEnv()]) model = PPO2(MlpPolicy, env, verbose=1, learning_rate=0.00025) model.learn(total_timesteps=5000) model.save("ppo2_microprocessor_4") model = PPO2.load("ppo2_microprocessor_4") # env = DummyVecEnv([lambda: ProcessorEnv(taskFile='data/example.xlsx')]) env = DummyVecEnv([lambda: ProcessorEnv()]) obs = env.reset() print("^^^^^^^^^^^^^^^^^^^RESET") # env.env_method('graphShow') # for i in range(200): done = False while not done: env.render() action, _states = model.predict(obs) obs, rewards, done, info = env.step(action) if done: print(info[0]) writeOutputFile('out.csv', info[0]['info']) env.env_method('graphShow')
env = Monitor(env, log_dir, allow_early_resets=True) return env if __name__ == '__main__': env = DummyVecEnv([ lambda: env_generator(ep_len=episode_length, total_sweeps=episode_length * 100, beta_init_function=lambda: 1.4 * np.random.rand( ) + 0.2) ]) env = VecNormalize(env, norm_obs=False, norm_reward=False, training=True) env.env_method('set_experiment_tag', indices=[0], tag=args.tag) #env.env_method('init_HamiltonianGetter', indices=[0], phase='TRAIN') env.env_method('init_HamiltonianGetter', indices=[0], phase='WSC', directory=args.hamiltonian_directory) env.env_method('set_max_ep_length', indices=[0], max_ep_length=episode_length) n_steps = 0 best_mean_reward = -np.inf def callback(_locals, _globals): global n_steps, best_mean_reward if (n_steps) % 100 == 0:
T = env.get_attr('T')[0] model = DDPG(MlpPolicy, env, verbose=1) model.load(TEST_MODEL) delta = DeltaHedge() for i in range(cfg.test_times): # rl env.set_attr("b_rl", True) obs = env.reset() # every time, create a new transaction naked_returns.append(naked(env)) covered_returns.append(covered(env)) for i in range(T): action, _states = model.predict(obs) obs, rewards, done, info = env.step(action) # env.render() rl_returns.append(env.get_attr('final_reward')[0]) env.env_method('restart') # only trace back to the initial state env.set_attr("b_rl", False) # delta for i in range(T): action = delta.make_decision(env) obs, rewards, done, info = env.step(action) # env.render() delta_returns.append(env.get_attr('final_reward')[0]) print("naked:", naked_returns) print("covered:", covered_returns) print("rl:", rl_returns) print("delta:", delta_returns) else: # load data df_train, df_test, df_rate = load_data(cfg)
class PPO2Agent: def __init__(self, base_env, subproc=True, envs=64, ): self.base_env = base_env self.subproc = subproc if subproc: envs = multiprocessing.cpu_count() if envs is None else envs self.env = SubprocVecEnv([lambda: base_env for _ in range(envs)]) else: self.env = DummyVecEnv([lambda: self.base_env]) self.model = None def load_model(self, path, model_name): self.model = PPO2.load(path, self.env) self.env.env_method("load_normalization_info", model_name=model_name) def save_model(self, path="ppo2_simple_robot"): if self.model is None: raise AssertionError("Model does not exist- cannot be saved.") self.model.save(path) def new_model(self, policy=MlpPolicy, gamma=0.99, batch_size=128): self.model = PPO2(policy, self.env, verbose=1, gamma=gamma, n_steps=batch_size) def learn(self, timesteps, learning_handler, checkpoint_interval=1000, path=None, learning_rate=0.00025, curiosity_path=None, batch_size=128): curiosity = curiosity_path is not None self.model.learning_rate = learning_rate if self.model is None: self.new_model(batch_size=batch_size) if checkpoint_interval is not None: for checkpoint in range(int(timesteps / checkpoint_interval)): cb = learning_handler.get_learn_callback(checkpoint * checkpoint_interval, curiosity=curiosity, subproc=self.subproc, batch_size=batch_size) self.model.learn(total_timesteps=checkpoint_interval, callback=cb, reset_num_timesteps=False) self.save_model(path) if curiosity: self.base_env.curiosity_module.save_forward(curiosity_path) matplotlib.use('Agg') m = learning_handler.model_storage.get_model(learning_handler.model_name) learning_handler.save_plot(m['realtime_data']['plot_path'], real_time=True, curiosity=curiosity) learning_handler.save_plot(m['timestep_data']['plot_path'], real_time=False, curiosity=curiosity) else: cb = learning_handler.get_learn_callback(curiosity=curiosity, subproc=self.subproc) self.model.learn(total_timesteps=timesteps, callback=cb, reset_num_timesteps=False) self.save_model(path) def demo(self, timestep_sleep=0): obs = self.base_env.reset() while True: action, _states = self.model.predict(obs) obs, _, done, info = self.base_env.step(action, render=True) self.base_env.render() time.sleep(timestep_sleep) if done: obs = self.base_env.reset() def validate(self, n_episodes): obs = self.base_env.reset() ep_histories = None for i in range(n_episodes): ep_history = [] while True: action, _states = self.model.predict(obs) obs, reward, done, info = self.base_env.step(action) ep_history.append(info['distance']) if done: if ep_histories is None: ep_histories = np.array([ep_history]) else: ep_histories = np.concatenate((ep_histories, [ep_history])) obs = self.base_env.reset() break return ep_histories
def run_matchup(drafter1: str, drafter2: str, battler: str, games: int, seed: int, concurrency: int) \ -> Tuple[Tuple[float, float], Tuple[list, list], Tuple[list, list], List[List[Tuple]], Tuple[list, list], List[float]]: """ Run the match-up between `drafter1` and `drafter2` using `battler` battler :param drafter1: drafter to play as first player :param drafter2: drafter to play as second player :param battler: battler to simulate the matches :param games: amount of matches to simulate :param seed: seed used to generate the matches :param concurrency: amount of matches executed at the same time :return: a tuple containing (i) a tuple containing the win rate of the first and second players, (ii) a tuple containing the average mana curves of the first and second players, (iii) a tuple containing the `30 * games` individual draft choices of the first and second players; (iv) a tuple of 3-uples containing the card alternatives presented to the players at each of the `games` episodes; and (v) a tuple containing the `games` decks built by the first and second players. """ # parse the battle agent battler = agents.parse_battle_agent(battler) # initialize envs env = [lambda: LOCMDraftEnv(battle_agents=(battler(), battler())) for _ in range(concurrency)] # wrap envs in a vectorized env env = DummyVecEnv(env) for i in range(concurrency): # no overlap between episodes at each process current_seed = seed + (games // concurrency) * i current_seed -= 1 # resetting the env increases the seed by 1 # set seed to env env.env_method('seed', current_seed, indices=[i]) # reset the env env.reset() # initialize first player if drafter1.endswith('zip'): current_drafter = agents.RLDraftAgent(PPO2.load(drafter1)) current_drafter.use_history = "history" in drafter1 else: current_drafter = agents.parse_draft_agent(drafter1)() current_drafter.seed(seed) current_drafter.name = drafter1 drafter1 = current_drafter # initialize second player if drafter2.endswith('zip'): other_drafter = agents.RLDraftAgent(PPO2.load(drafter2)) other_drafter.use_history = "history" in drafter2 else: other_drafter = agents.parse_draft_agent(drafter2)() other_drafter.seed(seed) other_drafter.name = drafter2 drafter2 = other_drafter # initialize metrics episodes_so_far = 0 episode_rewards = [[0.0] for _ in range(env.num_envs)] drafter1.mana_curve = [0 for _ in range(13)] drafter2.mana_curve = [0 for _ in range(13)] drafter1.choices = [[] for _ in range(env.num_envs)] drafter2.choices = [[] for _ in range(env.num_envs)] drafter1.decks = [[[]] for _ in range(env.num_envs)] drafter2.decks = [[[]] for _ in range(env.num_envs)] alternatives = [[] for _ in range(env.num_envs)] # run the episodes while True: observations = env.get_attr('state') # get the current agent's action for all concurrent envs if isinstance(current_drafter, agents.RLDraftAgent): all_past_choices = env.get_attr('choices') new_observations = [] for i, observation in enumerate(observations): new_observation = encode_state_draft( observation, use_history=current_drafter.use_history, past_choices=all_past_choices[i][observation.current_player.id] ) new_observations.append(new_observation) actions = current_drafter.act(new_observations) else: actions = [current_drafter.act(observation) for observation in observations] # log chosen cards into current agent's mana curve for i, (action, observation) in enumerate(zip(actions, observations)): # get chosen index try: chosen_index = action.origin except AttributeError: chosen_index = action # save choice 
current_drafter.choices[i].append(chosen_index) # get chosen card chosen_card = observation.current_player.hand[chosen_index] # increase amount of cards chosen with the chosen card's cost current_drafter.mana_curve[chosen_card.cost] += 1 # add chosen card to this episode's deck current_drafter.decks[i][-1].append(chosen_card.id) # save card alternatives if observation.current_player.id == PlayerOrder.FIRST: alternatives[i].append(tuple(map(lambda c: c.id, observation.current_player.hand))) # perform the action and get the outcome _, rewards, dones, _ = env.step(actions) if isinstance(current_drafter, agents.RLDraftAgent): current_drafter.dones = dones # update metrics for i in range(env.num_envs): episode_rewards[i][-1] += rewards[i] if dones[i]: episode_rewards[i].append(0.0) current_drafter.decks[i].append([]) other_drafter.decks[i].append([]) episodes_so_far += 1 # check exiting condition if episodes_so_far >= games: break # swap drafters current_drafter, other_drafter = other_drafter, current_drafter # normalize mana curves total_choices = sum(drafter1.mana_curve) drafter1.mana_curve = [freq / total_choices for freq in drafter1.mana_curve] drafter2.mana_curve = [freq / total_choices for freq in drafter2.mana_curve] # join all parallel rewards all_rewards = [reward for rewards in episode_rewards for reward in rewards[:-1]] # join all parallel choices drafter1.choices = [c for choices in drafter1.choices for c in choices] drafter2.choices = [c for choices in drafter2.choices for c in choices] # join all parallel decks drafter1.decks = [deck for decks in drafter1.decks for deck in decks if deck] drafter2.decks = [deck for decks in drafter2.decks for deck in decks if deck] # join all parallel alternatives alternatives = [turn for env in alternatives for turn in env] # cap any unsolicited data from additional episodes all_rewards = all_rewards[:games] drafter1.choices = drafter1.choices[:30 * games] drafter2.choices = drafter2.choices[:30 * games] drafter1.decks = drafter1.decks[:games] drafter2.decks = drafter2.decks[:games] alternatives = alternatives[:30 * games] # convert the list of rewards to the first player's win rate win_rate = (mean(all_rewards) + 1) * 50 return (win_rate, 100 - win_rate), \ (drafter1.mana_curve, drafter2.mana_curve), \ (drafter1.choices, drafter2.choices), \ alternatives, \ (drafter1.decks, drafter2.decks), \ all_rewards
def test_agent(agent_step): now = time.time() for coef_index in range(len(CLAC_COEFS)): mut_coef = CLAC_COEFS[coef_index] ent_coef = SAC_COEFS[coef_index] training_timestep = 0 clac_env = gym.make(ENVIRONMENT_NAME) clac_env = DummyVecEnv([lambda: clac_env]) clac_model = CLAC(CLAC_MlpPolicy, clac_env, mut_inf_coef=mut_coef, verbose=1) sac_env = gym.make(ENVIRONMENT_NAME) sac_env = DummyVecEnv([lambda: sac_env]) sac_model = SAC(MlpPolicy, sac_env, ent_coef=ent_coef, verbose=1) mirl_env = gym.make(ENVIRONMENT_NAME) mirl_env = DummyVecEnv([lambda: mirl_env]) mirl_model = CLAC(CLAC_MlpPolicy, mirl_env, mut_inf_coef=mut_coef, coef_schedule=3.3e-3, verbose=1) for resample_step in range(0, NUM_RESAMPLES): features = pd.DataFrame() if(agent_step == 1): print(mut_coef, " ", ent_coef, " ", NUM_TRAINING_STEPS, " ", ENVIRONMENT_NAME, " ", FOLDER, " ", resample_step) (clac_model, learning_results) = clac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000) (sac_model, learning_results) = sac_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000) (mirl_model, learning_results) = mirl_model.learn(total_timesteps=NUM_TRAINING_STEPS, log_interval=1000) # Save models clac_model.save(FOLDER + "/Training/models/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step)) sac_model.save(FOLDER + "/Training/models/CLAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step)) mirl_model.save(FOLDER + "/Training/models/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step)) training_timestep += NUM_TRAINING_STEPS # Test Normal eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0) eval_results.to_pickle(FOLDER + "/Training/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Training/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 0) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Training/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") # Test generalization eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Generalization/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Generalization/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 1) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Generalization/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + 
str(agent_step) + "_" + str(resample_step) + ".pkl") # Test generalization Extreme eval_results = eval_model(clac_model, clac_env, "CLAC", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Extreme/results/CLAC_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") eval_results = eval_model(sac_model, sac_env, "SAC", ent_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Extreme/results/SAC_" + str(ent_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") eval_results = eval_model(mirl_model, mirl_env, "MIRL", mut_coef, NUM_TESTING_STEPS, training_timestep, agent_step, resample_step, 2) eval_results['AgentID'] = agent_step eval_results.to_pickle(FOLDER + "/Extreme/results/MIRL_" + str(mut_coef).replace(".", "p") + "_" + str(agent_step) + "_" + str(resample_step) + ".pkl") clac_env.env_method("reset_features") sac_env.env_method("reset_features") mirl_env.env_method("reset_features") del sac_model del sac_env del clac_model del clac_env del mirl_model del mirl_env later = time.time() difference = int(later - now) print("Tested Agent Time: ", difference)
def baseline(num_hamiltonians=20, num_trials=10): env = DummyVecEnv([ lambda: env_generator(ep_len=args.episode_length, total_sweeps=args.total_sweeps) ]) env.env_method('set_experiment_tag', indices=[0], tag=args.tag) env.env_method('set_max_ep_length', indices=[0], max_ep_length=args.episode_length) env.env_method('init_HamiltonianGetter', indices=[0], phase='TEST', directory=args.hamiltonian_directory) env.env_method("init_HamiltonianSuccessRecorder", indices=[0], num_hamiltonians=num_hamiltonians, num_trials=num_trials) env.env_method("set_static_Hamiltonian_by_ID", indices=[0], ID=0) obs = env.reset() test_ep = -1 for ham in range(num_hamiltonians): env.env_method("set_static_Hamiltonian_by_ID", indices=[0], ID=ham) for trial in range(num_trials): test_ep += 1 state = None done = [False for _ in range(env.num_envs)] step = -1 print("beta=", env.env_method('get_current_beta', indices=[0])[0]) while True: step += 1 obs, reward, d, _ = env.step(np.array([dbetas[step]])) if d: break env.env_method("hsr_write")
if PHASE == 'VALUE_ANALYSIS':
    model_args["learning_rate"] = 0.0000000000
    model_args["noptepochs"] = 1

if __name__ == '__main__':
    env = DummyVecEnv([
        lambda: env_generator(ep_len=episode_length,
                              total_sweeps=episode_length * 1,
                              beta_init_function=lambda: 2 * np.random.rand() + 0.333)
    ])
    env = VecNormalize(env, norm_obs=False, norm_reward=False, training=True)
    env.env_method('set_experiment_tag', indices=[0], tag=args.tag)

    if PHASE == 'VALUE_ANALYSIS' or PHASE == 'ISING':
        env.env_method('init_HamiltonianGetter', indices=[0], phase=PHASE,
                       directory=args.hamiltonian_directory)
    elif PHASE == 'TRAIN':
        env.env_method('init_HamiltonianGetter', indices=[0], phase='TRAIN')

    env.env_method('set_max_ep_length', indices=[0],
                   max_ep_length=episode_length)

    if args.destructive:
        env.env_method('set_destructive_observation_on', indices=[0])

    n_steps = 0
class LagrangianCMDPSolver(CMDPSolverBase):
    """
    Class to solve a CMDP with the Lagrangian method.

    The method we use is based on "Batch policy learning under constraints"
    by Le et al. The constrained MDP is addressed by solving a sequence of
    unconstrained MDPs. In particular, we alternate between a best response
    (BR) algorithm that solves the unconstrained problem deriving from fixing
    the value of the Lagrange multipliers and an online optimization
    algorithm that sets the multipliers based on the performance of the BR.
    """
    # TODO: Estimate the duality gap for stopping

    def __init__(self, env, br_algo, online_algo, br_kwargs=None,
                 online_kwargs=None, _init_setup_model=True,
                 lagrangian_ronuds=10, log_training=False,
                 br_uses_vec_env=False, n_envs=1, use_sub_proc_env=True):
        """
        Parameters
        ----------
        env: src.envs.CMDP or None
        br_algo: stable baselines algorithm class
            Best response algorithm
        online_algo: src.online
            Online optimization algorithm class
        br_kwargs: dict
            Keyword arguments for the best response algorithm
        online_kwargs: dict
            Keyword arguments for the online optimization algorithm
        _init_setup_model: bool
            Whether to set up the br and online algorithms upon initialization
        lagrangian_ronuds: int
            Number of times we alternate between br and online
        log_training: bool
            Whether to log episode rewards and constraints during training
        br_uses_vec_env: bool
            Whether the br algorithm needs a vectorized environment
        n_envs: int
            Number of environments to use (only relevant for the vectorized case)
        use_sub_proc_env: bool
            Whether to use subprocesses for the vectorized env (otherwise a
            dummy vec env is used)
        """
        self.br_algo = br_algo
        self.online_algo = online_algo

        self.br_kwargs = {} if br_kwargs is None else br_kwargs
        online_kwargs = {} if online_kwargs is None else online_kwargs
        self.online_kwargs = online_kwargs.copy()

        # Initialize placeholders to fill when setting the environment and
        # the model
        self.br = None
        self.online = None
        self.unconstrainedMDP = None  # The MDP resulting from the Lagrangian of the CMDP
        self._env = None
        self.observation_space = None
        self.action_space = None
        self.env_generator = None

        self.lagrangian_rounds = lagrangian_ronuds
        self._log_training = log_training
        self.training_rewards = None
        self.training_constraints = None

        # Vectorized environment arguments
        self.br_uses_vec_env = br_uses_vec_env
        self.use_sub_proc_env = use_sub_proc_env
        self.n_envs = n_envs

        self.set_env(env)
        if _init_setup_model:
            self.setup_model()

    def set_unconstrainedMDP(self):
        """
        Set up the unconstrained Lagrangian MDP.
        It can be set up either as a normal environment, a dummy vectorized
        environment or a multiprocessing vectorized environment.
        """
        assert self.online is not None, 'Need a value for Lagrange ' \
                                        'multipliers to initialize the ' \
                                        'unconstrained MDP'
        if self.br_uses_vec_env:
            # The function that generates the Lagrangian environment needs to
            # be outside the class to avoid pickling errors with
            # multiprocessing
            lagrangian_env = partial(
                get_lagrangian_env,
                cenv=None,  # Passing _env here is not necessary and slows down serialization a lot
                w=self.online.w,
                cenv_gen=self.env_generator)
            assert self.env_generator is not None, \
                'Environment generator is necessary for vectorized env'
            # With subprocesses for each env
            if self.use_sub_proc_env:
                self.unconstrainedMDP = SubprocVecEnv(
                    [lagrangian_env for _ in range(self.n_envs)])
            # With dummy vec env
            else:
                self.unconstrainedMDP = DummyVecEnv(
                    [lagrangian_env for _ in range(self.n_envs)])
        else:
            lagrangian_env = partial(get_lagrangian_env, cenv=self._env,
                                     w=self.online.w,
                                     cenv_gen=self.env_generator)
            self.unconstrainedMDP = lagrangian_env()

    def _initialize_online(self):
        if self._env is not None:
            d = self._env.n_constraints + 1
            self.online_kwargs.update({'d': d})
            self.online = self.online_algo(**self.online_kwargs)
        else:
            print('Skipping online initialization since there is no env')

    def update_online(self, keep_multipliers=False):
        """
        Update the online optimization algorithm.
        """
        if self.online is not None and keep_multipliers and \
                self._env.n_constraints + 1 == len(self.online.w):
            pass
        else:
            self._initialize_online()

    def setup_model(self):
        """
        Set up the best response algorithm.
        """
        if self.unconstrainedMDP is None:
            self.br = None
        else:
            br_kwargs = self.br_kwargs.copy()
            br_kwargs.update({'env': self.unconstrainedMDP})
            self.br = self.br_algo(**br_kwargs)

    def _setup_learn(self, seed):
        """
        Check the environment, set the seed, and set the logger.

        Parameters
        ----------
        seed: int
            The seed value
        """
        if self._env is None:
            raise ValueError("Error: cannot train the model without a valid environment, please set an environment with"
                             " the set_env(self, env) method.")
        if seed is not None:
            set_global_seeds(seed)

    def learn(self, total_timesteps, seed=None, log=False):
        """
        Solve the CMDP by alternating the BR and online algorithms.

        Parameters
        ----------
        total_timesteps: int
            Total number of timesteps the algorithm is run for. Each
            Lagrangian round (i.e. alternation of br and online) is run for
            total_timesteps / self.lagrangian_rounds timesteps.
        seed: int or None
            The random seed
        log: bool
            Print to screen some statistics about the BR training.
        Returns
        -------
        R: float
            Return when evaluating the policy learned by BR in the last
            Lagrangian round
        G: np.ndarray
            Constraint value when evaluating the policy learned by BR in the
            last Lagrangian round
        w: np.ndarray
            Lagrange multipliers
        """
        self._setup_learn(seed)

        if total_timesteps < self.lagrangian_rounds:
            raise ValueError("There should be more time steps than Lagrangian rounds")

        # Number of timesteps per Lagrangian round
        br_time_steps = np.full(self.lagrangian_rounds,
                                int(total_timesteps / self.lagrangian_rounds))
        br_time_steps[-1] += np.mod(total_timesteps, self.lagrangian_rounds)

        # Alternate between br and online
        for ts in br_time_steps:
            # Reset the monitor that tracks the performance of BR on the
            # unconstrained Lagrangian MDP (constraint violation is also
            # tracked)
            if self.br_uses_vec_env:
                self.unconstrainedMDP.env_method('reset_monitor')
            else:
                self.unconstrainedMDP.reset_monitor()
            self.br._init_num_timesteps()  # Reset exploration schedule

            # Train BR on the unconstrained MDP
            if log:
                self.br.learn(ts, log_interval=ts)
            else:
                self.br.learn(ts, log_interval=np.inf)

            # Get training performance
            if self.br_uses_vec_env:
                # Get reward and constraints from all envs
                r_tmp = self.unconstrainedMDP.env_method(
                    'get_episode_rewards')
                g_tmp = self.unconstrainedMDP.env_method(
                    'get_episode_constraints')
                current_rewards = np.concatenate(r_tmp)
                current_constraints = np.concatenate(g_tmp)
            else:
                current_rewards = \
                    self.unconstrainedMDP.get_episode_rewards()
                current_constraints = \
                    self.unconstrainedMDP.get_episode_constraints()

            R = np.mean(current_rewards)
            G = np.mean(current_constraints, axis=0)

            # Log info about training
            if self._log_training:
                if self.training_rewards is None:
                    self.training_rewards = np.copy(current_rewards)
                else:
                    self.training_rewards = np.hstack((
                        self.training_rewards, current_rewards))
                    # self.training_rewards.append(list(current_rewards))
                if self.training_constraints is None:
                    self.training_constraints = np.copy(current_constraints)
                else:
                    self.training_constraints = np.vstack((
                        self.training_constraints, current_constraints))

            # Evaluating performance may be necessary for off-policy methods,
            # where the deployed policy is different from the one that
            # collects data (in that case, it would make sense to adjust the
            # multipliers according to the optimized policy and not the
            # exploratory one)
            # R, G = self.evaluate_performance(int(0.2 * ts), min_episodes=5)
            # print('Evaluation r:{}\tEvaluation g {}'.format(R, G))

            # Online algorithm updates the multipliers based on BR performance
            self.online.step(-np.append(G, 0))

            # Set the new multipliers
            if self.br_uses_vec_env:
                self.unconstrainedMDP.set_attr('lam', self.online.w[:-1])
            else:
                self.unconstrainedMDP.lam = self.online.w[:-1]

        return R, G, self.online.w

    def predict(self, observation, state=None, mask=None, deterministic=True):
        """
        Get the best response action from an observation.
        """
        if self.br is not None:
            return self.br.predict(observation, state, mask, deterministic)
        else:
            raise ValueError('Need a valid environment to set up the learner and predict its action')

    def action_probability(self, observation, state=None, mask=None,
                           actions=None, logp=False):
        if self.br is not None:
            return self.br.action_probability(observation, state, mask,
                                              actions, logp)
        else:
            raise ValueError('Need a valid environment to set up the learner and predict its action probabilities')

    def evaluate_performance(self, min_steps, min_episodes):
        """
        Deploy the policy learned by BR to evaluate its performance in terms
        of return and constraint violation.
        Parameters
        ----------
        min_steps: int
            Minimum number of steps that we run the environment for
        min_episodes: int
            Minimum number of episodes

        Returns
        -------
        R: float
            Average return across episodes
        G: np.ndarray
            Average constraint value across episodes
        """
        if self.unconstrainedMDP is None:
            raise ValueError('Cannot reset monitor without a valid environment')

        n_episodes = 0
        n_steps = 0
        max_steps = min_steps * 5  # Fix a timeout

        # TODO: If we move to subproc env, we should aim to use the
        # vectorized env properly here
        if self.br_uses_vec_env:
            # This is equivalent to the non-vectorized case since we operate
            # only on one env. However, we still need to use the vectorized
            # env interface to access the individual attributes and methods.

            # Reset monitor and env
            self.unconstrainedMDP.env_method('reset_monitor')
            obs = self.unconstrainedMDP.env_method('reset', indices=0)[0]

            # Run env
            while (n_episodes < min_episodes or n_steps < min_steps) and \
                    not n_steps > max_steps:
                action, _ = self.br.predict(obs, deterministic=True)
                obs, reward, done, info = self.unconstrainedMDP.env_method(
                    'step', action, indices=0)[0]
                if done:
                    n_episodes += 1
                    obs = self.unconstrainedMDP.env_method('reset',
                                                           indices=0)[0]
                n_steps += 1

            # Compute return and constraint
            R = np.mean(self.unconstrainedMDP.env_method(
                'get_episode_rewards', indices=0)[0])
            G = np.mean(self.unconstrainedMDP.env_method(
                'get_episode_constraints', indices=0)[0], axis=0)
        else:
            # Reset monitor and env
            self.unconstrainedMDP.reset_monitor()
            obs = self.unconstrainedMDP.reset()

            # Run env
            while (n_episodes < min_episodes or n_steps < min_steps) and \
                    not n_steps > max_steps:
                action, _ = self.br.predict(obs, deterministic=True)
                obs, reward, done, info = self.unconstrainedMDP.step(action)
                if done:
                    n_episodes += 1
                    obs = self.unconstrainedMDP.reset()
                n_steps += 1

            # Compute return and constraint
            R = np.mean(self.unconstrainedMDP.get_episode_rewards())
            G = np.mean(self.unconstrainedMDP.get_episode_constraints(),
                        axis=0)

        return R, G

    def set_env(self, env, keep_multipliers=False, reset_br=False):
        """
        Set a new environment.

        Parameters
        ----------
        env: src.envs.CMDP
        keep_multipliers: bool
        reset_br: bool
        """
        # Clean up resources if a vectorized env already exists
        if isinstance(self.unconstrainedMDP, (DummyVecEnv, SubprocVecEnv)):
            self.unconstrainedMDP.close()

        # For a vectorized environment we need an environment generating
        # function, otherwise we can simply set the env
        if self.br_uses_vec_env:
            if env is not None:
                assert callable(env), 'An environment generating callable is ' \
                                      'necessary for algorithms requiring a ' \
                                      'vectorized environment'
                # If necessary, this extra copy of the env can be removed.
                # Need to check all the places where _env is accessed and
                # modify them.
                super().set_env(env())
                self.env_generator = env
        else:
            super().set_env(env)
            self.env_generator = None  # Not needed in the non-vectorized case

        if self.get_env() is not None:
            self.update_online(keep_multipliers)
            self.set_unconstrainedMDP()
            if reset_br or self.br is None:
                self.setup_model()
            self.br.set_env(self.unconstrainedMDP)

        self.training_rewards = None
        self.training_constraints = None

    def get_env(self):
        return super().get_env()

    def set_multipliers(self, w):
        if self.online is not None:
            if len(w) != len(self.online.w):
                raise ValueError('Multipliers must have the same length. '
                                 'Old ones have length {}, while new ones have '
                                 'length {}'.format(len(self.online.w), len(w)))
            else:
                self.online.w = w
        else:
            warnings.warn('There is no online algorithm to set the multipliers for')

    def get_multipliers(self):
        return self.online.w

    def get_br_params(self):
        return self.br.get_parameters()

    def set_br_params(self, params):
        self.br.load_parameters(params)

    def get_params(self):
        params = self.get_br_params()
        multipliers = self.get_multipliers()
        params.update({'multipliers': multipliers})
        return params

    def set_params(self, params):
        multipliers = params['multipliers']
        self.set_multipliers(multipliers)
        del params['multipliers']
        self.set_br_params(params)

    def get_training_performance(self):
        if not self._log_training:
            warnings.warn('Log training is set to False and no data was logged')
        return self.training_rewards, self.training_constraints

    @property
    def log_training(self):
        return self._log_training

    @log_training.setter
    def log_training(self, value):
        self._log_training = bool(value)
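# Hedged usage sketch, not part of the original file. It only illustrates how the
# pieces above are meant to be wired together: a stable-baselines algorithm as the
# best response and an online optimizer over the Lagrange multipliers. `make_cmdp`
# and `ExponentiatedGradient` are hypothetical names (the real CMDP factory and
# src.online class are not shown in this excerpt); only LagrangianCMDPSolver's own
# signature comes from the class above.
#
#   from stable_baselines import PPO2
#   from stable_baselines.common.policies import MlpPolicy
#
#   solver = LagrangianCMDPSolver(
#       env=make_cmdp,                      # callable returning a src.envs.CMDP
#       br_algo=PPO2,
#       online_algo=ExponentiatedGradient,  # hypothetical src.online class
#       br_kwargs={'policy': MlpPolicy, 'verbose': 0},
#       online_kwargs={'eta': 0.5},
#       lagrangian_ronuds=10,
#       br_uses_vec_env=True,
#       n_envs=4,
#       use_sub_proc_env=False)
#   R, G, w = solver.learn(total_timesteps=100000, seed=0, log=True)
#
# Each learn() call splits the timestep budget into `lagrangian_ronuds` rounds:
# BR is retrained on the current Lagrangian MDP, then the online step moves the
# multipliers against the measured constraint violation.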