def main(_):
    def make_env():
        env_out = gym.make('CartPole-v0')
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[FLAGS.policy]

    model = PPO2(policy=policy, env=env, n_steps=FLAGS.n_steps, nminibatches=FLAGS.nminibatches,
                 lam=FLAGS.lam, gamma=FLAGS.gamma, noptepochs=FLAGS.noptepochs,
                 ent_coef=FLAGS.ent_coef, learning_rate=FLAGS.learning_rate,
                 cliprange=FLAGS.cliprange, verbose=FLAGS.verbose)
    model.learn(total_timesteps=FLAGS.num_timesteps)
def test(env_id, seed, policy):
    """
    Run a trained PPO2 model on the PAD environment and render it, for testing purposes

    :param env_id: (str) the environment id string
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    # if 'lstm' in policy:
    #     print('LSTM policies not supported for drawing')
    #     return 1
    env = DummyVecEnv([PadEnvRender for _ in range(1)])  # Needed for lstm
    # else:
    #     env = PadEnvRender()

    env = VecFrameStack(env, 8)
    model = PPO2.load('./pad_5combo_ppo2.pkl', env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            done = done.any()
            episode_rew += rew
            time.sleep(1 / 24.)
        if done:
            print('Episode reward:', episode_rew)
def do_ppo(args, start_theta, parent_this_run_dir, full_space_save_dir):
    """
    Runs the test
    """
    logger.log(f"#######CMA and then PPO TRAIN: {args}")

    this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir)
    log_dir = get_log_dir(this_conti_ppo_run_dir)
    conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir)
    logger.configure(log_dir)

    full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(conti_ppo_save_dir):
        import shutil
        shutil.rmtree(conti_ppo_save_dir)
    os.makedirs(conti_ppo_save_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{full_space_save_dir}/ppo2")
    model.set_from_flat(start_theta)

    if args.normalize:
        env.load_running_average(full_space_save_dir)
    model.set_env(env)

    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches,
    #              lam=0.95, gamma=0.99, noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)
    model.tell_run_info(run_info)
    episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps)

    model.save(f"{conti_ppo_save_dir}/ppo2")
    env.save_running_average(conti_ppo_save_dir)

    return episode_returns, full_param_traj_dir_path
def train(env_id, num_timesteps, seed):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99,
                 noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2)
    model.learn(total_timesteps=num_timesteps)

    return model, env
def create_env(n_envs=1, eval_env=True):
    import tensorflow as tf
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    tf.Session().__enter__()
    ncpu = 1

    def make_env():
        env = gym.make("CollisionAvoidance-v0")

        # The env provides a dict observation by default. Most RL code
        # doesn't handle dict observations, so these wrappers convert to arrays
        if Config.TRAIN_SINGLE_AGENT:
            # only return observations of a single agent
            env = FlattenDictWrapper(env, dict_keys=Config.STATES_IN_OBS)
        else:
            # return observation of all agents (as a long array)
            env = MultiagentFlattenDictWrapper(env,
                                               dict_keys=Config.STATES_IN_OBS,
                                               max_num_agents=Config.MAX_NUM_AGENTS_IN_ENVIRONMENT)
        return env

    # To be prepared for training on multiple instances of the env at once
    if Config.TRAIN_SINGLE_AGENT:
        env = DummyVecEnv([make_env for _ in range(n_envs)])
    else:
        env = MultiagentDummyVecEnv([make_env for _ in range(n_envs)])

    unwrapped_envs = [e.unwrapped for e in env.envs]

    # Set env id for each env
    for i, e in enumerate(unwrapped_envs):
        e.id = i

    one_env = unwrapped_envs[0]
    return env, one_env
def test_identity_multibinary(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multibinary action space

    :param model_class: (BaseRLModel) A RL Model
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)])

    model = model_class("MlpPolicy", env)
    model.learn(total_timesteps=1000, seed=0)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward

    assert model.action_probability(obs).shape == (1, 10), \
        "Error: action_probability not returning correct shape"
    assert np.prod(model.action_probability(obs, actions=env.action_space.sample()).shape) == 1, \
        "Error: not scalar probability"
def main():
    # Parse command line args
    parser = arg_parser()
    parser.add_argument("-hw", "--use-hardware", action="store_true")
    parser.add_argument("-l", "--load", type=str, default=None)
    args = parser.parse_args()

    env = "QubeSwingupEnv"

    def make_env():
        env_out = QubeSwingupEnv(use_simulator=not args.use_hardware, frequency=250)
        return env_out

    try:
        env = DummyVecEnv([make_env])
        policy = MlpPolicy
        model = PPO2(policy=policy, env=env)
        model.load_parameters(args.load)

        print("Running trained model")
        obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            actions = model.step(obs)[0]
            obs[:], reward, done, _ = env.step(actions)
            if not args.use_hardware:
                env.render()
            if done:
                print("done")
                obs[:] = env.reset()
    finally:
        env.close()
def run_model(save_name, nw_type, log_dir='./Logs/', log_name=None, env_name='CartPole-v2',
              runs=100, save_results=False):
    # Sets up an environment and a model:
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = load_model(nw_type=nw_type, log_dir=log_dir, env_name=env_name,
                       log_name=log_name, save_name=save_name)

    # Runs environment with the loaded model "runs" times
    max_reward = 0
    max_steps = 0
    rew_vec = []
    header = 'theta1,alpha1,dtheta1,dalpha1,theta2,alpha2,dtheta2,dalpha2'
    for i in range(runs):
        # Resets the environment
        obs, done = env.reset(), False
        episode_rew = 0
        ep_steps = 0
        obs_vec = obs.reshape(-1, 1)

        # This loop runs the environment until a terminal state is reached
        while not done:
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            env.render()
            episode_rew += rewards[-1]
            ep_steps += 1
            obs_vec = np.append(obs_vec, obs.reshape(-1, 1) * 180 / np.pi, axis=1)

        # Saves the reached reward and checks if its a record etc.
        rew_vec.append(episode_rew)
        print("Ep reward: ", '{0:.2f}'.format(episode_rew),
              '\tRecord: ', '{0:.2f}'.format(max_reward),
              '\tEp steps: ', ep_steps,
              '\tSteps record: ', max_steps)
        np.savetxt('rew_vec.csv', rew_vec, delimiter=',')
        if episode_rew > max_reward:
            max_reward = episode_rew
            if save_results:
                np.savetxt('obs_vec.csv', obs_vec.T, delimiter=',', header=header,
                           fmt='%1.3f', comments='')
        if ep_steps > max_steps:
            max_steps = ep_steps
def main():
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(args)
    plot_dir_alg = get_plot_dir(args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir, params_scope="pi")
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)

    final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    obz_tensor = model.act_model.fake_input_tensor

    some_neuron = model.act_model.policy_neurons[2][-1]

    grads = tf.gradients(tf.math.negative(some_neuron), obz_tensor)
    grads = list(zip(grads, obz_tensor))

    trainer = tf.train.AdamOptimizer(learning_rate=0.01, epsilon=1e-5)
    train_op = trainer.apply_gradients(grads)

    for i in range(10000):
        obz, _ = model.sess.run([obz_tensor, train_op])
def create_monitor_dummy_vec_env(save_path: str):
    env = TuggerEnv()
    env = Monitor(
        env,
        filename=save_path,
        allow_early_resets=False,
        info_keywords=(Info.FINISHED_PRODUCTS.value, ),
    )
    env = DummyVecEnv([lambda: env])
    return env
def neuron_values_generator(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")
    neuron_values_list = []

    def make_env():
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    # policy = MlpPolicy
    # # model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches,
    #              lam=0.95, gamma=0.99, noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)
    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    env.render()
    ep_infos = []
    while 1:
        neuron_values, actions, _, _, _ = model.step_with_neurons(obs)
        # neuron_values = model.give_neuron_values(obs)
        # neuron_values_list.append(neuron_values)
        yield neuron_values

        obs, rew, done, infos = env.step(actions)
        env.render()
        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
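# Hedged usage sketch (not part of the original snippet): the warm-started
# VecNormalize statistics make observation/reward normalization less noisy at the
# start of training. Names below are illustrative only:
#
#     venv = _make_warmstart_cartpole()
#     model = PPO2("MlpPolicy", venv, verbose=0)
#     model.learn(total_timesteps=10000)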
def train(num_timesteps, model_to_load):
    try:
        env = DummyVecEnv([dsgym])
        env = VecNormalize(env)
        policy = MlpPolicy
        lr = 3e-4 * 0.75
        model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99,
                     noptepochs=10, ent_coef=0.01, learning_rate=linear_schedule(lr), cliprange=0.2)
        if model_to_load:
            env = DummyVecEnv([dsgym])
            env = VecNormalize.load(model_to_load.replace(".zip", "vec_normalize.pkl"), env)
            model = model.load(model_to_load)
            model.set_env(env)
            print("Loaded model from: ", model_to_load)
            model.set_learning_rate_func(linear_schedule_start_zero(lr))
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print("Saving on keyinterrupt")
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        # quit
        sys.exit()
    except BaseException as error:
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        print('An exception occurred: {}'.format(error))
        traceback.print_exception(*sys.exc_info())
        sys.exit()
    model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
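# `linear_schedule` (and `linear_schedule_start_zero`) are not defined in this snippet.
# A minimal sketch of what such a helper commonly looks like (an assumption, names are
# illustrative): stable-baselines 2 accepts a callable learning rate that is called with
# the remaining-progress fraction, which goes from 1 at the start of training to 0 at the end.
def linear_schedule(initial_value):
    def schedule(progress_remaining):
        # progress_remaining: 1.0 -> 0.0 over the course of training
        return progress_remaining * initial_value
    return schedule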
def visualize_neurons(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    ep_infos = []
    for _ in range(eval_timesteps):
        actions = model.step(obs)[0]
        neuron_values = model.give_neuron_values(obs)

        obs, rew, done, infos = env.step(actions)

        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            if pi_theta is None:
                episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
                print(f'episode_rew={episode_rew}')
            obs = env.reset()

    return safe_mean([ep_info['r'] for ep_info in ep_infos])
def test_vec_env():
    """Test VecNormalize Object"""

    def make_env():
        return gym.make(ENV_ID)

    env = DummyVecEnv([make_env])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
    _, done = env.reset(), [False]
    obs = None
    while not done[0]:
        actions = [env.action_space.sample()]
        obs, _, done, _ = env.step(actions)
    assert np.max(obs) <= 10
def main():
    # Save argument values to yaml file
    args_file_path = os.path.join(args.log_dir, 'args.yaml')
    with open(args_file_path, 'w') as f:
        yaml.dump(vars(args), f, default_flow_style=False)

    # Create and wrap the environment
    env = gym.make(args.env)
    env = Monitor(env, args.log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Add some param noise for exploration
    if args.model == 'DDPG':
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)
        model = MODEL_CLASS(MlpPolicy, env, param_noise=param_noise, memory_limit=int(1e6), verbose=0)
    elif args.model == 'SAC':  # elif, otherwise the DDPG model would be overwritten by the else branch
        # TODO: This doesn't work
        model = MODEL_CLASS(MlpPolicy, env, verbose=1,
                            policy_kwargs={'n_env': 1, 'n_steps': 64, 'n_batch': 64})
    else:
        model = MODEL_CLASS(MlpPolicy, env, verbose=0)

    # Train the agent
    model.learn(total_timesteps=args.n_steps, callback=callback)

    # Save the final model
    if args.save_model:
        model_file_path = os.path.join(args.log_dir, 'model.pkl')
        model.save(model_file_path)
        print("Best and final models saved in ", os.path.abspath(args.log_dir))

    if args.plots:
        raise NotImplementedError
def create_env(env_params):
    global hyperparams

    if algo_ in ['dqn']:
        env = gym.make(env_id, env_params=env_params)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                    wrapper_class=env_wrapper, env_params=env_params)])
        if normalize:
            if args.verbose > 0:
                if len(normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **normalize_kwargs)
    return env
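# `make_env` above is not defined in this snippet. It is assumed to be an rl-zoo style
# factory that returns a thunk for DummyVecEnv rather than an env instance; a minimal
# sketch under that assumption (names and parameters are illustrative):
def make_env(env_id, rank, seed, wrapper_class=None, env_params=None):
    def _init():
        # build and seed one env instance; apply the optional wrapper
        env = gym.make(env_id, env_params=env_params)
        env.seed(seed + rank)
        if wrapper_class is not None:
            env = wrapper_class(env)
        return env
    return _init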
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])

    model = model_func(env)
    model.learn(total_timesteps=1000, seed=0)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
def test_identity_multibinary(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multibinary action space

    :param model_class: (BaseRLModel) A RL Model
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)])

    model = model_class("MlpPolicy", env)
    model.learn(total_timesteps=1000, seed=0)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
def train(num_timesteps, logdir, save, save_interval, load, seed):
    def make_env():
        env_out = StudentEnv()
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    batch_size = 2048
    set_global_seeds(seed)
    # policy = "MlpLnLstmPolicy"
    policy = "MlpPolicy"
    model = PPO2(
        policy=policy,
        env=env,
        n_steps=batch_size,
        nminibatches=1,
        lam=0.95,
        gamma=0.99,
        noptepochs=10,
        ent_coef=0.0,
        learning_rate=3e-4,
        cliprange=0.2,
        verbose=1,
    )

    if save and save_interval > 0:
        callback = init_save_callback(logdir, batch_size, save_interval)
    else:
        callback = None

    # Optionally load before or save after training
    if load is not None:
        model.load_parameters(load)
    model.learn(total_timesteps=num_timesteps, callback=callback)
    if save:
        model.save(logdir + "/model")

    return model, env
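# `init_save_callback` is not defined in this snippet. A minimal sketch under the
# assumption that it builds a stable-baselines 2 style `callback(locals_, globals_)`
# that periodically checkpoints the model; the names and the exact locals used are
# illustrative, not taken from the original code.
def init_save_callback(logdir, batch_size, save_interval):
    def callback(locals_, globals_):
        # PPO2.learn calls the callback once per update and passes its locals(),
        # which include the current `update` index and the model as `self`.
        update = locals_.get("update", 0)
        if update > 0 and update % save_interval == 0:
            locals_["self"].save(f"{logdir}/model_{update * batch_size}")
        return True  # returning True keeps training going
    return callback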
def test_identity(learn_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)

    :param learn_func: (lambda (Gym Environment): A2CPolicy) the policy generator
    """
    env = DummyVecEnv([lambda: IdentityEnv(10)])

    model = learn_func(env)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
def main(_):
    p_dic = getattr(conf.dic.path_dic, FLAGS.env_name)
    register(id=FLAGS.env_id,
             entry_point='env.env_ep:Env',
             kwargs={
                 'env_name': FLAGS.env_name,
                 'done_step': 8760
             })

    def make_env():
        env_out = gym.make(FLAGS.env_id)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[FLAGS.policy]

    model = PPO2(policy=policy, env=env, n_steps=FLAGS.n_steps, nminibatches=FLAGS.nminibatches,
                 lam=FLAGS.lam, gamma=FLAGS.gamma, noptepochs=FLAGS.noptepochs,
                 ent_coef=FLAGS.ent_coef, learning_rate=FLAGS.learning_rate,
                 cliprange=FLAGS.cliprange, verbose=FLAGS.verbose,
                 log_dir=p_dic.get('agent_log_dir'))
    model.learn(total_timesteps=FLAGS.num_timesteps)
def test_vec_env(tmpdir):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True,
                             clip_obs=clip_obs, clip_reward=clip_reward)

    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
def test(model):
    env = DummyVecEnv([make_env] * n_env)
    # env = VecNormalize.load("models/machine_snap_env.bin", venv=env)
    # env.training = False
    for trial in range(1):
        obs = env.reset()
        running_reward = 0.0
        alpha = 0.01
        for _ in range(5000):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            reward = reward[0]
            done = done[0]
            info = info[0]

            # running_reward = running_reward * (1 - alpha) + alpha * reward
            running_reward += reward
            # print(obs, reward, done, info, running_reward)
            if done:
                print("Finished after {} timesteps".format(_ + 1))
                break
            else:
                env.envs[0].render()
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    env = Monitor(PadEnv(), './logs', allow_early_resets=True)
    env = DummyVecEnv([lambda: env for _ in range(16)])
    env = VecFrameStack(env, 8)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy, env=env, n_steps=256, nminibatches=4, lam=0.95, gamma=0.99,
                 noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    # model = model.load('./pad_4combo_ppo2.pkl', env)
    try:
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print('Keyboard Interrupted')

    model.save('./pad_5combo_ppo2.pkl')
args = parser.parse_args()

if args.robot_eye_video:
    import av
    output = av.open(args.robot_eye_video, mode='w')
    stream = output.add_stream('mpeg4', rate=13)
    stream.pix_fmt = 'yuv420p'
    stream.height, stream.width = 128, 128

set_global_seeds(args.seed)

env = HamstirRoomEmptyEnv(render=True, dim=128)
if args.debug_video:
    env.logVideo(args.debug_video)
env.seed(args.seed)
env = DummyVecEnv([lambda: env])

model = PPO2.load(args.model, policy=NatureLitePolicy)
sess = model.sess
graph = sess.graph
# input = graph.get_tensor_by_name('model/module_apply_default/hub_input/Sub:0')
# output = graph.get_tensor_by_name('model/pi/add:0')

obs = env.reset()
try:
    while True:
        action, _states = model.predict(obs, deterministic=True)
        # print(action, sess.run(input, feed_dict={model.act_model.obs_ph: obs}))
        # print(action, sess.run(output, feed_dict={input: obs}))
        obs, rewards, dones, info = env.step(action)
        if args.verbose:
def test_model_manipulation(model_policy):
    """
    Test if the algorithm (with a given policy) can be loaded and saved without any issues,
    the environment switching works and that the action prediction works

    :param model_policy: (BaseRLModel, Object) A model, policy pair
    """
    model_class, policy = model_policy

    try:
        env = DummyVecEnv([lambda: IdentityEnv(10)])

        # check the env is deterministic
        action = [env.action_space.sample()]
        set_global_seeds(0)
        obs = env.step(action)[0]
        for _ in range(N_TRIALS):
            set_global_seeds(0)
            assert obs == env.step(action)[0], "Error: environment tested not deterministic with the same seed"

        # create and train
        model = model_class(policy=policy, env=env)
        model.learn(total_timesteps=50000)

        # predict and measure the acc reward
        acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnv(10)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "loading and saving"

        # learn post loading
        model.learn(total_timesteps=1000)

        # validate no reset post learning
        loaded_acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        del model, env
    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
def play(train=True):
    ncpu = 4
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    n_env = 1
    env = DummyVecEnv([make_env] * n_env)
    # env = VecNormalize(env, gamma=GAMMA)

    seed = 10
    set_global_seeds(seed)

    model = DQN(
        policy="LnMlpPolicy",
        env=env,
        tensorboard_log="tb_log_new",
        # n_steps=32,
        # nminibatches=4,
        # noptepochs=10,
        learning_rate=0.0003,
        exploration_fraction=0.3,
        # cliprange=0.2,
        # max_grad_norm=0.2,
        gamma=GAMMA,
        verbose=1,
        policy_kwargs={
            # "net_arch": [128, 64, 32, 32, 32],
            # "n_lstm": 32
        })

    def test(model):
        env = DummyVecEnv([make_env] * n_env)
        # env = VecNormalize.load("models/machine_snap_env.bin", venv=env)
        # env.training = False
        for trial in range(1):
            obs = env.reset()
            running_reward = 0.0
            alpha = 0.01
            for _ in range(5000):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                reward = reward[0]
                done = done[0]
                info = info[0]

                # running_reward = running_reward * (1 - alpha) + alpha * reward
                running_reward += reward
                # print(obs, reward, done, info, running_reward)
                if done:
                    print("Finished after {} timesteps".format(_ + 1))
                    break
                else:
                    env.envs[0].render()

    def callback(locals_, globals_):
        import ipdb
        ipdb.set_trace()
        return True

    if train:
        try:
            model.learn(total_timesteps=3_000_000, log_interval=50)
        except KeyboardInterrupt:
            model.save("models/machine_snap_model.bin")
            env.save("models/machine_snap_env.bin")
            raise
        model.save(f'models/machine_0_model.bin')
        env.save(f'models/machine_0_env.bin')

    model = DQN.load('models/machine_snap_model.bin')
    test(model)
def train(args):
    """
    Runs the test
    """
    args, argv = mujoco_arg_parser().parse_known_args(args)
    logger.log(f"#######TRAIN: {args}")

    args.alg = "ppo2"

    this_run_dir = get_dir_path_for_this_run(args)
    if os.path.exists(this_run_dir):
        import shutil
        shutil.rmtree(this_run_dir)
    os.makedirs(this_run_dir)

    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)
    logger.configure(log_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env.envs[0].env.env.disableViewer = True

    set_global_seeds(args.seed)
    env.envs[0].env.env.seed(args.seed)

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    # extra run info I added for my purposes
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(save_dir):
        import shutil
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path,
        "state_samples_to_collect": args.state_samples_to_collect
    }

    model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches,
                 lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0,
                 learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer, seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
from active_env.envs.active_network_env import ActiveEnv
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.ddpg.policies import LnMlpPolicy
from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines import DDPG
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec
import numpy as np

powerenv = ActiveEnv()
powerenv.set_parameters({
    'state_space': ['sun', 'demand', 'imbalance'],
    'reward_terms': ['voltage', 'current', 'imbalance']
})
powerenv = DummyVecEnv([lambda: powerenv])

action_mean = np.zeros(powerenv.action_space.shape)
action_sigma = 0.3 * np.ones(powerenv.action_space.shape)
action_noise = OrnsteinUhlenbeckActionNoise(mean=action_mean, sigma=action_sigma)

param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.01)

t_steps = 800000
logdir = 'C:\\Users\\vegar\\Dropbox\\Master\\logs'
powermodel = DDPG(
    LnMlpPolicy,
    powerenv,
    verbose=2,
    action_noise=action_noise,