def test_action_mask_run_trpo(vec_env, policy, env_class):
    env = vec_env([env_class])
    model = TRPO(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        # Collect the updated action masks exposed by each sub-environment.
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
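
# A minimal sketch (an assumption, not from the source) of an environment
# satisfying the contract the test above relies on: each step's info dict
# advertises an 'action_mask' for the next decision.
import numpy as np
import gym
from gym import spaces

class MaskedDiscreteEnv(gym.Env):
    """Toy env with 3 discrete actions; episodes last 10 steps."""

    def __init__(self):
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(-1.0, 1.0, shape=(4,),
                                            dtype=np.float32)
        self._t = 0

    def reset(self):
        self._t = 0
        return self.observation_space.sample()

    def step(self, action):
        self._t += 1
        obs = self.observation_space.sample()
        done = self._t >= 10
        # All actions valid here; a real env would zero out invalid ones.
        mask = np.ones(self.action_space.n, dtype=bool)
        return obs, 0.0, done, {'action_mask': mask}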
def render_to_gif():
    def save_frames_as_gif(frames, path='./', filename='growspace_with_trpo.gif'):
        # Mess with this to change frame size
        plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0),
                   dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames),
                                       interval=50)
        anim.save(path + filename, writer='imagemagick', fps=60)

    env = gym.make('GrowSpaceEnv-Control-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")
    # del model  # remove to demonstrate saving and loading
    model = TRPO.load("trpo_cartpole")

    frames = []
    obs = env.reset()
    for _ in range(150):
        frames.append(env.render(mode="rgb_array"))
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        # if done:
        #     break
    env.close()
    save_frames_as_gif(frames)
def render_growspace_with_trpo():
    env = gym.make('GrowSpaceEnv-Control-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")
    # del model  # remove to demonstrate saving and loading
    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    for t in range(150):
        print(t)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        # if dones:
        #     env.reset()
        env.render()
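
# The two GrowSpace snippets above assume roughly this preamble (a sketch;
# the growspace package must be installed so the env ID is registered):
import gym
import growspace  # noqa: F401 -- registers 'GrowSpaceEnv-Control-v0' (assumed)
import matplotlib.pyplot as plt
from matplotlib import animation
from stable_baselines import TRPO
from stable_baselines.common.policies import MlpPolicy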
def optimize_agent(trial):
    """Train the model and return its mean evaluation reward.

    The Optuna study should be created with direction='maximize' so that
    this value is maximised.
    """
    model_params = optimize_ddpg(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = TRPO("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARNING trpo")
    original_env.force_progression = False
    model.learn(int(2e5), seed=seed)
    print("DONE LEARNING trpo")

    # Evaluate over a few episodes with unlimited invalid tries.
    original_env.max_invalid_tries = -1
    rewards = []
    n_episodes, reward_sum = 0, 0.0
    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)
    return last_reward
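
# Hypothetical driver for the objective above, using the standard Optuna
# API (study settings and trial count are illustrative, not from the source):
import optuna

study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=20)
print('Best hyperparameters:', study.best_params)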
def tst():
    def _init_openmpi():
        """Pre-load libmpi.dll and register the OpenMPI distribution."""
        import os
        import ctypes
        if os.name != 'nt' or 'OPENMPI_HOME' in os.environ:
            return
        try:
            openmpi_home = os.path.abspath(os.path.dirname(__file__))
            openmpi_bin = os.path.join(openmpi_home, 'bin')
            os.environ['OPENMPI_HOME'] = openmpi_home
            os.environ['PATH'] = ';'.join((openmpi_bin, os.environ['PATH']))
            ctypes.cdll.LoadLibrary(os.path.join(openmpi_bin, 'libmpi.dll'))
        except Exception:
            pass

    _init_openmpi()

    import gym
    from stable_baselines.common.policies import MlpPolicy, CnnPolicy
    from stable_baselines import TRPO

    env = gym.make('BreakoutNoFrameskip-v4')  # or 'CartPole-v1'
    model = TRPO(CnnPolicy, env, timesteps_per_batch=1024, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("trpo_cartpole")  # filename kept from the CartPole example

    del model  # remove to demonstrate saving and loading
    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def main(game, num_timesteps, num_episodes, dir_name, model_name, policy,
         discount=0.99, batch_size=1024):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    eval_log_dir = f"logs/{dir_name}/{model_name}"
    tr_log_dir = f"{eval_log_dir}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(tr_log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_env(game)
    env.seed(309)
    model = TRPO(policy=policy, env=env, gamma=discount,
                 timesteps_per_batch=batch_size, verbose=1, seed=309,
                 tensorboard_log=tr_log_dir, n_cpu_tf_sess=1)
    model.learn(total_timesteps=num_timesteps)
    model.save(f"{model_dir}/{model_name}")

    eps_done = 0
    ep_rewards = np.array([0] * num_episodes)
    curr_rewards = 0
    obs = env.reset()
    while eps_done != num_episodes:
        if eps_done % 10 == 0:
            print(f"Episodes completed: {eps_done} / {num_episodes}", end="\r")
        # Vectorised environments are reset automatically when done, so the
        # returned obs is already the start state of the next episode.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render(mode="human")
        curr_rewards += reward[0]
        if done[0]:
            ep_rewards[eps_done] = curr_rewards
            curr_rewards = 0
            eps_done += 1
    print("All episodes completed")
    env.close()

    # Outliers: anything outside 3 standard deviations of the mean.
    mean = ep_rewards.mean()
    std_dev = ep_rewards.std()
    outlier_threshold_upper = mean + 3 * std_dev
    outlier_threshold_lower = mean - 3 * std_dev
    trimmed_rewards = np.array([
        rew for rew in ep_rewards
        if outlier_threshold_lower <= rew <= outlier_threshold_upper
    ])
    avg_reward = trimmed_rewards.mean()
    print(f"Average score over {num_episodes} games: {avg_reward:.2f}")

    # Log per-episode rewards and the trimmed average to TensorBoard.
    summary_writer = tf.summary.FileWriter(eval_log_dir)
    sess = tf.Session()
    rew_var = tf.Variable(0, dtype=tf.int64)
    rew_val = tf.summary.scalar(f"Reward / Episode ({model_name})", rew_var)
    for i in range(num_episodes):
        rew = ep_rewards[i]
        sess.run(rew_var.assign(rew))
        summary_writer.add_summary(sess.run(rew_val), i)
    avg_var = tf.Variable(0.0, dtype=tf.float64)
    avg_val = tf.summary.scalar(f"Trimmed Average ({model_name})", avg_var)
    sess.run(avg_var.assign(avg_reward))
    summary_writer.add_summary(sess.run(avg_val), 0)
    summary_writer.flush()
    summary_writer.close()
    sess.close()
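
# Illustrative invocation of main(); every argument value here is an
# assumption, not taken from the source:
if __name__ == "__main__":
    main(game="BreakoutNoFrameskip-v4", num_timesteps=100000,
         num_episodes=30, dir_name="atari", model_name="trpo_breakout",
         policy="CnnPolicy")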
# (Fragment: tail of a loop over (env, expName) pairs; earlier controller
# variants are elided in the source.)
for env, expName in [
    (BSS_Controller_Supply_Direction_Prediction(
        env_settings_init, budget,
        open(letter + "/v6_stepsBudget" + str(budget) + ".csv", 'a+')), "v6")
]:
    accumulatedRew = 0
    iterations = 0
    outFile = open(letter + "/" + expName + "_perfBudget" + str(budget) + ".csv", 'a+')
    agent = TRPO(MlpPolicy, env)
    state = env.reset()

    start = time.time()
    print("Beginning to learn " + expName)
    agent.learn(learnSteps)
    print(time.time() - start)
    print("\tDone Learning")

    for _ in range(evaluationLen):
        # predict() returns (action, state); the env expects just the action.
        action = agent.predict(state)
        state, reward, done, info = env.step(action[0])
        accumulatedRew += reward
        iterations += 1
        if done:
            outFile.write(str("%.4f" % (accumulatedRew / iterations)) + "," +
                          str(env.getBudget()) + "\n")
            accumulatedRew = 0
            iterations = 0
            env.reset()

    outFile.close()
    env.close()

''' No Agent '''
print("No agent")
"feature_extraction": "mlp", "act_fun": tf.keras.activations.linear } model = TRPO(FFP, pol_env, verbose=0, policy_kwargs=pol_kwargs) model.learn(total_timesteps=pol_timesteps) # evaluate the policy print("Evaluating policy...") n_evals = 5 eval_rollout = int(200 / 3) eval_rewards = [] for _ in range(n_evals): obs = pol_env.reset() rollout_rewards = [] for _ in range(eval_rollout): action, _states = model.predict(obs) obs, rewards, dones, info = pol_env.step(action) rollout_rewards.append(rewards / 3) eval_rewards.append(np.mean(rollout_rewards)) print("Mean eval step reward: {}".format(np.mean(eval_rewards))) # update the policy and sampler objects pol = EncoderPolicy(TorchStateEncoder(encnet), model) sampler = srt.PolicyTrajectorySampler(env, pol, T) # save stuff torch.save(rep_model, "./repnet") model.save("./model") # train the model more? """
print('Model chosen not available, check spelling or if it is supported')

# Load the expert dataset; traj_limitation=-1 uses the whole dataset
# (set it to 1 to use only one expert trajectory).
dataset = ExpertDataset(expert_path='./pretrain/dummy_quadruped.npz',
                        traj_limitation=-1, batch_size=128)
model.pretrain(dataset, n_epochs=args['pt'])

if args['pretrainVisualization']:
    # Test the pre-trained model
    env = model.get_env()
    obs = env.reset()
    reward_sum = 0.0
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
        if done:
            print(reward_sum)
            reward_sum = 0.0
            obs = env.reset()

# As an option, you can train the RL agent further
model.learn(total_timesteps=args['timesteps'])
model.save('./pretrain/Preentrenado_{} bs, {} timesteps'.format(
    args['bs'], args['timesteps']))
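
# One way (an assumption, not from the source) to produce an expert dataset
# like './pretrain/dummy_quadruped.npz': stable-baselines ships
# generate_expert_traj, which records episodes from a trained model.
from stable_baselines.gail import generate_expert_traj

# `expert_model` is a hypothetical, already-trained agent.
generate_expert_traj(expert_model, './pretrain/dummy_quadruped', n_episodes=10)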
def main():
    # Parameters for the gym_carla environment
    params = {
        'number_of_vehicles': 8,
        'number_of_walkers': 0,
        'display_size': 256,  # screen size of bird-eye render
        'max_past_step': 1,  # the number of past steps to draw
        'dt': 0.1,  # time interval between two frames
        'discrete': True,  # whether to use discrete control space
        'continuous_accel_range': [-3.0, 3.0],  # continuous acceleration range
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'town': 'Town06',  # which town to simulate
        'task_mode': 'acc_1',  # mode of the task, [random, roundabout (only for Town03)]
        'max_time_episode': 1000,  # maximum timesteps per episode
        'max_waypt': 12,  # maximum number of waypoints
        'obs_range': 32,  # observation range (meters)
        'lidar_bin': 0.125,  # bin size of lidar sensor (meters)
        'd_behind': 12,  # distance behind the ego vehicle (meters)
        'out_lane_thres': 2.0,  # threshold for being out of lane
        'desired_speed': 16.67,  # desired speed (m/s)
        'max_ego_spawn_times': 200,  # maximum times to spawn the ego vehicle
        'display_route': True,  # whether to render the desired route
        'pixor_size': 64,  # size of the pixor labels
        'pixor': False,  # whether to output PIXOR observation
        'RGB_cam': True,  # whether to use the RGB camera sensor
    }

    solver_params = {
        'layers': [64, 64, 64],
        'alpha': 0.001,
        'gamma': 0.99,
        'epsilon': 0.1,
        'replay_memory_size': 500000,
        'update_target_estimator_every': 10000,
        'batch_size': 64,
    }

    # Set up the gym-carla environment
    env = gym.make('carla-v0', params=params)
    # check_env(env)
    obs = env.reset()

    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./trpo_checkpoint/',
                                             name_prefix='trpo_check')
    # model = DQN.load("./trpo_checkpoint/trpo_check_200_steps.zip", env=env,
    #                  tensorboard_log="./trpo")
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log="./trpo")
    model.learn(total_timesteps=35000, tb_log_name="35k-with-checkpoint",
                callback=checkpoint_callback)
    model.save("trpo_carla")

    del model  # remove to demonstrate saving and loading
    model = TRPO.load("trpo_carla")

    # Run 100 evaluation episodes with the reloaded model.
    obs = env.reset()
    for i in range(100):
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                obs = env.reset()
                break