Example #1
def test_action_mask_run_trpo(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = TRPO(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        # Collect the per-environment action masks reported in the info dicts
        # for the next call to predict().
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
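
The mask here is read back from each step's info dict, so the env_class passed to the test is expected to publish one there. A minimal sketch of such an environment, assuming the classic gym API and a hypothetical MaskedDiscreteEnv that is not part of the original test suite:

import gym
import numpy as np
from gym import spaces


class MaskedDiscreteEnv(gym.Env):
    """Toy environment that reports which of its three actions are currently valid."""

    def __init__(self):
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(0.0, 1.0, shape=(4,), dtype=np.float32)
        self._steps = 0

    def reset(self):
        self._steps = 0
        return self.observation_space.sample()

    def step(self, action):
        self._steps += 1
        obs = self.observation_space.sample()
        done = self._steps >= 10
        # The test above reads this mask back via info.get('action_mask').
        info = {'action_mask': np.array([1, 1, self._steps % 2], dtype=np.int8)}
        return obs, 1.0, done, info
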
def render_to_gif():
    # Assumes module-level imports: gym, matplotlib.pyplot as plt,
    # matplotlib.animation as animation, plus TRPO and MlpPolicy from stable_baselines.
    def save_frames_as_gif(frames,
                           path='./',
                           filename='growspace_with_trpo.gif'):
        # Mess with this to change frame size
        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0),
                   dpi=72)

        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(),
                                       animate,
                                       frames=len(frames),
                                       interval=50)
        anim.save(path + filename, writer='imagemagick', fps=60)
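        # If ImageMagick is not installed, matplotlib's bundled Pillow writer is
        # a possible substitute (assumes the Pillow package is available):
        # anim.save(path + filename, writer='pillow', fps=60)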

    env = gym.make('GrowSpaceEnv-Control-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")

    # del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    frames = []
    obs = env.reset()
    for _ in range(150):
        # while True:
        frames.append(env.render(mode="rgb_array"))

        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        # if done:
        #     break
        # env.render()

    env.close()
    save_frames_as_gif(frames)
def render_growspace_with_trpo():
    env = gym.make('GrowSpaceEnv-Control-v0')
    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")
    #
    # del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    for t in range(150):
        print(t)
        # while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)

        # if dones:
        #     env.reset()
        env.render()
Example #4
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_ddpg(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = TRPO("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARING trpo")
    original_env.force_progression = False
    model.learn(int(2e5), seed=seed)
    print("DONE LEARING trpo")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward, step=0)  # newer Optuna versions require a step argument

    return last_reward
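
For context, an objective like this is normally handed to an Optuna study. A minimal sketch, assuming the missing helpers (e.g. optimize_ddpg) are defined elsewhere and that the mean reward returned above is to be maximised; the trial count is illustrative:

import optuna

study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=20)
print(study.best_params)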
Example #5
File: main.py  Project: ddlau/needle
def tst():
    def _init_openmpi():
        """Pre-load libmpi.dll and register OpenMPI distribution."""
        import os
        import ctypes
        if os.name != 'nt' or 'OPENMPI_HOME' in os.environ:
            return
        try:
            openmpi_home = os.path.abspath(os.path.dirname(__file__))
            openmpi_bin = os.path.join(openmpi_home, 'bin')
            os.environ['OPENMPI_HOME'] = openmpi_home
            os.environ['PATH'] = ';'.join((openmpi_bin, os.environ['PATH']))
            ctypes.cdll.LoadLibrary(os.path.join(openmpi_bin, 'libmpi.dll'))
        except Exception:
            pass

    _init_openmpi()

    import gym

    from stable_baselines.common.policies import MlpPolicy, CnnPolicy
    from stable_baselines import TRPO

    env = gym.make('BreakoutNoFrameskip-v4')  # alternatively: 'CartPole-v1'

    model = TRPO(CnnPolicy, env, timesteps_per_batch=1024, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("trpo_cartpole")

    del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
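
Note that raw BreakoutNoFrameskip-v4 frames are fed to the CnnPolicy here; Atari runs are usually wrapped with the standard preprocessing first. A sketch of an alternative setup using stable-baselines' helpers (not part of the original snippet):

from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

# Frame-skipped, 84x84 greyscale Atari env with 4 stacked frames.
env = make_atari_env('BreakoutNoFrameskip-v4', num_env=1, seed=0)
env = VecFrameStack(env, n_stack=4)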
Example #6
def main(game,
         num_timesteps,
         num_episodes,
         dir_name,
         model_name,
         policy,
         discount=0.99,
         batch_size=1024):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)

    eval_log_dir = f"logs/{dir_name}/{model_name}"
    tr_log_dir = f"{eval_log_dir}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(tr_log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_env(game)
    env.seed(309)

    model = TRPO(policy=policy,
                 env=env,
                 gamma=discount,
                 timesteps_per_batch=batch_size,
                 verbose=1,
                 seed=309,
                 tensorboard_log=tr_log_dir,
                 n_cpu_tf_sess=1)
    model.learn(total_timesteps=num_timesteps)
    model.save(f"{model_dir}/{model_name}")

    eps_done = 0
    ep_rewards = np.array([0] * num_episodes)
    curr_rewards = 0
    obs = env.reset()
    while eps_done != num_episodes:
        if eps_done % 10 == 0:
            print(f"Episodes completed: {eps_done} / {num_episodes}", end="\r")
        # Vectorised environments reset automatically when an episode ends, so
        # the returned obs is already the first observation of the next episode.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render(mode="human")
        curr_rewards += reward[0]
        if done[0]:
            ep_rewards[eps_done] = curr_rewards
            curr_rewards = 0
            eps_done += 1
    print("All episodes completed")
    env.close()

    mean = ep_rewards.mean()
    std_dev = ep_rewards.std()
    # Outliers: outside of 3 standard deviations
    outlier_threshold_upper = mean + 3 * std_dev
    outlier_threshold_lower = mean - 3 * std_dev
    trimmed_rewards = np.array([
        rew for rew in ep_rewards
        if outlier_threshold_lower <= rew <= outlier_threshold_upper
    ])
    avg_reward = trimmed_rewards.mean()
    print(f"Average score over {num_episodes} games: {avg_reward:.2f}")

    summary_writer = tf.summary.FileWriter(eval_log_dir)
    sess = tf.Session()
    rew_var = tf.Variable(0, dtype=tf.int64)
    rew_val = tf.summary.scalar(f"Reward / Episode ({model_name})", rew_var)
    for i in range(num_episodes):
        rew = ep_rewards[i]
        sess.run(rew_var.assign(rew))
        summary_writer.add_summary(sess.run(rew_val), i)

    avg_var = tf.Variable(0.0, dtype=tf.float64)
    avg_val = tf.summary.scalar(f"Trimmed Average ({model_name})", avg_var)
    sess.run(avg_var.assign(avg_reward))
    summary_writer.add_summary(sess.run(avg_val), 0)

    summary_writer.flush()
    summary_writer.close()
    sess.close()
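
A hypothetical invocation of this entry point; the game, step counts, and names below are illustrative only:

if __name__ == "__main__":
    main(game="PongNoFrameskip-v4",
         num_timesteps=100000,
         num_episodes=30,
         dir_name="atari",
         model_name="trpo_pong",
         policy="CnnPolicy")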
Example #7
                BSS_Controller_Supply_Direction_Prediction(env_settings_init, budget, open(letter+ "/v6_stepsBudget" + str(budget) + ".csv", 'a+')),
                "v6"
            )
        ]:
            accumulatedRew = 0
            iterations = 0
            outFile = open(letter + "/" + expName + "_perfBudget" + str(budget) + ".csv", 'a+')
            agent = TRPO(MlpPolicy, env)
            state = env.reset()
            start = time.time()
            print("Beginning to learn " + expName)
            agent.learn(learnSteps)
            print(time.time() - start)
            print("\tDone Learning")
            for _ in range(evaluationLen):
                action = agent.predict(state)
                state, reward, done, info = env.step(action[0])
                accumulatedRew += reward
                iterations += 1
                if done:
                    outFile.write(str("%.4f" % (accumulatedRew/iterations)) + "," + str(env.getBudget()) + "\n")
                    accumulatedRew = 0
                    iterations = 0
                    env.reset()
            outFile.close()
            env.close()

    '''
    No Agent
    '''
    print("No agent")
Example #8
        "feature_extraction": "mlp",
        "act_fun": tf.keras.activations.linear
    }
    model = TRPO(FFP, pol_env, verbose=0, policy_kwargs=pol_kwargs)
    model.learn(total_timesteps=pol_timesteps)

    # evaluate the policy
    print("Evaluating policy...")
    n_evals = 5
    eval_rollout = int(200 / 3)
    eval_rewards = []
    for _ in range(n_evals):
        obs = pol_env.reset()
        rollout_rewards = []
        for _ in range(eval_rollout):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = pol_env.step(action)
            rollout_rewards.append(rewards / 3)
        eval_rewards.append(np.mean(rollout_rewards))
    print("Mean eval step reward: {}".format(np.mean(eval_rewards)))

    # update the policy and sampler objects
    pol = EncoderPolicy(TorchStateEncoder(encnet), model)
    sampler = srt.PolicyTrajectorySampler(env, pol, T)

# save stuff
torch.save(rep_model, "./repnet")
model.save("./model")

# train the model more?
"""
    print('Model chosen is not available; check the spelling or whether it is supported')

# Using only one expert trajectory
# you can specify `traj_limitation=-1` for using the whole dataset
dataset = ExpertDataset(expert_path='./pretrain/dummy_quadruped.npz',
                        traj_limitation=-1,
                        batch_size=128)

model.pretrain(dataset, n_epochs=args['pt'])

if args['pretrainVisualization']:
    # Test the pre-trained model
    env = model.get_env()
    obs = env.reset()

    reward_sum = 0.0
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
        if done:
            print(reward_sum)
            reward_sum = 0.0
            obs = env.reset()

# As an option, you can train the RL agent
model.learn(total_timesteps=args['timesteps'])
model.save('./pretrain/Preentrenado_{} bs, {} timesteps'.format(
    args['bs'], args['timesteps']))
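
The ./pretrain/dummy_quadruped.npz archive consumed above is assumed to exist already. stable-baselines can record such an expert dataset from a trained (or scripted) agent; a minimal sketch, where expert_model and the episode count are illustrative:

from stable_baselines.gail import generate_expert_traj

# Roll out an existing agent and save its trajectories in the .npz format
# expected by ExpertDataset (np.savez appends the .npz extension).
generate_expert_traj(expert_model, 'pretrain/dummy_quadruped', n_episodes=10)
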
Example #10
def main():
    # parameters for the gym_carla environment
    params = {
        'number_of_vehicles': 8,
        'number_of_walkers': 0,
        'display_size': 256,  # screen size of bird-eye render
        'max_past_step': 1,  # the number of past steps to draw
        'dt': 0.1,  # time interval between two frames
        'discrete': True,  # whether to use discrete control space
        'continuous_accel_range': [-3.0, 3.0],  # continuous acceleration range
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'town': 'Town06',  # which town to simulate
        'task_mode': 'acc_1',  # mode of the task, [random, roundabout (only for Town03)]
        'max_time_episode': 1000,  # maximum timesteps per episode
        'max_waypt': 12,  # maximum number of waypoints
        'obs_range': 32,  # observation range (meter)
        'lidar_bin': 0.125,  # bin size of lidar sensor (meter)
        'd_behind': 12,  # distance behind the ego vehicle (meter)
        'out_lane_thres': 2.0,  # threshold for out of lane
        'desired_speed': 16.67,  # desired speed (m/s)
        'max_ego_spawn_times': 200,  # maximum times to spawn ego vehicle
        'display_route': True,  # whether to render the desired route
        'pixor_size': 64,  # size of the pixor labels
        'pixor': False,  # whether to output PIXOR observation
        'RGB_cam': True,  # whether to use RGB camera sensor
    }
    solver_params = {
        'layers': [64, 64, 64],
        'alpha': 0.001,
        'gamma': 0.99,
        'epsilon': 0.1,
        'replay_memory_size': 500000,
        'update_target_estimator_every': 10000,
        'batch_size': 64,
    }
    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)
    # check_env(env)
    obs = env.reset()
    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./trpo_checkpoint/',
                                             name_prefix='trpo_check')

    # model = DQN.load("./trpo_checkpoint/trpo_check_200_steps.zip", env=env, tensorboard_log="./trpo")
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log="./trpo")
    model.learn(total_timesteps=35000,
                tb_log_name="35k-with-checkpoint",
                callback=checkpoint_callback)
    model.save("trpo_carla")

    del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_carla")

    obs = env.reset()
    for i in range(100):
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                obs = env.reset()
                break
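
As an alternative to the manual rollout loop above, stable-baselines ships an evaluation helper; a short sketch with an illustrative episode count:

from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print("mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))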