Example #1
def train_agent_with_ddpg(load):
    from stable_baselines.ddpg.policies import FeedForwardPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG
    from stable_baselines.gail import ExpertDataset  # needed for the pretraining dataset below

    # Create and wrap the environment
    env = gym.make('F16GCAS-v0')
    env = DummyVecEnv([lambda: env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.01) * np.ones(n_actions))

    # Custom MLP policy with two hidden layers of 128 units each
    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    model = DDPG(CustomPolicy, env, verbose=1, action_noise=action_noise)

    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
        model.save(ROOT+"/trained_models/TDRL/f16/ddpg/128_128")
    else:
        model = DDPG.load(ROOT+"/trained_models/TDRL/f16/ddpg/128_128", policy=CustomPolicy, env=env)

    return model
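The pretraining step above expects an expert dataset at "./lqr_export.npz". As a hedged companion sketch (not from the original project), such a file can be recorded with stable-baselines' generate_expert_traj helper, using any callable as the expert in place of the random placeholder below:

def record_expert_dataset():
    import gym
    from stable_baselines.gail import generate_expert_traj

    env = gym.make('F16GCAS-v0')

    def expert_fn(_obs):
        # Placeholder expert: swap in the real LQR controller's action here.
        return env.action_space.sample()

    # Writes ./lqr_export.npz containing observations, actions, rewards and episode returns.
    generate_expert_traj(expert_fn, save_path="./lqr_export", env=env, n_episodes=10)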
Example #2
def main(output_folder_path: Path):
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLPIDAgent,
        "max_collision": 5
    }

    env = gym.make('roar-pid-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "tensorboard_log": (output_folder_path / "tensorboard").as_posix()
    }
    latest_model_path = find_latest_model(output_folder_path)
    if latest_model_path is None:
        model = DDPG(LnMlpPolicy, env=env, **model_params)  # full tensorboard log can take up space quickly
    else:
        model = DDPG.load(latest_model_path, env=env, **model_params)
        model.render = True
        model.tensorboard_log = (output_folder_path / "tensorboard").as_posix()

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000, verbose=2, save_path=(output_folder_path / "checkpoints").as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList([checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10), callback=callbacks, reset_num_timesteps=False)
    model.save(f"pid_ddpg_{datetime.now()}")
Example #3
def test_ddpg_normalization():
    """
    Test that observations and returns normalizations are properly saved and loaded.
    """
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                         desired_action_stddev=0.05)
    model = DDPG('MlpPolicy',
                 'Pendulum-v0',
                 memory_limit=50000,
                 normalize_observations=True,
                 normalize_returns=True,
                 nb_rollout_steps=128,
                 nb_train_steps=1,
                 batch_size=64,
                 param_noise=param_noise)
    model.learn(1000)
    obs_rms_params = model.sess.run(model.obs_rms_params)
    ret_rms_params = model.sess.run(model.ret_rms_params)
    model.save('./test_ddpg.zip')

    loaded_model = DDPG.load('./test_ddpg.zip')
    obs_rms_params_2 = loaded_model.sess.run(loaded_model.obs_rms_params)
    ret_rms_params_2 = loaded_model.sess.run(loaded_model.ret_rms_params)

    for param, param_loaded in zip(obs_rms_params + ret_rms_params,
                                   obs_rms_params_2 + ret_rms_params_2):
        assert np.allclose(param, param_loaded)

    del model, loaded_model

    if os.path.exists("./test_ddpg.zip"):
        os.remove("./test_ddpg.zip")
Example #4
    def _get_weights(self):
        class CustomPolicy(FeedForwardPolicy):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args,
                                                   **kwargs,
                                                   layers=[128, 128],
                                                   layer_norm=False,
                                                   feature_extraction="mlp")

        DVenv = DummyVecEnv([lambda: self.env])
        self.nn_model = DDPG.load(ROOT +
                                  "/trained_models/TDRL/f16/ddpg/128_128",
                                  policy=CustomPolicy)

        with self.nn_model.graph.as_default():
            # print(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi'))
            # print(tf.all_variables())
            # train_writer = tf.summary.FileWriter('./neural_network_graph', model.sess.graph)
            wb_list = self.nn_model.sess.run(
                tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                  scope='model/pi'))

        self.w_list = []
        self.b_list = []
        count = 0
        with tf.name_scope("neural_controller"):
            for wb in wb_list:
                if count % 2 == 0:
                    self.w_list.append(tf.convert_to_tensor(wb, name="w"))
                else:
                    self.b_list.append(tf.convert_to_tensor(wb, name="b"))
                count += 1
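As a reference for what _get_weights produces, here is a minimal sketch (an assumption, not project code) of evaluating the actor manually from w_list / b_list; it presumes the default stable-baselines MLP actor, i.e. ReLU hidden layers and a tanh output:

def manual_actor_forward(obs, w_list, b_list):
    # obs: [batch, obs_dim] float32 tensor; weights/biases ordered as extracted above.
    h = obs
    for w, b in zip(w_list[:-1], b_list[:-1]):
        h = tf.nn.relu(tf.matmul(h, w) + b)  # hidden layers (assumed ReLU)
    return tf.nn.tanh(tf.matmul(h, w_list[-1]) + b_list[-1])  # action squashed to [-1, 1] (assumed tanh)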
Example #5
def test(model_path: str, exp_config: dict):

    test_env, _ = init_env(exp_config)

    if ALG == 'ddpg':
        model = DDPG.load(model_path, env=test_env)
    elif ALG == 'trpo':
        model = TRPO.load(model_path, env=test_env)
    elif ALG == 'ppo2':
        model = PPO2.load(model_path, env=test_env)
    elif ALG == 'her':
        # model = HER.load(model_path, env=test_env)
        raise NotImplementedError()
    else:
        raise ValueError(f'Unknown algorithm "{ALG}"!')

    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)

    raw_env = monitor.unwrapped  # type: GaussianPendulumEnv
    assert isinstance(raw_env, GaussianPendulumEnv)

    raw_env.configure(seed=42,
                      mass_mean=(0.05, 1.5),
                      mass_stdev=(0.01, 0.15),
                      embed_knowledge=exp_config.get('embed_knowledge', False),
                      perfect_knowledge=exp_config.get('perfect_knowledge',
                                                       False),
                      gym_env=test_env)

    runs = np.zeros((TEST_RUNS, 4))
    fixed_masses = np.linspace(0.030, 1.600, TEST_RUNS)

    for test_ep in range(runs.shape[0]):

        obs = test_env.reset()

        if TEST_LINSPACE_MASS:
            p = raw_env.physical_props
            raw_env.physical_props = p[0], fixed_masses[test_ep], p[2]

        mass_distr_params = raw_env.mass_distr_params.copy()
        sampled_mass = raw_env.physical_props[1]

        while True:
            action, states = model.predict(obs, deterministic=True)
            obs, rewards, dones, info = test_env.step(action)
            rewards_by_episode = monitor.episode_rewards
            episode = len(rewards_by_episode)
            if episode != test_ep:
                break

        last_tot_reward = rewards_by_episode[-1]
        runs[test_ep, :] = (mass_distr_params[0], mass_distr_params[1],
                            sampled_mass, last_tot_reward)

    avg_reward = runs[:, 3].mean()
    print(f'Avg. test reward: {avg_reward}\n')

    return runs
Example #6
def test_build_chain():
    chain_length = 1000
    f16_model = F16TFModel(chain_length)
    # f16_model.sess.run([f16_model.roll_state_assign, f16_model.pull_state_assign, f16_model.done_state_assign],
    #                             feed_dict={f16_model.roll_state_placeholder: 2.00-.001,
    #                                       f16_model.pull_state_placeholder: 3.27-.001,
    #                                       f16_model.done_state_placeholder: 9.98-.001})

    env = gym.make("F16GCAS-v0")
    ob = env.reset()
    x0 = env.states[-1]

    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args,
                                               **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    DVenv = DummyVecEnv([lambda: f16_model.env])
    model = DDPG.load(ROOT + "/trained_models/TDRL/f16/ddpg/128_128",
                      env=DVenv,
                      policy=CustomPolicy)

    assign_x0 = tf.assign(f16_model.x0, x0)
    f16_model.sess.run(assign_x0)
    f16_model.update_change_points()
    x_out = f16_model.sess.run(f16_model.xt_list[-1])

    trace, reward = simulation_with_nn(env, chain_length, model, x0, mute=True)

    print("")
    print(x_out - env.states[-1])
Example #7
def ddpg(env_id,
         timesteps,
         policy="MlpPolicy",
         log_interval=None,
         tensorboard_log=None,
         seed=None,
         load_weights=None):
    from stable_baselines import DDPG

    env = gym.make(env_id)

    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    if load_weights is not None:
        model = DDPG.load(load_weights, env=env)
    else:
        model = DDPG(policy,
                     env,
                     verbose=1,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="ddpg", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
    save_model_weights(model, "ddpg", env_id, policy, seed=seed, path=".")
Example #8
def load_model(path: str, env, desc: str):
    """ Loads a model from a stable baseline checkpoint file into a memory representation 

    Args:
        path        (str)           :       Path to the Stable Baseline Checkpoint File 
        env         (SB Env)        :       Path to the Stable Baseline Checkpoint File 
        desc        (str)           :       Text Description of what model this is

    Returns:
        The loaded model
    """

    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")
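A hedged usage sketch for load_model; the environment id and checkpoint path below are placeholders, not taken from the original repository:

import gym

env = gym.make("Pendulum-v0")   # placeholder continuous-control environment
model = load_model(path="checkpoints/ddpg_pendulum.zip",  # hypothetical checkpoint path
                   env=env, desc="ddpg")
obs = env.reset()
action, _states = model.predict(obs, deterministic=True)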
Example #9
    def my_compute_data(self, args, env, params, n_episodes):
        env = gym.make('gym_quadcopter:quadcopter-v' + str(args.env))
        for alg, start_index, end_index, step, suffix in params:
            re_d = []
            sr_d = []
            rewards, s_rates = [], []
            for i in range(start_index, end_index, step):
                print("")
                print(
                    f"Working on alg={alg}, start_index={start_index}, end_index={end_index}, step={step}, suffix={suffix}, i={i}"
                )
                path = f"{self.base_dir}models/{alg}/quadcopter-v{args.env}-{i}{suffix}.pkl"
                print(f"Evaluating model at {path}")
                if not os.path.exists(path):
                    print(f"WARNING: File {path} does not exist --> SKIPPING")
                    continue

                if alg == "ddpg":
                    model = DDPG.load(path)
                elif alg == "ppo":
                    model = PPO2.load(path)
                else:
                    model = TRPO.load(path)
                r, su = mean_eval(n_episodes, model, env, False, False)
                print(f"Average Success Rate: {su}")
                rewards.append(r)
                s_rates.append(su[0])

            i_max = np.argmax(s_rates)
            re_d.append(rewards)
            sr_d.append(s_rates)
            return re_d, sr_d
Example #10
def test_DDPG(env, out_dir, seed=None, **kwargs):

    model = DDPG.load(os.path.join(out_dir, 'final_model.pkl'), env=env)

    #model.learn(total_timesteps=10000)
    # Evaluate the trained agent
    mean_reward = evaluate(env, model, num_steps=5000)

    return
Example #11
File: plot_ult.py Project: yizhoucc/ffsb
def plot_path_ddpg(modelname, env, num_episode=None):

    from stable_baselines import DDPG

    num_episode = 20 if num_episode is None else num_episode

    agent = DDPG.load(modelname, env=env)

    # create saving vars
    all_ep = []
    # for each episode,
    for i in range(num_episode):
        ep_data = {}
        ep_statex = []
        ep_statey = []
        ep_belifx = []
        ep_belify = []
        # get goal position at start
        decisioninfo = env.reset()
        goalx = env.goalx
        goaly = env.goaly
        ep_data['goalx'] = goalx
        ep_data['goaly'] = goaly
        # log the actions raw, v and w
        while not env.stop:
            action, _ = agent.predict(decisioninfo)
            decisioninfo, _, _, _ = env.step(action)
            ep_statex.append(env.s[0, 0])
            ep_statey.append(env.s[0, 1])
            ep_belifx.append(env.b[0, 0])
            ep_belify.append(env.b[0, 1])
        ep_data['x'] = ep_statex
        ep_data['y'] = ep_statey
        ep_data['bx'] = ep_belifx
        ep_data['by'] = ep_belify
        ep_data['goalx'] = env.goalx
        ep_data['goaly'] = env.goaly
        ep_data['theta'] = env.theta.tolist()
        # save episode data dict to all data
        all_ep.append(ep_data)

    for i in range(num_episode):
        plt.figure()
        ep_xt = all_ep[i]['x']
        ep_yt = all_ep[i]['y']
        plt.title(str(['{:.2f}'.format(x) for x in all_ep[i]['theta']]))
        plt.plot(ep_xt, ep_yt, 'r-')
        plt.plot(all_ep[i]['bx'], all_ep[i]['by'], 'b-')
        # plt.scatter(all_ep[i]['goalx'],all_ep[i]['goaly'])

        circle = np.linspace(0, 2 * np.pi, 100)
        r = all_ep[i]['theta'][-1]
        x = r * np.cos(circle) + all_ep[i]['goalx'].item()
        y = r * np.sin(circle) + all_ep[i]['goaly'].item()
        plt.plot(x, y)

        plt.savefig('path.png')
Example #12
def get_policy(name="ddpg"):
    """
    Note: ppo requires the NeuralShield package in the docker.
    :param name: pretrained policy name
    :return: stable baselines policy
    """
    if name == "ppo":
        return PPO2.load(get_dir_root() + "/pretrained/ppo.pkl")
    elif name == "ddpg":
        return DDPG.load(get_dir_root() + "/pretrained/ddpg.pkl")
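Illustrative only: one way the returned policy might be queried; the environment id is a placeholder and not part of get_policy:

import gym

env = gym.make("Pendulum-v0")   # placeholder; the pretrained policies target the project's own env
policy = get_policy("ddpg")
obs = env.reset()
action, _states = policy.predict(obs, deterministic=True)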
Example #13
    def __init__(self, agent: Agent, steering_boundary: Tuple[float, float],
                 throttle_boundary: Tuple[float, float], **kwargs):
        super().__init__(agent, **kwargs)
        self.max_speed = self.agent.agent_settings.max_speed
        self.throttle_boundary = throttle_boundary
        self.steering_boundary = steering_boundary
        self.long_pid_controller = LongPIDController(
            agent=agent,
            throttle_boundary=throttle_boundary,
            max_speed=self.max_speed)
        self.lat_pid_controller = LatPIDController(
            agent=agent, steering_boundary=steering_boundary)
        self.logger = logging.getLogger(__name__)
        try:
            self.pid_rl_model = DDPG.load(
                Path("./ROAR_Sim/data/weights/rl_pid_model.zip"))
        except Exception:
            path = Path(self.agent.kwargs['kwargs']["rl_pid_model_file_path"])
            self.pid_rl_model = DDPG.load(load_path=path)
Example #14
def testing(env, name):
    model = DDPG.load("models\\ddpg_sbl_" + name)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done or env.steps > 1000:
            env.reset()
Example #15
def main(argv):
    # -p
    fixed = False
    # -j
    numControlledJoints = 7
    # -n
    policy_name = "pushing_policy"

    # COMMAND LINE PARAMS MANAGEMENT:
    try:
        opts, args = getopt.getopt(argv, "hj:p:n:", ["j=", "p=", "n="])
    except getopt.GetoptError:
        print('test.py -j <numJoints> -p <fixedPoseObject> -n <policy_name>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('------------------ Default values:')
            print(
                'test.py  -j <numJoints: 7> -p <fixedPoseObject: False> -n <policy_name:"pushing_policy"> '
            )
            print('------------------')
            return 0
        elif opt in ("-j", "--j"):
            if int(arg) > 7:
                print("Check dimension state")
                return 0
            else:
                numControlledJoints = int(arg)
        elif opt in ("-p", "--p"):
            fixed = bool(arg)
        elif opt in ("-n", "--n"):
            policy_name = str(arg)

    print(colored("-----Number Joints Controlled:", "red"))
    print(colored(numControlledJoints, "red"))
    print(colored("-----Object Position Fixed:", "red"))
    print(colored(fixed, "red"))
    print(colored("-----Policy Name:", "red"))
    print(colored(policy_name, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model = DDPG.load(policy_name)

    pandaenv = pandaPushGymEnv(urdfRoot=robot_data.getDataPath(),
                               renders=True,
                               useIK=0,
                               numControlledJoints=numControlledJoints,
                               fixedPositionObj=fixed,
                               includeVelObs=True)
    obs = pandaenv.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = pandaenv.step(action)
Example #16
def main():

    # create Environment
    env = iCubPushGymEnv(urdfRoot=robot_data.getDataPath(), renders=True, useIK=1,
                        isDiscrete=0, rnd_obj_pose=0, maxSteps=2000, reward_type=0)


    model = DDPG.load(os.path.join(log_dir,'final_model.pkl'), env=env)

    #model.learn(total_timesteps=10000)
    # Evaluate the trained agent
    mean_reward = evaluate(env, model, num_steps=6000)
Example #17
    def f_checkpoints_range_2_mean_performance(
            self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
        logging.debug(
            f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}"
        )
        rewards = np.zeros(len(checkpoints))
        s_rates = np.zeros(len(checkpoints))
        # Intent
        # - Iterate over this range and load the associated Stable Baselines model checkpoint
        # - Pass that model to the `mean_eval` evaluation function, which evaluates the model on
        #   - a certain number of episodes
        #   - a certain env
        #   - a continuous or non-continuous action space
        # - Each evaluation returns a reward and an average success rate
        #
        # Evaluate N checkpoints over M episodes each, then average over M, so we end up with N rewards and N success rates

        j = 0
        # NOTE: i can take arbitrary checkpoint indices while j indexes the numpy arrays sequentially
        for i in checkpoints:
            path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
            logging.debug(f"Evaluating model at {path}")
            if self.args.model['name'] == "ddpg":
                model = DDPG.load(path)
            elif self.args.model['name'] == "ppo":
                model = PPO2.load(path)
            elif self.args.model['name'] == "trpo":
                model = TRPO.load(path)
            elif self.args.model['name'] == "td3":
                model = TD3.load(path)
            elif self.args.model['name'] == "sac":
                model = SAC.load(path)
            else:
                raise ValueError(f"Unknown model name {self.args.model['name']}")
            logging.debug(
                f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes in {self.args.env} environment with continuous={str(self.args.continuous)}"
            )
            rewards_list, success_rates_list = mean_eval(
                num_episodes=self.args.n_episodes,
                checkpoint_id=i,
                model=model,
                env=self.env,
                v=True,
                continuous=self.args.continuous,
                plots_dir=self.args.plots_dir)
            rewards_mean = np.mean(rewards_list)
            success_rates_mean = np.mean(success_rates_list)
            logging.debug(
                f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, Average Success Rate = {success_rates_mean}"
            )
            rewards[j] = rewards_mean
            s_rates[j] = success_rates_mean
            j += 1
        return rewards, s_rates
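A short usage sketch for the method above; the owning instance (here called evaluator) and the checkpoint range are illustrative assumptions:

checkpoints = range(10, 110, 10)
rewards, s_rates = evaluator.f_checkpoints_range_2_mean_performance(checkpoints)
best_ckpt = checkpoints[int(np.argmax(s_rates))]
logging.info(f"Best checkpoint by mean success rate: {best_ckpt}")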
Example #18
def launchAgent(model_name: str):
    """
    :param model_name: Which model to run. Must be HER, DDPG, PPO2, or any other value (falls back to DQN).
                        Currently intended to be set to PPO2.
    :return: The model after running 1000 training cycles.
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    if model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    if model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",  # policy
            env=env,  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i), env)
            if model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i), env)
            if model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i), env)

        # print('model learn start')
        model.learn(total_timesteps=12500)  # minimum value at which FPS stays above 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        del model
        # print('model save end')

    return model
Example #19
def load_env(model_name='flexible_load_first', seed=9):
    #flexible_load_first, overnight, larger_margin_cost, discount_06, flex50
    location = 'C:\\Users\\vegar\\Dropbox\\Master\\thesis.git\\scripts\\models\\'
    params_name = model_name + '_params.p'
    model = DDPG.load(location + model_name)
    env = ActiveEnv(seed=seed)
    with open(location + params_name, 'rb') as f:
        params = pickle.load(f)

    env.set_parameters(params)
    model.set_env(env)
    return model, env
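A hedged rollout sketch using the pair returned by load_env; the deterministic flag and the single-episode loop are assumptions, not part of the original script:

model, env = load_env(model_name='flexible_load_first', seed=9)
obs = env.reset()
done = False
while not done:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)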
Example #20
def DDPGAgent(multi_stock_env, num_episodes):
    models_folder = 'saved_models'
    rewards_folder = 'saved_rewards'

    env = DummyVecEnv([lambda: multi_stock_env])
    
    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    
    # Hyper parameters
    GAMMA = 0.99
    TAU = 0.001
    BATCH_SIZE = 16
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_LEARNING_RATE = 0.001
    BUFFER_SIZE = 500

    print("\nRunning DDPG Agent...\n")
    model = DDPG(MlpPolicy, env,
                 gamma=GAMMA, tau=TAU, batch_size=BATCH_SIZE,
                 actor_lr=ACTOR_LEARNING_RATE, critic_lr=CRITIC_LEARNING_RATE,
                 buffer_size=BUFFER_SIZE, verbose=1,
                 param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=50000)
    model.save(f'{models_folder}/rl/ddpg.h5')

    del model
    
    model = DDPG.load(f'{models_folder}/rl/ddpg.h5')
    obs = env.reset()
    portfolio_value = []

    for e in range(num_episodes):
        action, _states = model.predict(obs)
        next_state, reward, done, info = env.step(action)
        print(f"episode: {e + 1}/{num_episodes}, episode end value: {info[0]['cur_val']:.2f}")
        portfolio_value.append(round(info[0]['cur_val'], 3))

    # save portfolio value for each episode
    np.save(f'{rewards_folder}/rl/ddpg.npy', portfolio_value)

    print("\nDDPG Agent run complete and saved!")

    a = np.load(f'./saved_rewards/rl/ddpg.npy')

    print(f"\nCumulative Portfolio Value Average reward: {a.mean():.2f}, Min: {a.min():.2f}, Max: {a.max():.2f}")
    plt.plot(a)
    plt.title("Portfolio Value Per Episode (DDPG)")
    plt.ylabel("Portfolio Value")
    plt.xlabel("Episodes")
    plt.show()
Example #21
def main(output_folder_path: Path):
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLLocalPlannerAgent,
        "max_collision": 5,
    }

    env = gym.make('roar-local-planner-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "env": env,
        "n_cpu_tf_sess": None,
        "buffer_size": 1000,
        "nb_train_steps": 50,
        "nb_rollout_steps": 100,
        # "nb_eval_steps": 50,
        "batch_size": 32,
    }
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        model = DDPG(CnnPolicy, **model_params)
    else:
        model = DDPG.load(latest_model_path, **model_params)
    tensorboard_dir = (output_folder_path / "tensorboard")
    ckpt_dir = (output_folder_path / "checkpoints")
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    model.tensorboard_log = tensorboard_dir.as_posix()
    model.render = True
    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"local_planner_ddpg_{datetime.now()}")
Example #22
def main(argv):
    # -p
    fixed = False
    # -j
    numControlledJoints = 12
    # -n
    policy_name = "models/DDPG/DDPG_16batch_false-norm-ret-ob_12Actions"

    # COMMAND LINE PARAMS MANAGEMENT:
    try:
        opts, args = getopt.getopt(argv,"hj:p:n:",["j=","p=","n="])
    except getopt.GetoptError:
        print('test.py -j <numJoints> -p <fixedPoseObject> -n <policy_name>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('------------------ Default values:')
            print('test.py  -j <numJoints: 12> -p <fixedPoseObject: false> -n <policy_name:"DDPG_16batch_false-norm-ret-ob_12Actions"> ')
            print('------------------')
            return 0
        elif opt in ("-j", "--j"):
            if int(arg) > 18:
                print("Check dimension state")
                return 0
            else:
                numControlledJoints = int(arg)
        elif opt in ("-p", "--p"):
            fixed = bool(arg)
        elif opt in ("-n","--n"):
            policy_name = str(arg)


    print(colored("-----Number Joints Controlled:","red"))
    print(colored(numControlledJoints,"red"))
    print(colored("-----Object Position Fixed:","red"))
    print(colored(fixed,"red"))
    print(colored("-----Policy Name:","red"))
    print(colored(policy_name,"red"))
    print(colored("------","red"))
    print(colored("Launch the script with -h for further info","red"))

    model = DDPG.load(policy_name)

    bioenv = bioEnv()
    obs = bioenv.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = bioenv.step(action)
Example #23
def load_env(model_name='flexible_load_first', seed=9):
    # flexible_load_first, overnight, larger_margin_cost, discount_06, flex50
    model_path = os.path.join(MODEL_PATH,model_name)
    params_name = model_name +'_params.p'
    param_path = os.path.join(MODEL_PATH,params_name)
    try:
        model = DDPG.load(model_path)
    except Exception:
        model = PPO1.load(model_path)
    env = ActiveEnv(seed=seed)
    with open(param_path,'rb') as f:
        params = pickle.load(f)

    env.set_parameters(params)
    model.set_env(env)
    return model, env
Example #24
def train_stable_baselines(submodule, flags):
    """Train policies using the PPO algorithm in stable-baselines."""
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines import DDPG
    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    print('Beginning training.')
    model = run_model_stablebaseline(
        flow_params, flags.num_cpus, flags.rollout_size, flags.num_steps)

    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile,
                  cls=FlowParamsEncoder, sort_keys=True, indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    model = DDPG.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = True
    env = env_constructor(params=flow_params, version=0)()
    
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
Example #25
def main(output_folder_path: Path):
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLLocalPlannerAgent,
        "max_collision": 5,
    }

    env = gym.make('roar-local-planner-v1', params=params)
    env.reset()

    tensorboard_dir, ckpt_dir = prep_dir(output_folder_path)
    model_params: dict = {
        "verbose": 1,
        "render": True,
        "env": env,
        "n_cpu_tf_sess": 2,
        "buffer_size": 10,
        "random_exploration": 0.1,
        "tensorboard_log": tensorboard_dir.as_posix(),
    }
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        model = DDPG(
            LnMlpPolicy,
            **model_params)  # full tensorboard log can take up space quickly
    else:
        model = DDPG.load(latest_model_path, **model_params)

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"local_planner_v1_ddpg_{datetime.now()}")
Example #26
def load_model(eval_env):
    model = DDPG.load('./ddpg_robot_env', env=eval_env)
    count = 0
    step_num_arr = []
    for _ in range(20):
        number_steps = 0
        obs = eval_env.reset()
        for _ in range(400):
            action, _ = model.predict(obs)
            obs, reward, done, _ = eval_env.step(action)
            number_steps += 1
            if done:
                step_num_arr.append(number_steps)
                count += 1
                print("----------------It reached terminal state -------------------")
                break
    print("Robot reached the goal position successfully ", count, " times and the Average step count was ",
          np.average(np.array(step_num_arr)))
Example #27
def view_ddpg():
    env = gimbal(5, 500)
    model = DDPG.load("./models/baseline_ddpg_t2")
    success_rate = 0
    reward_avg = 0
    for episodes in range(50):
        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            if dones:
                if rewards > -100:
                    success_rate += 1
                    reward_avg += rewards
                break
    print("Success rate: ", success_rate, "Avg rewards: ",
          reward_avg / success_rate)
Example #28
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v1')

    model = DDPG.load("pickbot_model_ddpg_continuous_2019-03-11 12:45:38")

    while True:
        obs, done = env.reset(), False
        action, _states = model.predict(obs)
        episode_rew = 0
        while not done:
            obs, rewards, done, info = env.step(action)
            episode_rew += rewards
            print("Episode reward", episode_rew)
Example #29
def setup(model_params, output_folder_path):
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        print("Creating model...")
        model = DDPG(CnnPolicy, **model_params)
    else:
        print("Loading model...")
        model = DDPG.load(latest_model_path, **model_params)
    tensorboard_dir = (output_folder_path / "tensorboard")
    ckpt_dir = (output_folder_path / "checkpoints")
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_callback = CheckpointCallback(save_freq=200,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    # event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    logging_callback = LoggingCallback(model=model, verbose=1)
    callbacks = CallbackList([checkpoint_callback, logging_callback])
    return model, callbacks
Example #30
def run_baseline_ddpg(env_name, train=True):
    import numpy as np
    # from stable_baselines.ddpg.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    env = gym.make(env_name)
    env = DummyVecEnv([lambda: env])

    if train:
        # mlp
        from stable_baselines.ddpg.policies import FeedForwardPolicy
        class CustomPolicy(FeedForwardPolicy):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   layers=[64, 64, 64],
                                                   layer_norm=True,
                                                   feature_extraction="mlp")

        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions)+0.15, sigma=0.3 * np.ones(n_actions))
        model = DDPG(CustomPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise,
                     tau=0.01, observation_range=(env.observation_space.low, env.observation_space.high),
                     critic_l2_reg=0, actor_lr=1e-3, critic_lr=1e-3, memory_limit=100000)
        model.learn(total_timesteps=int(1e5))
        model.save("checkpoints/ddpg_" + env_name)

    else:
        model = DDPG.load("checkpoints/ddpg_" + env_name)

        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            print("state: ", obs, " reward: ", rewards, " done: ", dones, "info: ", info)

    del model  # remove to demonstrate saving and loading
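Finally, a hedged driver for the helper above; 'Pendulum-v0' merely stands in for whatever continuous-action environment the project actually targets:

if __name__ == "__main__":
    run_baseline_ddpg("Pendulum-v0", train=True)    # train and save checkpoints/ddpg_Pendulum-v0
    run_baseline_ddpg("Pendulum-v0", train=False)   # reload the checkpoint and render rollouts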