Example #1
        'return_info': True,
        'train': False
    }
    eval_env = make_vec_env("L5-CLE-v0",
                            env_kwargs=eval_env_kwargs,
                            n_envs=args.n_eval_envs,
                            vec_env_cls=SubprocVecEnv,
                            vec_env_kwargs={"start_method": "fork"})

    # callbacks
    # Note: When using multiple environments, each call to ``env.step()``
    # will effectively correspond to ``n_envs`` steps.
    # To account for that, you can use ``save_freq = max(save_freq // n_envs, 1)``
    # Save Model Periodically
    checkpoint_callback = CheckpointCallback(save_freq=(args.save_freq //
                                                        args.n_envs),
                                             save_path=args.save_path,
                                             name_prefix=args.output)

    # Eval Model Periodically
    eval_callback = L5KitEvalCallback(
        eval_env,
        eval_freq=(args.eval_freq // args.n_envs),
        n_eval_episodes=args.n_eval_episodes,
        n_eval_envs=args.n_eval_envs,
        enable_scene_type_aggregation=args.enable_scene_type_aggregation,
        scene_id_to_type_path=args.scene_id_to_type_path)
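    # Both save_freq and eval_freq above are divided by n_envs because callbacks count
    # VecEnv steps, and each VecEnv step advances n_envs environment timesteps.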

    # train
    model.learn(args.n_steps, callback=[checkpoint_callback, eval_callback])
Example #2
# Multi-worker training is also possible here (n_envs=4 => 4 parallel environments) as long as the algorithm supports multiprocessing; this example uses n_envs=1
env = make_atari_env(atari_env_name, n_envs=1, seed=0)
# Frame-stacking with 4 frames. With 1 frame the algorithm knows the position of objects, with 2 frames their velocity, with 3 their acceleration, and with 4 the jerk
env = VecFrameStack(env, n_stack=4)
# Test environment must be unique
test_env = make_atari_env(atari_env_name, n_envs=1, seed=0)
# Frame-stacking with 4 frames
test_env = VecFrameStack(test_env, n_stack=4)
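# VecFrameStack concatenates the last n_stack observations along the channel axis, so the
# 84x84x1 greyscale Atari frames produced by make_atari_env become 84x84x4 observations.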

model_name='ppo-MlpPolicy'
time_stamp=datetime.datetime.now().strftime("-%Y%m%d-%H%M%S")
model_log= LOG_DIR + model_name + time_stamp

ppo_model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=model_log)
max_steps=10000
ppo_model.learn(total_timesteps=max_steps)

# %tensorboard --logdir {LOG_DIR}

from stable_baselines3.common.callbacks import StopTrainingOnMaxEpisodes
# Note: StopTrainingOnMaxEpisodes is a training callback; instantiate it and pass it to
# model.learn(callback=...) rather than to evaluate_policy() (max_episodes below is illustrative)
callback = StopTrainingOnMaxEpisodes(max_episodes=10, verbose=1)
mean_reward, std_reward = evaluate_policy(ppo_model, test_env, n_eval_episodes=10)
print(f"Eval reward: {mean_reward} (+/-{std_reward})")

record_video(test_env, ppo_model, video_length=5000, prefix='ppo_BerzerkDeterministic-v4')

show_videos(video_path = video_folder, prefix='ppo')

ppo_model.save("ppo_BerzerkDeterministic-v4")
obs = env.reset()
Example #3
    run_id = str(uuid.uuid4())  # ALL running environments must share this
    print(f"RUN ID: {run_id}")

    # to pass launch args, add to env_kwargs: 'launch_args': ['render:=false', 'plot_log:=true']
    env = make_vec_env(RocketLeagueInterface,
                       env_kwargs={'run_id': run_id},
                       n_envs=24,
                       vec_env_cls=SubprocVecEnv)

    model = PPO("MlpPolicy", env)

    # log training progress as CSV
    log_dir = expanduser(f'~/catkin_ws/data/rocket_league/{run_id}')
    logger = configure(log_dir, ["stdout", "csv", "log"])
    model.set_logger(logger)

    # log model weights
    freq = 20833  # save 20 times
    # freq = steps / (n_saves * n_envs)
    callback = CheckpointCallback(save_freq=freq, save_path=log_dir)

    # run training
    steps = 240000000  # 240M (10M sequential)
    print(f"training on {steps} steps")
    model.learn(total_timesteps=steps, callback=callback)

    # save final weights
    print("done training")
    model.save(log_dir + "/final_weights")
    env.close()  # this must be done to clean up other processes
Example #4
from callback import SaveOnBestTrainingRewardCallback
from env.env import CitadelsEnv
import os

log_dir = "/Users/daniel/repos/CitadelsAI/logs"
os.makedirs(log_dir, exist_ok=True)

env = CitadelsEnv()
env = Monitor(env, log_dir)

callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)

# Learn
# model = A2C('MlpPolicy', env, verbose=1)
# model = PPO('MlpPolicy', env, verbose=1)  # use this to start training from scratch
model = PPO.load("/Users/daniel/repos/CitadelsAI/logs/best_model.zip", env=env)  # continue from the saved best model
model.learn(total_timesteps=100000, callback=callback)

# mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
# print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

# Play
# print('-Play-')
# obs = env.reset()
# for i in range(100):
#     action, _state = model.predict(obs, deterministic=True)
#     obs, reward, done, info = env.step(action)
#     # env.render()
#     if done:
#         obs = env.reset()
Example #5
                continue_training_model_folder,
                'vec_normalize_' + continue_training_model_filename + '.pkl')

            print(
                f"Continual training on model located at {continue_training_model_path}"
            )

            # Load normalized env
            env = VecNormalize.load(continue_training_vecnormalize_path, env)
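            # VecNormalize.load restores the saved running mean/std of observations (and returns),
            # so training resumes with the same normalization statistics as the previous run.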

            # Load model
            model = PPO.load(continue_training_model_path, env=env)

        # Training
        model.learn(total_timesteps=training_timesteps,
                    tb_log_name=tb_log_name,
                    callback=checkpoint_callback,
                    reset_num_timesteps=True)

        # Save trained model
        model.save(save_model_path)
        env.save(save_vecnormalize_path)

    else:
        # Create evaluation environment
        env_options['has_renderer'] = True
        register_gripper(UltrasoundProbeGripper)
        env_gym = GymWrapper(suite.make(env_id, **env_options))
        env = DummyVecEnv([lambda: env_gym])

        # Load normalized env
        env = VecNormalize.load(load_vecnormalize_path, env)
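        # For evaluation, the statistics should usually be frozen, e.g.:
        #   env.training = False
        #   env.norm_reward = False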
Example #6
from gym.wrappers import FrameStack, FlattenObservation
from stable_baselines3 import PPO

from top_view_rl_car.sensor_environment import SensorEnvironment
from top_view_rl_car.sensor_environment import config

# Create environment
env = SensorEnvironment(config)
env = FlattenObservation(env)
env = FrameStack(env, 4)
env = FlattenObservation(env)
# Instantiate the agent
model = PPO('MlpPolicy', env, verbose=1, device="cuda")
# Train the agent
model.learn(total_timesteps=int(1e5))
Example #7
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

programing_type = int(sys.argv[1])
environment_name = 'ChromeCrossyRoad-v0'

# Start to train the agent
if programing_type == 0:
    env = gym.make(environment_name)
    model = PPO("MlpPolicy",
                env,
                learning_rate=0.0001,
                gamma=0.7,
                batch_size=1024,
                verbose=1,
                tensorboard_log="./log/ppo_crossy_road_tensorboard/")
    model.learn(total_timesteps=30000)
    model.save("../model/ppo")
    env.close()

# Continue to train
elif programing_type == 1:
    myenv = gym.make(environment_name)
    env = DummyVecEnv([lambda: myenv])
    model = PPO.load('../model/ppo', env=env)
    model.set_env(env)
    model.learn(total_timesteps=20000,
                callback=None,
                reset_num_timesteps=False)
    model.save("../model/ppo")
    env.close()
Example #8
else:
    model = PPO("MlpPolicy",
                env_name,
                learning_rate=1e-3,
                policy_kwargs=policy_kwargs,
                tensorboard_log="{}/tensorboard".format(results_root),
                verbose=1)

# Train the agent
# Evaluate the model every 1000 steps on 5 test episodes
# and save the evaluation to the "logs/" folder
# total_timesteps: number of agent-environment interactions (one step == one transition);
# each rollout collects n_steps (2048 by default) transitions, which may span several episodes;
# those n_steps transitions are then used for training (1 epoch == one pass over n_steps transitions)
model.learn(total_timesteps=100000,
            eval_freq=1000,
            n_eval_episodes=5,
            eval_log_path="./logs/")
# save the model
model.save("{}/model".format(results_root))

# get the policy
policy = model.policy
# Retrieve the environment
env = model.get_env()
# Evaluate the policy
mean_reward, std_reward = evaluate_policy(policy,
                                          env,
                                          n_eval_episodes=10,
                                          deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
Example #9
    reward_lb = -0.012 * 4
    reward_ub = 0.012 * 2

    indicators = [
        MidPriceDeltaSign(3),
        Imbalance(),
        NormalizedPosition(position_limit)
    ]

    env = AbsoluteExchange(files=['AAPL_20170201'],
                           indicators=indicators,
                           reward_lb=reward_lb,
                           reward_ub=reward_ub,
                           start_time=34230000000000,
                           end_time=57540000000000,
                           order_size=order_size,
                           position_limit=position_limit,
                           liquidation_ratio=liquidation_ratio)

    print('Checking environment')
    check_env(env)
    print('Done checking environment')

    # env = make_vec_env(lambda: env, n_envs=1)
    model = PPO('MlpPolicy', env, verbose=False)
    print('\nBegin training')
    for iteration in range(100):
        print(f'Iteration: {iteration}')
        model.learn(3000)
        evaluate(env, model)
Example #10
def test_rl():
    import gym
    import datetime as dt
    import matplotlib.pyplot as plt

    # from stable_baselines.common.policies import MlpPolicy, CnnPolicy, MlpLstmPolicy, ActorCriticPolicy, LstmPolicy
    # from stable_baselines.common.vec_env import DummyVecEnv
    # from stable_baselines import PPO2, PPO1, A2C, DQN, TD3, SAC

    # from stable_baselines3.common.policies import MlpPolicy
    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import DummyVecEnv
    from stable_baselines3.common.evaluation import evaluate_policy

    from sklearn import preprocessing

    import pandas as pd

    from lutils.stock import LTdxHq

    ltdxhq = LTdxHq()
    code = '600519' # 000032 300142 603636 600519
    df = ltdxhq.get_k_data_1min(code, end='2021-09-02') # 000032 300142 603636 600519
    # df = ltdxhq.get_k_data_daily('603636', end='2019-01-01') # 000032 300142 603636 600519
    df = StockDataFrame(df.rename(columns={'vol': 'volume'}))

    # min_max_scaler = preprocessing.MinMaxScaler()
    # df = pd.DataFrame(min_max_scaler.fit_transform(df.drop(columns=['date', 'code'])))
    # df.columns = ['open', 'close', 'high', 'low', 'volume', 'amount']

    df_eval = ltdxhq.get_k_data_1min(code, start='2021-09-01')
    df_eval = StockDataFrame(df_eval.rename(columns={'vol': 'volume'}))

    ltdxhq.close()
    # df = ltdxhq.get_k_data_5min('603636')
    # df = ltdxhq.get_k_data_daily('603636')

    # df1 = df[:-240]
    # df2 = df[-240:]
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: LStockDailyEnv(df)])
    # model = PPO2(MlpPolicy, env, verbose=1) # , tensorboard_log='log')
    model = PPO('MlpPolicy', env, verbose=1) # , tensorboard_log='log')
    model.learn(100000)
    # model = PPO1(LstmPolicy, env, verbose=1)
    # model.learn(total_timesteps=1000)



    # env.set_attr('df', df2)
    # obs = env.reset()

    # rewards = []
    # actions = []
    # net_worths = []
    # # for i in range(220):
    # for i in range(NEXT_OBSERVATION_SIZE, df2.shape[0]):
    #     # actual_obs = observation(df2, i)
    #     # action, _states = model.predict(actual_obs)
    #     # action = [action]
    #     action, _states = model.predict(obs)
    #     obs, reward, done, info = env.step(action)
    #     rewards.append(reward)
    #     actions.append(action[0][0])
    #     net_worths.append(info[0]['net_worth'])
    #     # print(info[0]['current_step'])
    #     env.render()

    # mean_reward, _  = evaluate_policy(model, eval_env, n_eval_episodes=1, render=True) # EVAL_EPS

    # print(mean_reward)

    model.save('ppo_stock')
    # model = PPO.load('ppo_stock')

    eval_env = DummyVecEnv([lambda: LStockDailyEnv(df_eval)])
    obs = eval_env.reset()

    net_worths = []
    actions = []
    done, state = False, None
    while not done:
        action, state = model.predict(obs, state=state, deterministic=True)
        obs, reward, done, _info = eval_env.step(action)
        net_worths.append(_info[0]['net_worth'])
        # if is_recurrent:
        #     obs[0, :] = new_obs
        # else:
        #     obs = new_obs

        # if action[0] < Actions.Buy: # Buy
        #     actions.append(1)
        # elif action[0] < Actions.Sell: # Sell
        #     actions.append(2)
        # else:
        #     actions.append(0)
        actions.append(action[0])
        eval_env.render()

    plt.plot(net_worths)
    plt.plot(actions)
    plt.show()
Example #11
        eval_env = VecTransposeImage(eval_env)

    #### Train the model #######################################
    # checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=filename+'-logs/', name_prefix='rl_model')
    callback_on_best = StopTrainingOnRewardThreshold(
        reward_threshold=EPISODE_REWARD_THRESHOLD, verbose=1)
    eval_callback = EvalCallback(eval_env,
                                 callback_on_new_best=callback_on_best,
                                 verbose=1,
                                 best_model_save_path=filename + '/',
                                 log_path=filename + '/',
                                 eval_freq=int(2000 / ARGS.cpu),
                                 deterministic=True,
                                 render=False)
    model.learn(
        total_timesteps=35000,  #int(1e12),
        callback=eval_callback,
        log_interval=100,
    )

    #### Save the model ########################################
    model.save(filename + '/success_model.zip')
    print(filename)

    #### Print training progression ############################
    with np.load(filename + '/evaluations.npz') as data:
        for j in range(data['timesteps'].shape[0]):
            print(
                str(data['timesteps'][j]) + "," +
                str(data['results'][j][0][0]))
Example #12
# Control Variables
episodes = 15000
test_ratio = 0.25

train_episodes = ceil(episodes * (1 - test_ratio))
test_episodes = floor(episodes * test_ratio)

# Init Training Environment
train_env = load_environment()

# Training Stage
print("Start Training Stage")

rl = PPO(MlpPolicy, train_env, verbose=1, n_steps=10)
train_env.reset()
rl.learn(total_timesteps=episodes)
rl.save("breakout_model")
train_env.close()
print("Closed")

copyfile('./data.csv', './data_train.csv')

multi_output = GamePredictor('rf', single_output=False)
single_output = GamePredictor('rf', single_output=True)

dataset = read_dataset("data_train.csv")
#plot_satisfactions("train", dataset)
filtered_dataset = filter_satisfaction(dataset)

multi_output.train(filtered_dataset)
single_output.train(filtered_dataset)
Example #13
def main():
    # nn = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
    #                          torch.nn.Linear(64, 2))

    os.makedirs(_log_dir, exist_ok=True)

    DoTraining = True
    StartFresh = True
    num_cpu = 8
    if (DoTraining):

        # This doesn't work but it might have something to do with how the environment is written
        # num_cpu = 1
        # env = make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir) # make_vec_env contains Monitor

        # Create the callback: check every 1000 steps
        # callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=_log_dir)

        if (StartFresh):
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize(env,
                               norm_obs=True,
                               norm_reward=True,
                               clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy',
                        env,
                        policy_kwargs=policy_kwargs,
                        verbose=2,
                        tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize.load(_stats_path, env)
            env.reset()

            model = PPO.load(
                'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                tensorboard_log=tb_log)
            model.set_env(env)

        eval_env = gym.make(env_id)
        # print('!!!!Checking Environment!!!!')
        # print(check_env(eval_env))

        mean_reward, std_reward = evaluate_policy(model,
                                                  eval_env,
                                                  n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
        for _ in range(50):
            model.learn(total_timesteps=100000,
                        tb_log_name=env_id,
                        reset_num_timesteps=False)  #, callback=callback
            mean_reward, std_reward = evaluate_policy(model,
                                                      eval_env,
                                                      n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            model.save(_log_dir + 'PPO_{}'.format(model.num_timesteps) +
                       '.mdl')
            env.save(_log_dir +
                     'vec_normalize_{}'.format(model.num_timesteps) + '.pkl')

    if (not DoTraining):
        # eval_env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        # eval_env = VecVideoRecorder(eval_env, video_folder='videos/',
        #                       record_video_trigger=lambda step: step == 0, video_length=500,
        #                       name_prefix='test')
        # eval_env.training = False
        # eval_env.norm_reward = False
        # eval_env.reset()

        eval_env = DummyVecEnv(
            [make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        # eval_env = gym.make(env_id)
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl',
                                     eval_env)

        model = PPO.load(
            'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
            tensorboard_log=tb_log)
        model.set_env(eval_env)
        # record_video(env_id, model, video_length=500, prefix='ppo_'+env_id)
        # Start the video at step=0 and record 500 steps
        # eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
        #                       record_video_trigger=lambda step: step == 0, video_length=500,
        #                       name_prefix='')

        obs = eval_env.reset()
        # for i in range(500):
        #     action, _ = model.predict(obs)
        #     obs, _, _, _ = eval_env.step(action)
        # eval_env.close()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            # eval_env.render()
            if done.any():
                # obs = eval_env.reset()
                # time.sleep(1/30)
                eval_env.close()
                break
Example #14
def train(env_function,
          name="model",
          n_processes: int = 6,
          seed: int = 0,
          load_checkpoint: Optional[str] = None,
          from_index=0,
          to_index=12,
          steps_per_episode=125 * 1000):
    """
    Trains a model with a given environment

    :param env_function: Function that creates a gym.Env
    :param name: name used when saving the model
    :param n_processes: number of processes used for training
    :param seed: base random seed
    :param load_checkpoint: if None, create a new model; otherwise load the model from this file
    :param steps_per_episode: number of steps passed to each model.learn() call
    :param from_index: index of the first episode (useful when resuming training after episode 0)
    :param to_index: index of the last episode
    :return:
    """
    def make_env(rank: int):
        """
        Utility function for multiprocessed env.

        :param rank: index of the subprocess (needed to update seed)
        """
        def _init():
            env = env_function()
            # Important: use a different seed for each environment
            env.seed(seed + rank)
            return env

        return _init

    # Create the vectorized environment
    env_vector = SubprocVecEnv([make_env(i) for i in range(n_processes)])
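    # Each subprocess is seeded with seed + rank, so the workers start from different random
    # states and collect decorrelated trajectories.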

    # Create model
    if load_checkpoint is None:
        model = PPO(
            "MlpPolicy",
            env_vector,
            tensorboard_log="./ppo_trafficgym_tensorboard/",
            verbose=2,
            learning_rate=1e-2,
            # gamma=0.95,
            batch_size=256,
            policy_kwargs=dict(net_arch=[64, 64]),
        )
    else:
        model = PPO.load(load_checkpoint)

    # Evaluate before training
    env = Monitor(env_function())
    print("Evaluating...")
    evaluation = evaluate_policy(model, env)
    print("Eval1:", evaluation)

    # Actual training
    t1 = time.time()
    for i in range(from_index, to_index + 1):
        try:
            model.learn(steps_per_episode)
            print(f"Save model {i}")
            model.save(f"{name}{i:02d}.stable_baselines")
        except KeyboardInterrupt:
            print("Interrupted by KeyBoard")
            break
    t2 = time.time()
    print(f"Learning took {t2 - t1} seconds")

    # Evaluate after training
    print("Evaluating...")
    evaluation = evaluate_policy(model, env)
    print("Eval2:", evaluation)
Example #15
import gym

from stable_baselines3 import PPO
from servo_env_sim import Servo_Env_Sim

#env = gym.make('MountainCarContinuous-v0')
env = Servo_Env_Sim()

model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps= 50_000)

model.save("Model_1")

# obs = env.reset()
# for i in range(1000):
#     action, _state = model.predict(obs, deterministic=False)
#     obs, reward, done, info = env.step(action)
#     if done:
#       obs = env.reset()
Example #16
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 8  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=2500)

    model.save("./weights/ppo_cartpole" + str(n))
    del model  # remove to demonstrate saving and loading
    model = PPO.load("./weights/ppo_cartpole" + str(n))

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #17
                body_info = test_body//100
            else:
                body_info = args.test_as_class
        else:
            body_info = 0
        eval_env = DummyVecEnv([utils.make_env(rank=0, seed=utils.seed+1, wrapper=default_wrapper, render=False, robot_body=test_body, body_info=body_info)])
        eval_env = VecNormalize(eval_env, norm_reward=False, **normalize_kwargs)
        eval_callback = EvalCallback_with_prefix(
            eval_env=eval_env,
            prefix=f"{test_body}",
            n_eval_episodes=3,
            eval_freq=1e3, # will implicitly be multiplied by the number of training envs
            deterministic=True,
        )
        all_callbacks.append(eval_callback)

    if args.with_checkpoint:
        checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=f'{folder}/checkpoints/', name_prefix=args.train_bodies)
        save_vec_callback = SaveVecNormalizeCallback(save_freq=1000, save_path=f"{folder}/checkpoints/", name_prefix=args.train_bodies)
        all_callbacks.append(checkpoint_callback)
        all_callbacks.append(save_vec_callback)

    model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=f"{folder}/tb/{save_filename}-s{utils.seed}", seed=utils.seed, **hyperparams)

    model.learn(total_timesteps=total_timesteps, callback=all_callbacks)
    model.save(f"{folder}/{save_filename}")
    # Important: save the running average, for testing the agent we need that normalization
    model.get_vec_normalize_env().save(f"{folder}/{save_filename}-vecnormalize.pkl")

    env.close()
Example #18
def main():
    # multiprocess environment
    n_cpu = 8
    env = SubprocVecEnv(
        [lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    env = VecNormalize(env,
                       norm_obs=True,
                       clip_obs=2.0,
                       norm_reward=False,
                       training=True)

    # n_cpu = 1
    # env = gym.make('DYROSTocabi-v1')
    # env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    model = PPO('MlpPolicy',
                env,
                verbose=1,
                n_steps=int(4096 / n_cpu),
                wandb_use=True)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-02-27 02:20:20.015346"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)
    # Enjoy the trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
Example #19
    ####
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(env.N_ACTIONS),
                                                sigma=0.1 *
                                                np.ones(env.N_ACTIONS),
                                                dt=0.005)

    #### Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    #### Train the model ###############################################################################
    model = PPO(CustomPolicy, env, verbose=1, batch_size=64)

    for i in range(step_iters):  # run for step_iters * training_timesteps

        model.learn(total_timesteps=training_timesteps)

        model.save("./models/ppo" + str((i + 1) * training_timesteps))
        # model.save_replay_buffer("./experiences/ppo_experience"+str((i+1)*training_timesteps))

        #### Show (and record a video of) the model's performance ##########################################
        env_test = RLTetherAviary(gui=False, record=True)
        obs = env_test.reset()
        start = time.time()
        for j in range(10 * env_test.SIM_FREQ):  # use a separate loop variable from the outer loop
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env_test.step(action)
            if done: break
        env_test.close()

    env.close()
Example #20
        )
        env = make_vec_env(make_configure_env,
                           n_envs=n_cpu,
                           seed=0,
                           vec_env_cls=SubprocVecEnv,
                           env_kwargs=env_kwargs)
        model = PPO("MlpPolicy",
                    env,
                    n_steps=512 // n_cpu,
                    batch_size=64,
                    learning_rate=2e-3,
                    policy_kwargs=policy_kwargs,
                    verbose=2,
                    tensorboard_log="./highway_attention_ppo/")
        # Train the agent
        model.learn(total_timesteps=200 * 1000)
        # Save the agent
        model.save("ppo-highway")

    model = PPO.load("ppo-highwayv0")
    env = make_configure_env(**env_kwargs)

    evaluate(env, model)
    for _ in range(5):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
Example #21
zip = "data/box_flipup_ppo_{observations}.zip"
log = "/tmp/ppo_box_flipup/"

if __name__ == '__main__':
    num_cpu = 48 if not args.test else 2
    env = make_vec_env("BoxFlipUp-v0",
                       n_envs=num_cpu,
                       seed=0,
                       vec_env_cls=SubprocVecEnv,
                       env_kwargs={
                           'observations': observations,
                           'time_limit': time_limit,
                       })
    #    env = "BoxFlipUp-v0"

    if args.test:
        model = PPO('MlpPolicy', env, n_steps=4, n_epochs=2, batch_size=8)
    elif os.path.exists(zip):
        model = PPO.load(zip, env, verbose=1, tensorboard_log=log)
    else:
        model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log)

    new_log = True
    while True:
        model.learn(total_timesteps=100000 if not args.test else 4,
                    reset_num_timesteps=new_log)
        if args.test:
            break
        model.save(zip)
        new_log = False
Example #22
env = ss.pettingzoo_env_to_vec_env_v0(env)
env = ss.concat_vec_envs_v0(env, n_envs, num_cpus=1, base_class='stable_baselines3')
env = VecMonitor(env)

eval_env = base_env.copy().parallel_env()
eval_env = ss.frame_stack_v1(eval_env, 3)
eval_env = ss.pettingzoo_env_to_vec_env_v0(eval_env)
eval_env = ss.concat_vec_envs_v0(eval_env, 1, num_cpus=1, base_class='stable_baselines3')
eval_env = VecMonitor(eval_env)

eval_freq = int(n_timesteps / n_evaluations)
eval_freq = max(eval_freq // (n_envs*n_agents), 1)
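# After pettingzoo_env_to_vec_env_v0 + concat_vec_envs_v0, every agent of every environment copy
# occupies one vec-env slot, so each callback call advances n_envs * n_agents transitions;
# dividing eval_freq by that (clamped to at least 1) keeps roughly n_evaluations per run.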

model = PPO("MlpPolicy", env, verbose=3, gamma=0.95, n_steps=256, ent_coef=0.0905168, learning_rate=0.00062211, vf_coef=0.042202, max_grad_norm=0.9, gae_lambda=0.99, n_epochs=5, clip_range=0.3, batch_size=256)
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=eval_freq, deterministic=True, render=False)
model.learn(total_timesteps=n_timesteps, callback=eval_callback)

model = PPO.load("./logs/best_model")

mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)

print(mean_reward)
print(std_reward)

render_env = base_env.copy().parallel_env()
render_env = ss.color_reduction_v0(render_env, mode='B')
render_env = ss.resize_v0(render_env, x_size=84, y_size=84)
render_env = ss.frame_stack_v1(render_env, 3)

obs_list = []
i = 0
Example #23
                verbose=1,
                tensorboard_log=str(common.output_data_folder /
                                    f"tensorboard" / saved_model_filename),
                seed=common.seed,
                **hyperparams)

    if len(args.initialize_weights_from) > 0:
        try:
            load_model = PPO.load(args.initialize_weights_from)
            load_weights = load_model.policy.state_dict()
            model.policy.load_state_dict(load_weights)
            print(f"Weights loaded from {args.initialize_weights_from}")
        except Exception:
            print("Initialize weights error.")
            raise

    try:
        model.learn(total_timesteps=args.train_steps, callback=all_callbacks)
    except KeyboardInterrupt:
        pass
    model.save(str(common.output_data_folder / "models" /
                   saved_model_filename))

    if args.vec_normalize:
        # Important: save the running average, for testing the agent we need that normalization
        model.get_vec_normalize_env().save(
            str(common.output_data_folder / "models" /
                f"{saved_model_filename}.vnorm.pkl"))

    venv.close()
Example #24
policy = MlpPolicy

model = PPO(policy, env, 
            learning_rate=2.5e-4, n_steps=128, batch_size=32, n_epochs=3, clip_range=0.1, ent_coef=.01, vf_coef=1, 
            #policy_kwargs={'net_arch': [128, 64, 32]},
            verbose=1)

old_weights_filename = 'ppo-torch-mbool-xstep1-death-perframe+pixdiff-newarch'
new_weights_filename = 'ppo-torch-mbool-xstep1-death-perframe+pixdiff-newarch'
if args.mode == 'train':
    callbacks = [
        CheckpointCallback(500000, save_path=f'./checkpoint_weights/{new_weights_filename}/', name_prefix=new_weights_filename),
    ]

    # model = PPO.load(old_weights_filename, env=env)
    model.learn(10000000, callback=callbacks, log_interval=5, tb_log_name=new_weights_filename)    
    model.save(new_weights_filename)    

elif args.mode == 'test':
    import logging

    model = PPO.load(old_weights_filename) #, env=env)

    obs = env.reset()

    testlog = logging.getLogger('testing')
    testlog.setLevel(logging.DEBUG)
    fh = logging.FileHandler('./test.log')
    testlog.addHandler(fh) 
    ch = logging.StreamHandler()
    testlog.addHandler(ch)
Example #25
env.add_car(car)

# Uncomment this if you've made any changes to the environment and want to make
# sure that everything is still okay (no output means everything is fine):

# check_env(env)
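# check_env runs SB3's environment checker: it validates the observation/action spaces and the
# reset()/step() return types, and prints warnings only when something looks wrong.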

# Uncomment one of the following depending on what you'd like to do

# A. Use an existing model
# model = PPO.load(model_dir + model_name)

# B. Create and train a new model
timesteps = 10000
model = PPO('MlpPolicy', env, tensorboard_log="./ppo/", verbose=1)
model.learn(total_timesteps=timesteps, callback=TensorboardCallback()) 
model.save(model_dir + model_name)

# Reset the env

env = Track()
car = Car()
env.add_car(car)

obs = env.reset(new=args.ifreset) # You can set new=True if you'd like to create a new track

# Run the simulation until the car crashes or finishes

done = False
while not done:
    action, _states = model.predict(obs) 
Example #26
def main():

    set_random_seed(RANDOM_SEED)

    t_start = time()
    name = "LargeFinalLayer"

    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)
    os.makedirs(checkpoint_path, exist_ok=True)

    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)

    results_path = os.path.join(checkpoint_path, "results.json")

    env_args = dict(
        frame_skip=4,
        screen_size=84,
        terminal_on_life_loss=True,
        clip_reward=True,
    )

    # Creates a gym environment for an atari game using the specified seed and number of environments
    # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
    # for improved performance..
    # train_env = make_atari_env(ENV_NAME, n_envs=N_ENVS, seed=RANDOM_SEED, wrapper_kwargs=env_args)

    def atari_wrapper(env: gym.Env) -> gym.Env:
        env = AtariWrapper(env, **env_args)
        return env

    def make_env(rank: int, count: int) -> VecEnv:
        return make_vec_env(
            ENV_NAME,
            n_envs=count,
            seed=RANDOM_SEED + rank,
            start_index=0,
            monitor_dir=None,
            wrapper_class=atari_wrapper,
            env_kwargs=None,
            vec_env_cls=None,
            vec_env_kwargs=None,
            monitor_kwargs=None,
        )

    train_env = make_env(0, N_ENVS)
    eval_env = make_env(1, 1)

    # required by models in baselines
    train_env = VecTransposeImage(train_env)
    eval_env = VecTransposeImage(eval_env)
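    # VecTransposeImage converts image observations from HxWxC to CxHxW, the channel-first
    # layout expected by the CNN policy.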

    # setup callback to save model at fixed intervals
    save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ,
                                       save_path=checkpoint_path,
                                       name_prefix=name)
    stop_callback = StopTrainingOnRewardThreshold(
        reward_threshold=EVAL_THRESHOLD)
    time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
    best_callback = EvalCallback(
        eval_env,
        eval_freq=EVAL_FREQ,
        best_model_save_path=checkpoint_path,
        callback_on_new_best=stop_callback,
    )
    list_callback = CallbackList([save_callback, best_callback, time_callback])

    model = PPO(
        CnnPolicy,
        train_env,
        verbose=VERBOSE,
        batch_size=BATCH_SIZE,
        seed=RANDOM_SEED,
        tensorboard_log=log_path,
        learning_rate=LEARNING_RATE,
        n_steps=UPDATE_STEPS,
        n_epochs=N_EPOCHS,
        ent_coef=ENT_COEF,
        vf_coef=VF_COEF,
        clip_range=CLIP_RANGE,
        device=DEVICE_TYPE,
        policy_kwargs=dict(features_extractor_class=FeatureExtractor),
    )

    config_path = os.path.join(checkpoint_path, "cnn_config")
    zip_path = os.path.join(checkpoint_path, "model.zip")

    # output the model config to a file for easier viewing
    with open(config_path, "w") as file:
        file.write(f"{name}\n")
        file.write(str(model.policy.features_extractor.cnn))

    print("Beginning training...")

    model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
    # model.learn(TRAIN_STEPS, tb_log_name="run")
    model.save(zip_path)

    del train_env
    # del eval_env

    time_taken = time() - t_start

    print("Beginning evaluation...")

    # score of the game, standard deviation of multiple runs
    reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

    with open(results_path, "w") as handle:
        handle.write(json.dumps((reward_mean, reward_std, time_taken)))
Example #27
opt_reward, std_reward = evaluate_policy(opt, env, n_eval_episodes=100)
mean_reward = mean_reward / opt_reward
std_reward = std_reward / opt_reward
leaderboard("MSY", ENV, mean_reward, std_reward, url)
print("algo:", "MSY", "env:", ENV, "mean reward:", mean_reward, "std:",
      std_reward)

## PPO ######################################################################

# load best tuned parameters...
model = PPO('MlpPolicy',
            vec_env,
            verbose=0,
            tensorboard_log=tensorboard_log,
            seed=seed)
model.learn(total_timesteps=300000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
# Rescale score against optimum solution in this environment
opt = escapement(env)
opt_reward, std_reward = evaluate_policy(opt, env, n_eval_episodes=100)
mean_reward = mean_reward / opt_reward
std_reward = std_reward / opt_reward

leaderboard("PPO", ENV, mean_reward, std_reward, url)
print("algo:", "PPO", "env:", ENV, "mean reward:", mean_reward, "std:",
      std_reward)

## simulate and plot results
df = env.simulate(model, reps=10)
env.plot(df, "results/ppo.png")
policy = env.policyfn(model, reps=10)
Example #28
        return observation


if __name__ == "__main__":
    from stable_baselines3 import PPO, DQN
    import os
    import time

    model_name = f"snake_{int(time.time())}"
    models_dir = f"models/{model_name}/"
    logdir = f"logs/{model_name}/"
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    env = SnekEnv()
    env.reset()

    model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=logdir)
    # model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=logdir)

    TIMESTEPS = 10000
    iters = 0
    while True:
        iters += 1
        model.learn(total_timesteps=TIMESTEPS,
                    reset_num_timesteps=False,
                    tb_log_name="PPO")
        # save under a new name each iteration so earlier checkpoints are not overwritten
        model.save(f"{models_dir}/{TIMESTEPS * iters}")

    cv2.destroyAllWindows()
Example #29
                lambd.append(env.get_lambd())
                N.append(env.get_N())
                #print (env.get_lambd(), env.get_N())
                with open(f"./{args.folder}/buffers/lambda.npy", "wb") as fp:
                    pickle.dump(lambd, fp)
                with open(f"./{args.folder}/buffers/N.npy", "wb") as fp:
                    pickle.dump(N, fp)
                # model.learn(total_timesteps=5000, log_interval=10,
                #            callback=callback, reset_num_timesteps=False)
            else:
                env.set_N(int(N[i]), list(lambd[i]))
                #print ("Lambda, N", N[i], lambd[i])

            if args.algo != 3 and args.algo != 4:
                model.learn(total_timesteps=args.eval_freq,
                            log_interval=10,
                            reset_num_timesteps=False)
                model_name = f"./{args.folder}/models/model_{args.algo}_{j}_{i}"
                model.save(model_name)
            # np.save(f"./{args.folder}/buffers/lambda_{args.algo}_{j}.npy", lambd)
            # np.save(f"./{args.folder}/buffers/N_{args.algo}_{j}.npy", N)
            if args.algo == 0:
                model = PPO.load(model_name, env)
            elif args.algo == 1:
                model = A2C.load(model_name, env)
            elif args.algo == 2:
                model = SAC.load(model_name, env)
            elif args.algo == 3:
                state = train_salmut(env, model, args.eval_freq, args, state,
                                     j)
        #parameters = atari_parameters if is_atari else regular_parameters
Example #30
model = PPO(
    "GnnPolicy",
    env,
    # reducing batch_size to 1
    n_steps=1024,
    verbose=1,
    tensorboard_log="runs",
    batch_size=32,
    learning_rate=1e-3,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    vf_coef=0.5,
    policy_kwargs={
        'mlp_extractor_kwargs': {
            'task_name': task_name,
            'xml_assets_path': None
        }
    },
)

mean_reward_before_train = evaluate(model, num_episodes=4)
model.learn(total_timesteps=2000000,
            tb_log_name='{}_{}'.format(
                task_name,
                datetime.now().strftime('%d-%m_%H-%M-%S')))
model.save("a2c_ant")
mean_reward = evaluate(model, num_episodes=4)
print(mean_reward_before_train)
print(mean_reward)