Example #1
def record_video(env_id,
                 model,
                 video_length=300,
                 prefix='',
                 video_folder='videos/',
                 lstm=False):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start the video at step=0 and record 300 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    state = None
    for _ in range(video_length):
        if lstm:
            # recurrent policies expect a full batch of observations (one per training env)
            action, state = model.predict(np.tile(obs, (model.n_envs, 1)),
                                          state=state,
                                          deterministic=False)
            # keep the batch dimension so the single-env eval VecEnv accepts the action
            action = action[[0]]
        else:
            action, _ = model.predict(obs, deterministic=False)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
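
A minimal sketch of how the helper above might be invoked, assuming stable-baselines 2.x and a recurrent PPO2 policy; the environment, hyperparameters, and prefix below are illustrative, not part of the original snippet:

# Assumed imports (not shown in the original snippet)
import gym
import numpy as np
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecVideoRecorder

# Hypothetical usage: train a small recurrent model, then record a short rollout
train_env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
model = PPO2('MlpLstmPolicy', train_env, nminibatches=1, verbose=0)
model.learn(total_timesteps=10000)
record_video('CartPole-v1', model, video_length=300, prefix='ppo2-lstm', lstm=True)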
Example #2
def main():
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    env = gym.make("SegmentationEnv-v0", 
        objs_dir=args.objs_dir, 
        max_scenes=args.max_scenes,
        sample_size=args.sample_size,
        diff_punishment=args.diff_punishment,
        max_steps_per_scene=args.max_steps_per_scene,
        scene_mode=args.scene_mode,
        training=False,
        point_mode=args.point_mode,
        voxel_size=args.voxel_size,
        voxel_mode=args.voxel_mode,
        single_scenes=args.single_scenes,
        early_diff=args.early_diff)
        
    env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)
    
    model = PPO2.load(save_path, env=env)
    
    n_episodes = 10
    for i in range(n_episodes):
        total_reward = 0
        obs = env.reset()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if done: 
                print("Total Reward: ", total_reward)
                break
    
    env.close()
Example #3
def main():
    alg_input = input("Select algorithm (PPO2 or A2C only):")
    if alg_input != "PPO2" and alg_input != "A2C" and alg_input != "ppo2" and alg_input != "a2c":
        print("Not an option (PPO2 or A2C only) !")
        alg_input = input("Select algorithm (PPO2 or A2C only):")
    model_input = "trained_agents\\" + input(
        "Select model to test(input filename, eg. a2c_wf_2):")

    env = gym.make("WARFLEET-v0")
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    log_dir = "./logs/"

    done = False
    stage_reward = 0
    turns = 0

    if alg_input == "PPO2" or alg_input == "ppo2":
        model = PPO2.load(model_input, env=env, tensorboard_log=log_dir)
    elif alg_input == "A2C" or alg_input == "a2c":
        model = A2C.load(model_input, env=env, tensorboard_log=log_dir)

    obs = env.reset()

    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        stage_reward += reward
        turns = turns + 1
        # env.render()

    print("Reward: {} /42".format(stage_reward))
    print("Turns: {}".format(turns))
    env.close()
Example #4
def learn(algorithm,
          environment_name,
          total_timesteps=1000,
          n_steps=128,
          gamma=0.99,
          nminibatches=4,
          verbose=0):
    global environment
    environment = gym.make('gym_threshold:' + environment_name)
    dummy_vec_environment = DummyVecEnv([lambda: environment])

    if algorithm == "PPO2":
        model = PPO2(MlpPolicy,
                     dummy_vec_environment,
                     verbose=verbose,
                     n_steps=n_steps,
                     gamma=gamma,
                     nminibatches=nminibatches,
                     cliprange_vf=-1,
                     tensorboard_log="tensorboard")
    else:
        raise AttributeError('No algorithm with name: {}'.format(algorithm))

    model.learn(
        total_timesteps=total_timesteps,
        tb_log_name=
        "algorithm: {}, n_steps: {}, nminibatches: {}, gamma: {} run".format(
            algorithm, n_steps, nminibatches, gamma),
        callback=tensorboard_callback)

    dummy_vec_environment.close()
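
tensorboard_callback is referenced but not defined in this snippet; a minimal placeholder, assuming the stable-baselines 2.x functional callback interface (the callback receives the training loop's local and global namespaces and returns True to continue training):

# Hypothetical stand-in for the undefined tensorboard_callback used above
def tensorboard_callback(_locals, _globals):
    # _locals['self'] is the model; custom TensorBoard scalars could be logged here
    return True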
Example #5
    def test_df(self, model, df, ohlc_df, train_len):
        train_df = df.iloc[:train_len].copy()
        test_df = df.iloc[train_len:].copy()
        train_ohlc = ohlc_df.iloc[:train_len].copy()
        test_ohlc = ohlc_df.iloc[train_len:].copy()

        # check test for train data
        test_env = DummyVecEnv([lambda: TradingEnv(train_df.drop('close', axis=1), train_ohlc, 1440)])
        obs = test_env.reset()
        done = False
        ac_data = None
        while not done:
            action, _states = model.predict(obs)
            obs, rewards, done, ac_data = test_env.step(action)
            # test_env.render()
        test_env.close()

        ac = ac_data[0]['ac']
        fee_ratio = round(ac.total_fee / ac.total_pl, 4) if ac.num_trade > 0 else 0
        print('pl=', ac.total_pl, 'num trade=', ac.num_trade, 'win_rate=', ac.win_rate, 'fee ratio=', fee_ratio)
        print('num market order=', ac.num_market_order)
        fig, ax1 = plt.subplots(figsize=(30, 30), dpi=200)
        ax1.plot(np.array(ac_data[0]['ac'].performance_total_pl_log).reshape(-1, 1), color='red', linewidth=3.0, label='pl')
        ax1.legend(loc="best", edgecolor="red")
        ax2 = ax1.twinx()
        ax2.plot(np.array(train_ohlc['close'].iloc[1440: len(ac_data[0]['ac'].performance_total_pl_log) + 1440]).reshape(-1, 1), label='close')
        h1, l1 = ax1.get_legend_handles_labels()
        h2, l2 = ax2.get_legend_handles_labels()
        ax2.legend(h1 + h2, l1 + l2, loc="best", frameon=True, edgecolor="blue")
        plt.show()
        return ac_data
Example #6
def main():
    env_id = "CartPole-v1"
    num_cpu = 8  # Number of processes to use
    training_steps = int(1e4)
    agent_types = [m.__name__ for m in g_stable_agents]

    for agent_type in agent_types:
        model_name = "{0}-{1}".format(agent_type, env_id)

        print(model_name)
        policy = common_policies.MlpPolicy
        if agent_type == "DQN":
            policy = deepq.MlpPolicy
        # Create the vectorized environment
        env = DummyVecEnv([make_env(env_id, 0)])
        """
        if agent_type in ["DQN","PPO1","TRPO"]:
            env = DummyVecEnv([make_env(env_id, 0)])
        else:
            env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
        """
        agent = create_agent(agent_type)
        model = train(env, agent, policy, training_steps)
        model.save(os.path.join('./models', model_name))
        del model  #To make sure model is saved
        for e in env.envs:
            e.close()
            del e
        env.close()
        del env
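
make_env is not defined in this snippet; since DummyVecEnv and SubprocVecEnv expect a list of callables, a typical factory for this call pattern, given as a sketch under the assumption that it follows the standard stable-baselines multiprocessing recipe, would be:

# Hypothetical make_env factory matching the calls above: it returns a thunk so
# that DummyVecEnv/SubprocVecEnv can construct and seed each environment lazily
import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init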
Example #7
def main():
    env = gym.make("WARFLEET-v0")
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    log_dir = "./logs/"
    model = PPO2(MlpPolicy,
                 env,
                 verbose=1,
                 tensorboard_log=log_dir,
                 cliprange=0.1,
                 gamma=0.99,
                 ent_coef=0.001,
                 vf_coef=0.2)

    #model.learn(total_timesteps=10000000)
    #model.save("PPO2_wf_2")

    done = False
    stage_reward = 0
    input("Training is finished, press to play a game: ")

    model = PPO2.load("trained_agents/PPO2_wf_2",
                      env=env,
                      tensorboard_log=log_dir)

    obs = env.reset()

    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        stage_reward += reward
        # env.render()
    env.close()
Example #8
def train():
    def callback(_locals, _globals):
        # Save model
        _locals['self'].save(MODEL_NAME)

    envs = [create_env_headless for _ in range(ENV_COUNT)]
    vec_envs = SubprocVecEnv(envs)
    model = PPO2('CnnPolicy',
                 vec_envs,
                 verbose=1,
                 ent_coef=0.0001,
                 n_steps=256)

    if not os.path.isfile(MODEL_NAME):
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Run again to train")
    else:
        model.learn(total_timesteps=TIMESTEPS, callback=callback)
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Training Done")

        # Evaluation
        print("Evaluation")
        vec_env = create_env_headless()
        vec_env = DummyVecEnv([lambda: vec_env])
        model = PPO2.load(MODEL_NAME)
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        vec_env.close()
Example #9
def train():
    if not os.path.isdir("log/"):
        os.mkdir("log")

    if ENV_COUNT == 1:
        envs = create_env_headless()
        env_id = str(time.time())[-6:]
        envs = Monitor(envs,
                       "log/" + MODEL_NAME + "-" + env_id,
                       allow_early_resets=False)
        vec_envs = DummyVecEnv([lambda: envs])
    else:
        vec_envs = []

        def make_env():
            env_id = str(time.time())[-6:]
            env = create_env_headless()
            return Monitor(env,
                           "log/" + MODEL_NAME + "-" + env_id,
                           allow_early_resets=False)

        for _ in range(ENV_COUNT):
            vec_envs.append(make_env)
        vec_envs = SubprocVecEnv(vec_envs)

    model = PPO2('CnnPolicy',
                 vec_envs,
                 verbose=1,
                 ent_coef=0.0001,
                 n_steps=256)
    model.learn(total_timesteps=TIMESTEPS)
    model.save(MODEL_NAME)
    vec_envs.close()

    print("Learning Done!")
Example #10
def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/' + str(
            env_id) + './OURS-LOADED/noent_klcoeffanneal_samesgdsteps' + str(
                sgd_steps) + '_longer_wgae0.95_exp1_2_' + str(seed)
        #log_path = './experiments/'+str(env_id)+'./TRPO-3x/TRPOR-oldsampling/noent_klcoeff'+str(sgd_steps)+'_sgdstep_steps5_'+str(seed)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        #env = make_mujoco_env(env_id, workerseed)
        def make_env():
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            return env_out

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)  #, norm_reward=False, norm_obs=False)

        #env = VecNormalize(env)
        model = TRPO(MlpPolicy,
                     env,
                     timesteps_per_batch=2048,
                     max_kl=0.01,
                     cg_iters=10,
                     cg_damping=0.1,
                     entcoeff=0.0,
                     gamma=0.99,
                     lam=0.95,
                     vf_iters=5,
                     vf_stepsize=1e-3,
                     verbose=1,
                     seed=seed,
                     sgd_steps=sgd_steps,
                     klcoeff=klcoeff,
                     method="multistep-SGD")
        model.learn(total_timesteps=int(10e6))  # num_timesteps, seed=seed
        env.close()
Example #11
def record_video(env_id,
                 model,
                 video_length=500,
                 prefix='',
                 video_folder='videos/'):
    """
  :param env_id: (str)
  :param model: (RL model)
  :param video_length: (int)
  :param prefix: (str)
  :param video_folder: (str)
  """
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
Example #12
def run(learning_steps=4300,
        verbose=0,
        n_steps=128,
        nminibatches=4,
        gamma=0.99,
        learning_rate=2.5e-4,
        ent_coef=0.01,
        tensorboard_log="tensorboard"):
    global inner_env
    inner_env = gym.make(
        'gym_threshold:extended-state-semi-fixed-end-not-adapted-v0')
    env = DummyVecEnv([lambda: inner_env])

    model = PPO2(MlpLstmPolicy,
                 env,
                 verbose=verbose,
                 n_steps=n_steps,
                 nminibatches=nminibatches,
                 gamma=gamma,
                 ent_coef=ent_coef,
                 learning_rate=learning_rate,
                 tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=learning_steps,
                tb_log_name=os.path.basename(__file__).rstrip(".py"),
                callback=tensorboard_callback)

    env.close()
Example #13
def main():
    env_id = "CartPole-v1"
    num_cpu = 1  # Number of processes to use
    evaluation_steps_per_episode = 500
    evaluation_episodes = 10
    render = True
    agent_types = [m.__name__ for m in g_stable_agents]
    for agent_type in agent_types:
        model_name = "{0}-{1}".format(agent_type, env_id)

        print(model_name)
        policy = common_policies.MlpPolicy
        if agent_type == "DQN":
            policy = deepq.MlpPolicy
        # Create the vectorized environment
        env = DummyVecEnv([make_env(env_id=env_id, rank=0, seed=0)])
        agent = create_agent(agent_type)
        model = agent.load(os.path.join('./models', model_name))
        evaluate(env=env,
                 model=model,
                 num_episodes=evaluation_episodes,
                 num_steps=evaluation_steps_per_episode,
                 render=render)
        for e in env.envs:
            e.close()
            del e
        env.close()
        del env
Example #14
def run_test(config):
    """Stable baselines test

    Mandatory configuration settings:
        - 'continuous' agent
        - camera_settings enabled
        - stable_baselines enabled
    """
    env = None
    try:
        # Create Environment
        env = make_env(config)
        env = DummyVecEnv([lambda: env])

        # Initialize DDPG and start learning
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
        model = DDPG(CnnPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, random_exploration=0.8)
        model.learn(total_timesteps=10000)

    finally:
        if env:
            env.close()
        else:
            clear_carla(config.host, config.port)
        print("-----Carla Environment is closed-----")
Example #15
def learn(env_name, save_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO2(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=total_timesteps)
    model.save(save_file)
    del model
    env.close()
Example #16
def run_model(env_name: str, graphs: List[nx.DiGraph],
              demands: List[List[List[Tuple[np.ndarray, float]]]], model_path: str,
              replay_steps: int = 10,
              env_kwargs: Dict = {},
              parallelism: int = 4,
              policy_name: str = None):
    oblivious_routings = [routing_baselines.shortest_path_routing(graph) for graph in graphs]

    # make env
    env = lambda: gym.make(env_name,
                           dm_sequence=demands,
                           graphs=graphs,
                           oblivious_routings=oblivious_routings,
                           **env_kwargs)

    if policy_name == 'lstm':
        envs = DummyVecEnv([env] * parallelism)
    else:
        envs = DummyVecEnv([env])

    # load
    model = PPO2.load(model_path)

    # execute
    obs = envs.reset()
    state = None
    utilisations = []
    opt_utilisations = []
    oblivious_utilisations = []
    if env_name == 'ddr-iterative-v0':
        replay_steps = replay_steps * envs.envs[0].graphs[
            envs.envs[0].graph_index].number_of_edges()
        for i in range(replay_steps - 1):
            action, state = model.predict(obs, state=state, deterministic=True)
            obs, reward, done, info = envs.step(action)
            print(reward)
            print(info)
            if info[0]['iter_idx'] == 0:
                utilisations.append(info[0]['utilisation'])
                opt_utilisations.append(info[0]['opt_utilisation'])
                oblivious_utilisations.append(info[0]['oblivious_utilisation'])
    else:
        for i in range(replay_steps - 1):
            action, state = model.predict(obs, state=state, deterministic=True)
            obs, reward, done, info = envs.step(action)
            print(reward)
            print(action)
            print(info)
            utilisations.append(info[0]['utilisation'])
            opt_utilisations.append(info[0]['opt_utilisation'])
            oblivious_utilisations.append(info[0]['oblivious_utilisation'])
    print("Mean reward: ", np.mean(np.divide(utilisations, opt_utilisations)))
    print("Mean oblivious reward: ",
          np.mean(np.divide(oblivious_utilisations, opt_utilisations)))
    envs.close()

    return utilisations, opt_utilisations, oblivious_utilisations
Example #17
def train():
    env = DummyVecEnv([
        lambda: DemoEnv()
    ])  # DQN does not support parallelization through SubprocVecEnv
    model = DQN(MlpPolicy, env, verbose=1, policy_kwargs={'layers': [4]})
    model.learn(total_timesteps=int(2e5))
    model.save("deepq_DemoEnv")
    env.close()
    del model
Example #18
def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log):
    """
    Train TRPO model for the mujoco environment, for testing purposes
    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/' + str(
            env_id) + './SAC-M/nips_test19/m' + str(sgd_steps) + '_c' + str(
                0.5) + '_e' + str(klcoeff) + '_' + str(seed)
        #log_path = './experiments/'+str(env_id)+'./TRPO-3x/TRPOR-oldsampling/noent_klcoeff'+str(sgd_steps)+'_sgdstep_steps5_'+str(seed)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        #env = make_mujoco_env(env_id, workerseed)
        def make_env():
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            return env_out

        env = DummyVecEnv([make_env])
        env = VecNormalize(env, norm_reward=False, norm_obs=False)

        #env = VecNormalize(env)
        model = MDPO(MlpPolicy,
                     env,
                     gamma=0.99,
                     verbose=1,
                     seed=seed,
                     buffer_size=1000000,
                     ent_coef=1.0,
                     gradient_steps=sgd_steps,
                     lam=klcoeff,
                     train_freq=1,
                     tsallis_q=1,
                     reparameterize=True,
                     klconst=0.5)
        model.learn(
            total_timesteps=int(num_timesteps))  #num_timesteps, seed=seed)
        env.close()
Example #19
def record_video(model,
                 env_id=None,
                 eval_env=None,
                 max_video_length=500,
                 video_prefix='',
                 video_folder='videos/',
                 break_early=False,
                 is_recurrent=False):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param max_video_length: (int)
    :param video_prefix: (str)
    :param video_folder: (str)
    """

    # an explicitly passed eval_env takes precedence; otherwise one is built from env_id
    if eval_env is None:
        eval_env = DummyVecEnv([lambda: gym.make(env_id)])

    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=max_video_length,
                                name_prefix=video_prefix)

    # "state" must be defined before the first predict() call;
    # recurrent policies carry it between steps, feed-forward policies ignore it
    state = None

    # When using VecEnv, done is a vector
    is_single_env = (eval_env.num_envs == 1)
    doneVec = [False for _ in range(model.n_envs)]

    obs = eval_env.reset()

    for _ in range(max_video_length):

        # We need to pass the previous state and a mask for recurrent policies
        # to reset lstm state when a new episode begin
        action, state = model.predict(obs, state=state, mask=doneVec)

        # only allow recurrent models to continually update their state
        if not is_recurrent:
            state = None

        obs, _, done, _ = eval_env.step(action)

        if is_single_env:
            doneVec[0] = copy.deepcopy(done[0])
        else:
            doneVec = copy.deepcopy(done)

    # Close the video recorder
    eval_env.close()
Example #20
def evaluate():
    vec_env = create_env_headless()
    vec_env = DummyVecEnv([lambda: vec_env])
    model = PPO2.load(MODEL_NAME)
    print("After Training evaluation")
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    vec_env.close()
Example #21
def run(learning_steps=4300, verbose=0, gamma=0.99, learning_rate=5e-4, tensorboard_log="tensorboard"):
    global inner_env
    inner_env = gym.make('gym_threshold:semi-fixed-end-not-adapted-maintain-v0')
    env = DummyVecEnv([lambda: inner_env])

    model = DQN(MlpPolicy, env, prioritized_replay=True, verbose=verbose, learning_rate=learning_rate, gamma=gamma,
                tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=learning_steps, tb_log_name=os.path.basename(__file__).rstrip(".py"),
                callback=tensorboard_callback)

    env.close()
Example #22
def run(learning_steps=4300):
    global inner_env
    inner_env = gym.make('gym_threshold:threshold-intra_process-v0')
    env = DummyVecEnv([lambda: inner_env])

    model = PPO2(MlpPolicy, env, verbose=1, n_steps=128, nminibatches=4,
                 tensorboard_log="tensorboard")
    model.learn(total_timesteps=learning_steps, tb_log_name=os.path.basename(__file__).rstrip(".py"),
                callback=tensorboard_callback)

    env.close()
Example #23
def play(env_name, load_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO2.load(load_file, verbose=1)
    obs = env.reset()
    for i in range(total_timesteps):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render() # dummy
        if done:
            print(info[0]['episode'])
    del model
    env.close()
Example #24
def play():
    vec_env = create_env()
    vec_env = DummyVecEnv([lambda: vec_env])

    model = PPO2.load(MODEL_NAME)

    obs = vec_env.reset()
    game_count = 0
    while game_count < 1000:
        action = model.predict(obs)[0]
        obs, reward, done, info = vec_env.step(action)
        if done:
            game_count += 1
    vec_env.close()
Example #25
def main():
    reward_averages = []
    reward_std = []
    training_times = []
    total_env = 0
    for num_envs in NUM_ENVS:
        total_env += num_envs
        print(f'process:{num_envs}')

        if num_envs == 1:
            train_env = DummyVecEnv([lambda : gym.make(ENV_ID)])
        else:
            train_env = SubprocVecEnv([make_env(ENV_ID, i+total_env) for i in range(num_envs)], start_method='spawn')

        eval_env = DummyVecEnv([lambda: gym.make(ENV_ID)])

        rewards = []
        times = []
        for experiment in range(NUM_EXPERIMENTS):
            train_env.reset()
            model = PPO2('MlpPolicy', train_env,verbose=0)
            start = time.time()
            model.learn(total_timesteps=NUM_STEPS)
            times.append(time.time() - start)

            mean_reward = evaluate(model, eval_env, num_episodes=NUM_EPISODES)
            rewards.append(mean_reward)

        train_env.close()
        eval_env.close()

        reward_averages.append(np.mean(rewards))  # mean reward
        reward_std.append(np.std(rewards))  # standard deviation
        training_times.append(np.mean(times))  # mean training time

    # number of envs vs. mean reward
    plt.errorbar(NUM_ENVS, reward_averages, yerr=reward_std, capsize=2)
    plt.xlabel('number of envs')
    plt.ylabel('mean reward')
    plt.savefig('./data/process_mean.png')
    plt.show()

    # number of envs vs. steps per second
    training_steps_per_second = [NUM_STEPS / t for t in training_times]
    plt.bar(range(len(NUM_ENVS)), training_steps_per_second)
    plt.xticks(range(len(NUM_ENVS)), NUM_ENVS)
    plt.xlabel('number of envs')
    plt.ylabel('steps per second')
    plt.savefig('./data/process_step.png')
    plt.show()
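
The evaluate helper called in the experiment loop above is not shown; a minimal sketch, assuming it averages episode returns over the single-environment DummyVecEnv passed as eval_env:

# Hypothetical evaluate helper matching the call above: runs num_episodes
# episodes on a single-env VecEnv and returns the mean episode return
import numpy as np

def evaluate(model, env, num_episodes=100):
    episode_rewards = []
    for _ in range(num_episodes):
        obs = env.reset()
        done, episode_reward = False, 0.0
        while not done:
            action, _ = model.predict(obs)
            obs, rewards, dones, _ = env.step(action)
            episode_reward += rewards[0]
            done = dones[0]
        episode_rewards.append(episode_reward)
    return np.mean(episode_rewards)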
Example #26
def run(env_string, policy=MlpPolicy, learning_steps=4300, verbose=0, n_steps=128, nminibatches=4, gamma=0.99,
        learning_rate=2.5e-4, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, cliprange=0.2, cliprange_vf=None,
        lam=0.95, policy_kwargs=None, tensorboard_log="tensorboard"):
    global inner_env
    inner_env = gym.make(env_string)
    env = DummyVecEnv([lambda: inner_env])

    model = PPO2(policy=policy, env=env, verbose=verbose, n_steps=n_steps, nminibatches=nminibatches, gamma=gamma,
                 ent_coef=ent_coef, learning_rate=learning_rate, vf_coef=vf_coef, max_grad_norm=max_grad_norm,
                 cliprange=cliprange, cliprange_vf=cliprange_vf, lam=lam, policy_kwargs=policy_kwargs,
                 tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=learning_steps, tb_log_name=os.path.basename(__file__).rstrip(".py"),
                callback=tensorboard_callback)

    env.close()
Example #27
def main():

    cmd_parser = cmd_parse()
    options = cmd_parser.parse_args()

    ## Get the Stock Ticker data ##
    # print("The Stock ticker used here is ", options.ticker)

    file = Path("./data/" + options.ticker + ".csv")
    if file.is_file():
        df = pd.read_csv('./data/' + options.ticker + '.csv')
        df = df.sort_values('Date')
        print("Loading ticker data from: " + "./data/" + options.ticker +
              ".csv")
    else:
        print(
            "Data file for ticker does not exist. Please download data first to ./data/"
            + options.ticker + ".csv")
        return
    training_logs_path = options.output_file + "_training_logs.csv"
    eval_logs_path = options.output_file + "_eval_logs"

    ## Get the training set size ##
    print("The options.training_set_size is ", options.training_set_size)

    ## Get the number of look back days ##
    print("The options.look-back-days here is: ", options.look_back_days)

    ## Get the model we are using to train the agent ##
    print("The model to train the agent here is: ", options.model)

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([
        lambda: StockTradingEnv(df, options.look_back_days,
                                options.training_set_size, eval_logs_path)
    ])

    if options.model == "PPO2":
        model = PPO2(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=options.training_set_size)
    else:
        raise ValueError("Unsupported model: {}".format(options.model))

    np.savetxt(training_logs_path, model.training_rewards, delimiter=",")
    obs = env.reset()
    for i in range(options.training_set_size, len(df['Date'])):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render(title=options.ticker)
    env.close()
Example #28
    def train_drf(self, df, ohlc_df, train_len):
        train_df = df.iloc[:train_len].copy()
        test_df = df.iloc[train_len:].copy()
        train_ohlc = ohlc_df.iloc[:train_len].copy()
        test_ohlc = ohlc_df.iloc[train_len:].copy()

        env = DummyVecEnv([lambda: TradingEnv(train_df.drop('close', axis=1), train_ohlc, -1)])
        # env = SubprocVecEnv([make_env(train_provider, i) for i in range(4)])
        # model = PPO2(MlpLnLstmPolicy, env,  verbose=1, nminibatches=1, tensorboard_log=log_dir) # MlpLnLstmPolicy, CnnLnLstmPolicy
        model = PPO2(MlpLnLstmPolicy, env, verbose=1, nminibatches=1, tensorboard_log='./Model')  # MlpLnLstmPolicy, CnnLnLstmPolicy
        # %tensorboard --logdir log_dir
        # tb=TensorBoardColab(startup_waiting_time=1)
        # tb=SummaryWriter('./Graph')

        model.learn(total_timesteps=10000)
        env.close()
        model.save('./Model/rf_ppo2')
        return model
Example #29
def main():
    train_env = SubprocVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])
    model = PPO2('MlpPolicy', train_env, verbose=1)
    model.learn(total_timesteps=10000)
    test_env = DummyVecEnv([lambda: gym.make(ENV_ID)])

    state = test_env.reset()
    for i in range(200):
        test_env.render()
        action, _ = model.predict(state)
        state, rewards, done, info = test_env.step(action)

        # episode finished
        if done:
            break

    # close the environment
    test_env.close()
Example #30
def play():
    vec_env = create_env()
    vec_env = DummyVecEnv([lambda: vec_env])
    model = PPO2.load(MODEL_NAME)
    model.set_env(vec_env)

    game_count = 0
    wins = 0
    obs = vec_env.reset()
    while game_count < 100:
        action = model.predict(obs)[0]
        obs, reward, done, info = vec_env.step(action)
        if done:
            game_count += 1
            if reward == 1:
                wins += 1
            vec_env.reset()
    print(wins / game_count)
    vec_env.close()