Example #1
def main():
    train_env = SubprocVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])
    model = PPO2("MlpPolicy", train_env, verbose=1)
    model.learn(total_timesteps=100000)
    test_env = DummyVecEnv([lambda: gym.make(ENV_ID)])

    state = test_env.reset()

    for i in range(200):
        test_env.render()
        action, _ = model.predict(state, deterministic=True)
        state, reward, done, info = test_env.step(action)
        if done:
            break

    test_env.close()
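
Several of these examples rely on a make_env(ENV_ID, i) factory that is never shown. Below is a minimal sketch of such a factory, assuming the usual gym/stable-baselines pattern; the seeding scheme and the set_global_seeds call are assumptions, not taken from the snippets themselves.

import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    # return a zero-argument callable that builds and seeds one environment
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)  # offset the seed per worker so parallel envs differ
        return env
    set_global_seeds(seed)
    return _init

SubprocVecEnv and DummyVecEnv both expect a list of such zero-argument callables, which is why every example wraps its environment in a lambda or a factory like this.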
Example #2
def stock_trade(stock_file):
    day_profits = []
    df = pd.read_csv(stock_file)
    df = df.sort_values('date')

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: StockTradingEnv(df)])

    model = PPO2(MlpPolicy,
                 env,
                 verbose=0,
                 tensorboard_log='./log',
                 gamma=0.95,
                 n_steps=20,
                 learning_rate=2.5e-2)
    model.learn(total_timesteps=int(1e5))

    df_test = pd.read_csv(stock_file.replace('train', 'test'))

    env = DummyVecEnv([lambda: StockTradingEnv(df_test)])
    obs = env.reset()
    for i in range(len(df_test) - 1):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        profit = env.render()
        day_profits.append(profit)
        if done:
            break
    return day_profits
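
The day_profits list returned above lends itself to a quick visual sanity check. A minimal plotting sketch, assuming matplotlib is installed; the CSV path passed to stock_trade is hypothetical.

import matplotlib.pyplot as plt

profits = stock_trade('./stockdata/train/sample.csv')  # hypothetical path
plt.plot(profits, label='RL agent daily profit')
plt.xlabel('trading day')
plt.ylabel('profit')
plt.legend()
plt.show()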
Example #3
def DRL_prediction(df,
                   model,
                   name,
                   last_state,
                   iter_num,
                   unique_trade_date,
                   rebalance_window,
                   turbulence_threshold,
                   initial):
    ### make a prediction based on the trained model ###

    ## trading env
    trade_data = data_split(df, start=unique_trade_date[iter_num - rebalance_window], end=unique_trade_date[iter_num])
    env_trade = DummyVecEnv([lambda: StockEnvTrade(trade_data,
                                                   turbulence_threshold=turbulence_threshold,
                                                   initial=initial,
                                                   previous_state=last_state,
                                                   model_name=name,
                                                   iteration=iter_num)])
    obs_trade = env_trade.reset()

    for i in range(len(trade_data.index.unique())):
        action, _states = model.predict(obs_trade)
        obs_trade, rewards, dones, info = env_trade.step(action)
        if i == (len(trade_data.index.unique()) - 2):
            # print(env_test.render())
            last_state = env_trade.render()

    df_last_state = pd.DataFrame({'last_state': last_state})
    df_last_state.to_csv('results/last_state_{}_{}.csv'.format(name, i), index=False)
    return last_state
Example #4
def run_PPO2_model(test_data_file, model_path):
    model = PPO2.load(model_path)

    day_profits = []
    buy_hold_profit = []

    df_test = pd.read_csv(test_data_file)

    env_test = DummyVecEnv([lambda: StockTradingEnv_US(df_test)])

    obs = env_test.reset()
    no_of_shares = 0
    buy_hold_commission = 0
    for i in range(len(df_test) - 1):
        if i > 22:
            break
        action, _states = model.predict(obs)
        obs, rewards, done, info = env_test.step(action)
        profit = env_test.render()
        day_profits.append(profit)
        if i == 0:
            buy_hold_profit.append(0)
            no_of_shares = INITIAL_ACCOUNT_BALANCE // df_test.iloc[0]['Close']
            buy_hold_commission = no_of_shares * df_test.iloc[0][
                'Close'] * 0.001
            print('Buy ' + str(no_of_shares) + ' shares and hold')
        else:
            buy_hold_profit_per_step = no_of_shares * (
                df_test.iloc[i]['Close'] -
                df_test.iloc[0]['Close']) - buy_hold_commission
            buy_hold_profit.append(buy_hold_profit_per_step)
            print('Buy and Hold: ' + '*' * 40)
            print('No of shares: ' + str(no_of_shares) +
                  ' average cost per share ' + str(df_test.iloc[0]['Close']))
            print('profit is ' + str(buy_hold_profit_per_step))
Example #5
def main():

    cmd_parser = cmd_parse()
    options = cmd_parser.parse_args()

    ## Get the Stock Ticker data ##
    # print("The Stock ticker used here is ", options.ticker)

    file = Path("./data/" + options.ticker + ".csv")
    if file.is_file():
        df = pd.read_csv('./data/' + options.ticker + '.csv')
        df = df.sort_values('Date')
        print("Loading ticker data from: " + "./data/" + options.ticker +
              ".csv")
    else:
        print(
            "Data file for ticker does not exist. Please download data first to ./data/"
            + options.ticker + ".csv")
        return
    training_logs_path = options.output_file + "_training_logs.csv"
    eval_logs_path = options.output_file + "_eval_logs"

    ## Get the training set size ##
    print("The options.training_set_size is ", options.training_set_size)

    ## Get the number of look back days ##
    print("The options.look-back-days here is: ", options.look_back_days)

    ## Get the model we are using to train the agent ##
    print("The model to train the agent here is: ", options.model)

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([
        lambda: StockTradingEnv(df, options.look_back_days,
                                options.training_set_size, eval_logs_path)
    ])

    if options.model == "PPO2":
        model = PPO2(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=options.training_set_size)

    np.savetxt(training_logs_path, model.training_rewards, delimiter=",")
    obs = env.reset()
    for i in range(options.training_set_size, len(df['Date'])):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render(title=options.ticker)
    env.close()
Example #6
def main():
    train_env = SubprocVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])
    model = PPO2('MlpPolicy', train_env, verbose=1)
    model.learn(total_timesteps=10000)
    test_env = DummyVecEnv([lambda: gym.make(ENV_ID)])

    state = test_env.reset()
    for i in range(200):
        test_env.render()
        action, _ = model.predict(state)
        state, rewards, done, info = test_env.step(action)

        # episode finished
        if done:
            break

    # close the environment
    test_env.close()
def test(obj, gif_steps):
    if isinstance(obj, str):
        model, env = load_from_name(obj, best=True)
    else:
        raise ValueError("Unexpected input type to load")
    
    # print("ENV", env)
    # env.env.env.early_low_termination = True
    env = DummyVecEnv([lambda: env])
    
    obs = env.reset()
    if gif_steps == 0:
        total_reward = 0
        ep_len = 0
        for _ in range(10000):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            ep_len += 1
            if done:
                print("EP length:", ep_len, " reward:", total_reward)
                total_reward = 0
                ep_len = 0
                obs = env.reset()
            frame = env.render()
    else:
        gif_frames = list()
        rewards = list()
        for i in range(gif_steps):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if 'frames' in info[0]:
                gif_frames.extend(info[0]['frames'])
        
            frame = env.render(mode='rgb_array')
            gif_frames.append(frame)
            rewards.append(reward)

        print("REWARD", sum(rewards))
        import imageio
        render_path = os.path.join(RENDERS, obj + '.gif')
        print("saving to ", render_path)
        os.makedirs(os.path.dirname(render_path), exist_ok=True)
        imageio.mimsave(render_path, gif_frames[::5], subrectangles=True, duration=0.05)
    def test_gym_integration(self):
        g = nx.Graph()
        g.add_edges_from([(0,1)])
        nx.set_node_attributes(g, {0: (0,1), 1: (1,2)}, "coords")
        orders = [(1,1,1,1,0.5)]
        drivers = np.ones((2), dtype=int)

        env_id = "TaxiEnvBatchTest-v01"
        gym.envs.register(
            id=env_id,
            entry_point='gym_taxi.envs:TaxiEnvBatch',
            kwargs={
                'world': g,
                'orders': orders,
                'order_sampling_rate': 1,
                'drivers_per_node': drivers,
                'n_intervals': 10,
                'wc': 0.5
            }
        )

        DATA_PATH = os.path.join(os.environ['ALLDATA_PATH'], "macaoFiles", "taxi_env_batch_test")
        if os.path.isdir(DATA_PATH):
            shutil.rmtree(DATA_PATH)
        os.makedirs(DATA_PATH)

        def make_env():
            env = gym.make(env_id)
            env.seed(1)
            return env
        env = DummyVecEnv([make_env])

        model = A2C(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=10)

        obs = env.reset()
        images = []
        img = env.render(mode="rgb_array")
        images.append(img)
        for _ in range(10):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            images.append(env.render(mode="rgb_array"))
        imageio.mimwrite(os.path.join(DATA_PATH, 'taxi_batch_a2c.gif'), [np.array(img) for i, img in enumerate(images)], format="GIF-PIL", fps=5)
Example #9
def main():
    train_env = DummyVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])
    model = PPO2('CnnPolicy', train_env, verbose=0)
    model.learn(total_timesteps=1280000, callback=callback)

    test_env = DummyVecEnv([make_env(ENV_ID, 9)])

    state = test_env.reset()
    total_reward = 0
    while True:
        test_env.render()
        time.sleep(1 / 60)
        action, _ = model.predict(state)
        state, reward, done, info = test_env.step(action)

        total_reward += reward[0]
        if done:
            print('reward:', total_reward)
            state = test_env.reset()
            total_reward = 0
Example #10
def run():
    """
    Run a trained PPO2 model on the CarRacing-v0 environment
    """
    env = gym.make('CarRacing-v0')
    env = DummyVecEnv([lambda: env])

    # model = PPO2.load("CarRacing_model_PPO1_"+ str(5) +".pkl", env)
    model = PPO2.load("CarRacing_model_PPO2_5.pkl", env)
    avg_rew = evaluate(model=model, env=env, num_steps=10000)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)

            episode_rew += rew
        print("Episode reward", episode_rew)
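
run() also calls an evaluate helper that is not shown. A minimal sketch of what such a helper might look like for a vectorized environment; only the call signature is taken from the snippet above, the body is an assumption.

import numpy as np

def evaluate(model, env, num_steps=1000):
    # rough estimate of mean per-episode reward over num_steps steps
    obs = env.reset()
    episode_rewards, current = [], 0.0
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, rewards, dones, _info = env.step(action)
        current += rewards[0]
        if dones[0]:
            episode_rewards.append(current)
            current = 0.0
    mean_reward = float(np.mean(episode_rewards)) if episode_rewards else current
    print('Mean reward:', mean_reward, 'over', len(episode_rewards), 'episodes')
    return mean_reward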
def run_baseline_ddpg(env_name, train=True):
    import numpy as np
    # from stable_baselines.ddpg.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    env = gym.make(env_name)
    env = DummyVecEnv([lambda: env])

    if train:
        # mlp
        from stable_baselines.ddpg.policies import FeedForwardPolicy
        class CustomPolicy(FeedForwardPolicy):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                layers=[64, 64, 64],
                                                layer_norm=True,
                                                feature_extraction="mlp")

        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions)+0.15, sigma=0.3 * np.ones(n_actions))
        model = DDPG(CustomPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise, 
            tau=0.01, observation_range=(env.observation_space.low, env.observation_space.high),
            critic_l2_reg=0, actor_lr=1e-3, critic_lr=1e-3, memory_limit=100000)
        model.learn(total_timesteps=int(1e5))
        model.save("checkpoints/ddpg_" + env_name)

    else:
        model = DDPG.load("checkpoints/ddpg_" + env_name)

        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            print("state: ", obs, " reward: ", rewards, " done: ", dones, "info: ", info)

    del model # remove to demonstrate saving and loading
Example #12
def test():
    env = DummyVecEnv([lambda: PrticleEnv(alpha, win_thre)])

    model = PPO2.load("ppo2_particle")

    # Enjoy trained agent
    obs = env.reset()
    dones = False
    x = 0
    y = 0
    x_prev = 20
    y_prev = 20
    while not dones:
        if int(x) != int(x_prev) or int(y) != int(y_prev):
            env.render()
        x_prev, y_prev = x, y

        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        x = obs[0][0]
        y = obs[0][2]
    env.close()
Example #13
    def test(self, model_epoch: int = 0, should_render: bool = True):
        train_provider, test_provider = self.data_provider.split_data_train_test(
            self.train_split_percentage)

        del train_provider

        init_envs = DummyVecEnv(
            [make_env(test_provider) for _ in range(self.n_envs)])

        model_path = path.join('data', 'agents',
                               f'{self.study_name}__{model_epoch}.pkl')
        model = self.Model.load(model_path, env=init_envs)

        test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

        self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

        zero_completed_obs = np.zeros((self.n_envs, ) +
                                      init_envs.observation_space.shape)
        zero_completed_obs[0, :] = test_env.reset()

        state = None
        rewards = []

        for _ in range(len(test_provider.data_frame)):
            action, state = model.predict(zero_completed_obs, state=state)
            obs, reward, _, __ = test_env.step([action[0]])

            zero_completed_obs[0, :] = obs

            rewards.append(reward)

            if should_render:
                test_env.render(mode='human')

        self.logger.info(
            f'Finished testing model ({self.study_name}__{model_epoch}): ${"{:.2f}".format(np.sum(rewards))}'
        )
Example #14
def view_ppo2_mlplstm():
    env = DummyVecEnv([lambda: gimbal(5, 500)])
    model = PPO2.load("./models/baseline_ppo2_t6_dynamicR")
    success_rate = 0
    reward_avg = 0
    for episodes in range(50):
        obs = env.reset()
        state = None
        done = [False]
        r = 0
        while True:
            action, state = model.predict(obs, state=state, mask=done)
            obs, rewards, done, _ = env.step(action)
            r += rewards
            env.render()
            if done:
                if r > -100:
                    success_rate += 1
                    reward_avg += r
                break
    print("Success rate: ", success_rate, "Avg rewards: ",
          (reward_avg / success_rate))
Example #15
def sb_lstm():
    train_env = DummyVecEnv([
        lambda: Environment(mode="train",
                            interval=INTERVAL,
                            pair=PAIR,
                            algo=ALGO,
                            data_features=FEATURES)
    ])
    model = PPO2('MlpLstmPolicy', train_env, nminibatches=1, verbose=1)
    model.learn(TRAIN_TIMESTEPS)
    validate_env = DummyVecEnv([
        lambda: Environment(mode="validate",
                            interval=INTERVAL,
                            pair=PAIR,
                            algo=ALGO,
                            data_features=FEATURES)
    ])
    obs = validate_env.reset()
    state = None
    done = [False for _ in range(validate_env.num_envs)]
    for _ in range(len(validate_env.envs[0].df)):
        action, state = model.predict(obs, state=state, mask=done)
        obs, reward, done, _ = validate_env.step(action)
        validate_env.render()
Example #16
def test_cartpole():
    env = gym.make('CartPole-v0')
    env = DummyVecEnv([lambda: env])

    model = PPO2(MlpPolicy, env)
    model.learn(total_timesteps=100000)

    rewards = []
    for i in range(10):
        done = False
        cum_rewards = 0
        obs = env.reset()
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            cum_rewards += reward
            env.render()
        rewards.append(cum_rewards)
        print(cum_rewards)
    avg_rewards = sum(rewards) / len(rewards)
    print('average', avg_rewards)
    assert avg_rewards >= 200

    env.close()
Example #17
def main():
    train_env = DummyVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])
    model = PPO2("CnnPolicy", train_env, verbose=0, cliprange=0.1)
    model = PPO2.load("./logs/best_model.zip", env=train_env, verbose=0)
    callback = SaveOnBestTrainingRewardCallback(check_freq=10, log_dir=log_dir)
    #model.learn(total_timesteps=1280000, callback=callback)

    test_env = DummyVecEnv([make_env(ENV_ID, 9)])

    state = test_env.reset()
    total_reward = 0
    while True:
        test_env.render()
        time.sleep(1 / 60)

        action, _ = model.predict(state)

        state, reward, done, info = test_env.step(action)

        total_reward += reward[0]
        if done:
            print(f"reward:{total_reward}")
            state = test_env.reset()
            total_reward = 0
Example #18
def main():
    # create the training environment
    train_env = DummyVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])

    # create the model
    model = PPO2('CnnPolicy', train_env, verbose=0, cliprange=0.1)

    # load the model
    # model = PPO2.load('breakout_model', env=train_env, verbose=0)

    # train the model
    model.learn(total_timesteps=1280000, callback=callback)

    # create the test environment
    test_env = DummyVecEnv([make_env(ENV_ID, 9)])

    # test the model
    state = test_env.reset()
    total_reward = 0
    while True:
        # render the environment
        test_env.render()
        time.sleep(1 / 60)

        # model inference
        action, _ = model.predict(state)

        # execute one step
        state, reward, done, info = test_env.step(action)

        # episode finished
        total_reward += reward[0]
        if done:
            print('reward:', total_reward)
            state = test_env.reset()
            total_reward = 0
def main(model_name, algo, testRange, isTargetPositionFixed, isDiscrete):
    panda_env = PandaGraspGymEnv(urdfRoot=object_data.getDataPath(), isRendering=True, useIK=True, isDiscrete=isDiscrete,
                                 numControlledJoints=7, isTargetPositionFixed=isTargetPositionFixed)
    env = DummyVecEnv([lambda: panda_env])

    if algo == "DDPG":
        model = DDPG.load(model_name)
    else:
        model = DQN.load(model_name)
    obs = env.reset()

    images = []
    img = env.get_images()

    for i in range(testRange):
        images.append(img)
        action, _states = model.predict(obs, deterministic=True)
        print("Step: {} Action: {}".format(i, action))
        obs, rewards, done, info = env.step(action)
        env.render(mode='human')
        img = env.get_images()

    os.makedirs(gif_dir, exist_ok=True)
    imageio.mimsave(gif_dir + model_name + '.gif', [np.array(img[0]) for i, img in enumerate(images) if i % 2 == 0], fps=29)
Example #20
    def run(self, max_episodes=500, max_timesteps=10000):
        """
            Run the PPO RL algorithm provided by the stable-baselines library
            (https://github.com/hill-a/stable-baselines) and save the
            generated model back to the training job's S3 bucket
        """
        unity_file = self.download_unity_env()
        env = self.get_gym_env(unity_file)
        # ========================================== #

        # RL stable baselines algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: env])

        model = PPO2(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=max_timesteps)

        obs = env.reset()
        for i in range(max_episodes):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()

        sb_model_path = os.path.join('/tmp', 'ppo2_rldemo_sb')
        model.save(sb_model_path)

        # Note: the content of /opt/ml/model and /opt/ml/output is automatically uploaded
        # to previously selected bucket (by the estimator) at the end of the execution
        # os.environ['SM_MODEL_DIR'] corresponds to /opt/ml/model
        model_path = os.path.join(os.environ['SM_MODEL_DIR'], 'ppo2_rldemo')

        # Note: this model can not be directly employed in Unity ml-agents
        #       it has to be converted into Barracuda format
        generate_checkpoint_from_model(sb_model_path, model_path)

        # ========================================== #
        BaselinePPOTrainer.close_env(env)
Example #21
def train_multitask(df,
                    unique_trade_date,
                    timesteps=10,
                    policy="MlpPolicy",
                    model_name="multitask"):
    # df of all intermixed values
    # get out the individual tickers and switch out the dates
    # timesteps = num training steps per date
    start = time.time()
    df = data_split(df,
                    start=unique_trade_date[0],
                    end=unique_trade_date[len(unique_trade_date) - 1])
    last_state, initial = [], True
    model = None
    for i in range(len(unique_trade_date) - 2):
        for ticker in df["tic"].unique():
            # Interval is every two days so we can optimize on the change in account value
            start_date, end_date = unique_trade_date[i], unique_trade_date[i + 2]
            quanta_df = data_split(df, start=start_date, end=end_date)
            quanta_df = quanta_df[quanta_df["tic"] == ticker]
            if len(quanta_df.index) < 2:
                continue
            quanta_df = quanta_df.reset_index()
            quanta_env = DummyVecEnv([
                lambda: StockEnvTrade(quanta_df,
                                      previous_state=last_state,
                                      initial=initial,
                                      log_interval=1)
            ])
            quanta_env.reset()
            model = train_PPO_update(model,
                                     quanta_env,
                                     timesteps,
                                     policy=policy)
            last_state = quanta_env.render()
        initial = False

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    end = time.time()
    print('Training time (Multitask): ', (end - start) / 60, ' minutes')
    return model
Example #22
    def DRL_prediction(self, model, name, last_state, iter_num,
                       turbulence_threshold, initial):
        ### make a prediction based on the trained model ###

        ## trading env
        trade_data = data_split(
            self.df,
            start=self.unique_trade_date[iter_num - self.rebalance_window],
            end=self.unique_trade_date[iter_num])
        trade_env = DummyVecEnv([
            lambda: StockTradingEnv(trade_data,
                                    self.stock_dim,
                                    self.hmax,
                                    self.initial_amount,
                                    self.buy_cost_pct,
                                    self.sell_cost_pct,
                                    self.reward_scaling,
                                    self.state_space,
                                    self.action_space,
                                    self.tech_indicator_list,
                                    turbulence_threshold=turbulence_threshold,
                                    initial=initial,
                                    previous_state=last_state,
                                    model_name=name,
                                    mode='trade',
                                    iteration=iter_num,
                                    print_verbosity=self.print_verbosity)
        ])

        trade_obs = trade_env.reset()

        for i in range(len(trade_data.index.unique())):
            action, _states = model.predict(trade_obs)
            trade_obs, rewards, dones, info = trade_env.step(action)
            if i == (len(trade_data.index.unique()) - 2):
                # print(env_test.render())
                last_state = trade_env.render()

        df_last_state = pd.DataFrame({'last_state': last_state})
        df_last_state.to_csv('results/last_state_{}_{}.csv'.format(name, i),
                             index=False)
        return last_state
Example #23
def drl_prediction(df, model, name, last_state, turbulence_threshold):
    """Make a prediction based on trained model."""
    trade_data = data_split(df, start=20160102, end=2021010)
    env_trade = DummyVecEnv([
        lambda: StockEnvTrade(trade_data,
                              turbulence_threshold=turbulence_threshold,
                              previous_state=last_state,
                              model_name=name)
    ])
    obs_trade = env_trade.reset()

    for i in range(len(trade_data.index.unique())):
        action, _states = model.predict(obs_trade)
        obs_trade, rewards, dones, info = env_trade.step(action)
        if i == (len(trade_data.index.unique()) - 2):
            last_state = env_trade.render()

    df_last_state = pd.DataFrame({'last_state': last_state})
    df_last_state.to_csv('results/last_state_{}.csv'.format(name), index=False)
    return last_state
Example #24
def stock_trade(stock_file_train):
    df_train = pd.read_csv(stock_file_train)
    df_train = df_train.sort_values('date')

    # The algorithms require a vectorized environment to run
    env_train = DummyVecEnv([lambda: StockTradingEnv(df_train)])

    model = PPO2(MlpPolicy, env_train, verbose=0, tensorboard_log='./log')
    # model = DQN("MlpPolicy", env_train, verbose=0, tensorboard_log='./log')
    model.learn(total_timesteps=int(1e4))

    # -----------------Test Model --------------------------------------
    day_profits = []
    buy_hold_profit = []

    df_test = pd.read_csv(stock_file_train.replace('train', 'test'))

    env_test = DummyVecEnv([lambda: StockTradingEnv(df_test)])
    obs = env_test.reset()
    no_of_shares = 0
    for i in range(len(df_test) - 1):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env_test.step(action)
        profit = env_test.render()
        day_profits.append(profit)
        if i == 0:
            buy_hold_profit.append(0)
            no_of_shares = INITIAL_ACCOUNT_BALANCE // df_test.iloc[0]['close']
            print('Buy ' + str(no_of_shares) + ' shares and hold')
        else:
            buy_hold_profit.append(
                no_of_shares *
                (df_test.iloc[i]['close'] - df_test.iloc[i - 1]['close']))
        if done:
            break
    return day_profits, buy_hold_profit
Example #25
def fund_trade(ts_code):
    day_profits = []
    df = read_fund_nav(ts_code, 3000).head(-30)
    df = df.sort_values(by='end_date', ascending=True).reset_index(drop=True)
    # TODO: the last row contains an estimated NAV and causes an error; drop it for now
    df.drop([len(df) - 1], inplace=True)

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: FundTradingEnv(df)])
    model = PPO2(MlpPolicy, env, verbose=0, tensorboard_log='./log', seed=1)
    model.learn(total_timesteps=int(1e4))
    print("Start testing")
    df_test = df.tail(30).reset_index(drop=True)

    env = DummyVecEnv([lambda: FundTradingEnv(df_test)])
    obs = env.reset()
    for i in range(len(df_test) - 1):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        profit = env.render()
        day_profits.append(profit)
        if done:
            break
    return day_profits
import gym
from gym import spaces
import numpy as np

from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

# n_cpu = 4
# total_timesteps = 200000000
# # total_timesteps = 200000
# env = SubprocVecEnv([lambda: gym.make('WalkingSpider-v0') for i in range(n_cpu)])
# model = PPO2(MlpPolicy, env, verbose=1)
# model.learn(total_timesteps=total_timesteps)
# model.save("experience_learned/ppo2_WalkingSpider_v0_testing")
# del model # remove to demonstrate saving and loading

# # # Enjoy trained agent
model = PPO2.load("experience_learned/ppo2_WalkingSpider_v0_testing_3")
print("Enjoy trained agent")
env = DummyVecEnv([lambda: gym.make('WalkingSpider-v0')])
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

# Random Environment
# env = gym.make('WalkingSpider-v0')
# env.reset()
# for _ in range(1000):
#     env.render()
#     observation, reward, done, info = env.step(env.action_space.sample()) # take a random action

# print("Obs Shape ", observation, " Action Shape ", env.action_space.sample().shape)
Example #27
        model.save(path.join(SAVE_DIR, model_name), cloudpickle=True)

        obs = testEnv.reset()

        # Test for up to 365 * 5 consecutive days
        for testNo in range(365 * 5):
            action, _states = model.predict(obs)
            if np.isnan(action).any():
                print(testNo)
            obs, rewards, done, info = testEnv.step(action)
            if done:
                print("Done")
                break
            profit_list.append(info[0]['profit'])
            act_profit_list.append(info[0]['actual_profit'])
            singleDay_record = testEnv.render(mode="detail")
            singleDay_record['testNo'] = testNo
            singleDay_record['rewards'] = rewards[0]
            detail_list.append(singleDay_record)

            if testNo % 365 == 0:
                print("\n============= TESTING " + str(testNo) +
                      " =============\n")
                testEnv.render()

        detail_fileName = detail_fileName_model[:-5] + str(tstep) + '-' + str(
            modelNo) + detail_fileName_model[-4:]
        pickle.dump(detail_list,
                    open(path.join(SAVE_DIR, detail_fileName), "wb"))

        final_result.append({
def stock_trade_US(stock_file_train, no_of_test_trading_days):
    df_train = pd.read_csv(stock_file_train)
    # df_train = df_train.sort_values('date')

    # The algorithms require a vectorized environment to run
    env_train = DummyVecEnv([lambda: StockTradingEnv_US(df_train)])

    total_timesteps = int(4e4)
    # total_timesteps = int(1e5)

    model = PPO2('MlpPolicy',
                 env_train,
                 verbose=0,
                 tensorboard_log='./log',
                 seed=12345).learn(total_timesteps=total_timesteps)

    # Evaluate the trained agent (optional)
    # mean_reward, std_reward = evaluate_policy(model, env_train, n_eval_episodes=100)
    # print(f"after training, mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

    # -----------------Test Model --------------------------------------

    import sys
    sys.stdout = open(
        f'./output/output_SPY_{total_timesteps}_days_{no_of_test_trading_days}.txt',
        'wt')

    day_profits = []
    buy_hold_profit = []

    df_test_raw = pd.read_csv(stock_file_train.replace('train', 'test'))
    # start from a random day
    # df_test = df_test_raw.iloc[200:].reset_index(drop=True)
    df_test = df_test_raw

    df_test = df_test.drop(['Adj Close'], axis=1)

    env_test = DummyVecEnv([lambda: StockTradingEnv_US(df_test)])
    obs = env_test.reset()
    no_of_shares = 0
    buy_hold_commission = 0
    for n in range(len(df_test) - 1):
        if n > no_of_test_trading_days:
            break

        action, _states = model.predict(obs)

        # let agent start with a buy all
        # if n == 0:
        #     action[0][0] = 0
        #     action[0][1] = 1

        obs, rewards, done, info = env_test.step(action)
        profit = env_test.render()
        day_profits.append(profit)

        if n == 0:
            buy_hold_profit.append(0)
            no_of_shares = INITIAL_ACCOUNT_BALANCE // df_test.iloc[0]['Close']
            buy_hold_commission = no_of_shares * df_test.iloc[0][
                'Close'] * 0.001
            print('Buy ' + str(no_of_shares) + ' shares and hold')
        else:
            buy_hold_profit_per_step = no_of_shares * (
                df_test.iloc[n]['Close'] -
                df_test.iloc[0]['Close']) - buy_hold_commission
            buy_hold_profit.append(buy_hold_profit_per_step)
            print('Buy and Hold: ' + '*' * 40)
            print('No of shares: ' + str(no_of_shares) +
                  ' average cost per share ' + str(df_test.iloc[0]['Close']))
            print('profit is ' + str(buy_hold_profit_per_step))

        if done:
            break

    good_model = False
    if day_profits[-1] > buy_hold_profit[-1]:
        good_model = True

    return day_profits, buy_hold_profit, good_model, model, total_timesteps
Example #29
import pandas as pd

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import A2C

from env.BitcoinTradingEnv import BitcoinTradingEnv

df = pd.read_csv('./data/bitstamp.csv')
df = df.sort_values('Timestamp')

slice_point = int(len(df) - 50000)

train_df = df[:slice_point]
test_df = df[slice_point:]

train_env = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, serial=True)])

model = A2C(MlpPolicy, train_env, verbose=1, tensorboard_log="./tensorboard/")
model.learn(total_timesteps=200000)

test_env = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, serial=True)])

obs = test_env.reset()
for i in range(50000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = test_env.step(action)
    test_env.render(mode="system", title="BTC")

test_env.close()
Example #30
    def test(self,
             model_epoch: int = 0,
             render_env: bool = True,
             render_report: bool = True,
             save_report: bool = False):
        train_provider, test_provider = self.data_provider.split_data_train_test(
            self.train_split_percentage)

        del train_provider

        init_envs = DummyVecEnv(
            [make_env(test_provider) for _ in range(self.n_envs)])

        model_path = path.join('data', 'agents',
                               f'{self.study_name}__{model_epoch}.pkl')
        model = self.Model.load(model_path, env=init_envs)

        test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

        self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

        zero_completed_obs = np.zeros((self.n_envs, ) +
                                      init_envs.observation_space.shape)
        zero_completed_obs[0, :] = test_env.reset()

        state = None
        rewards = []

        for _ in range(len(test_provider.data_frame)):
            action, state = model.predict(zero_completed_obs, state=state)
            obs, reward, done, info = test_env.step([action[0]])

            zero_completed_obs[0, :] = obs

            rewards.append(reward)

            if render_env:
                test_env.render(mode='human')

            if done:
                net_worths = pd.DataFrame({
                    'Date': info[0]['timestamps'],
                    'Balance': info[0]['net_worths'],
                })

                net_worths.set_index('Date', drop=True, inplace=True)
                returns = net_worths.pct_change()[1:]

                if render_report:
                    qs.plots.snapshot(returns.Balance,
                                      title='RL Trader Performance')

                if save_report:
                    reports_path = path.join(
                        'data', 'reports',
                        f'{self.study_name}__{model_epoch}.html')
                    qs.reports.html(returns.Balance, file=reports_path)

        self.logger.info(
            f'Finished testing model ({self.study_name}__{model_epoch}): ${"{:.2f}".format(np.sum(rewards))}'
        )