Example #1
def optimize_agent(trial):
    env_params = optimize_envs(trial)
    print("Trial with params")
    print(env_params)
    train_env = DummyVecEnv(
        [lambda: BitcoinTradingEnv(train_df, **env_params)])
    test_env = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, **env_params)])

    model_params = optimize_ppo2(trial)
    # model = PPO2(MlpLnLstmPolicy, train_env, verbose=0, nminibatches=1,
    #              tensorboard_log="./tensorboard", **model_params)

    model = PPO2(MlpLnLstmPolicy,
                 train_env,
                 verbose=1,
                 nminibatches=1,
                 **model_params)

    last_reward = -np.finfo(np.float16).max
    evaluation_interval = int(len(train_df) / n_evaluations)

    for eval_idx in range(n_evaluations):
        print("Eval index: " + str(eval_idx))
        try:
            model.learn(evaluation_interval)
        except AssertionError:
            raise

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        obs = test_env.reset()
        while n_episodes < n_test_episodes:
            action, _ = model.predict(obs)
            obs, reward, done, _ = test_env.step(action)
            reward_sum += reward

            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = test_env.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
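
# --- Added sketch, not from the original snippet ---
# Shows one way optimize_agent could be driven by an Optuna study. The study
# name and SQLite storage mirror those used in the other examples here; the
# trial count is an assumption. Optuna minimizes by default, which matches the
# negated reward returned above.
study = optuna.create_study(study_name='ppo2_profit',
                            storage='sqlite:///params.db',
                            load_if_exists=True)
study.optimize(optimize_agent, n_trials=100)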
Example #2
    def test(self, model_instance: int = 0):

        study_name = 'ppo2_' + self.reward_strategy
        study = optuna.load_study(study_name=study_name,
                                  storage=self.params_db_file)
        params = study.best_trial.params

        test_env = DummyVecEnv([
            lambda: BitcoinTradingEnv(self.test_df,
                                      reward_func=self.reward_strategy,
                                      forecast_len=int(params['forecast_len']),
                                      confidence_interval=params[
                                          'confidence_interval'])
        ])

        model_params = self.model_params(params)

        model = PPO2.load(os.path.join(
            '.', 'agents',
            'ppo2_' + self.reward_strategy + '_' + str(model_instance) + '.pkl'),
                          env=test_env)

        obs, done = test_env.reset(), False
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = test_env.step(action)

            test_env.render(mode="human")
Example #3
def optimize_envs(trial):
    params = {
        'n_forecasts': int(trial.suggest_loguniform('n_forecasts', 4, 100)),
        'confidence_interval': trial.suggest_uniform('confidence_interval', 0.7, 0.99),
    }

    df = pd.read_csv('./data/coinbase_hourly.csv')
    df = df.drop(['Symbol'], axis=1)

    test_len = int(len(df) * 0.2)
    train_len = int(len(df)) - test_len

    train_df = df[:train_len]
    test_df = df[train_len:]

    train_env = DummyVecEnv([lambda: BitcoinTradingEnv(
        train_df, reward_func='profit', **params)])
    test_env = DummyVecEnv([lambda: BitcoinTradingEnv(
        test_df, reward_func='profit', **params)])

    return train_env, test_env
Example #4
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import functools
from env.BitcoinTradingEnv import BitcoinTradingEnv
from env.indicators import prepare_indicators
from REINFORCE import PolicyNetwork, update_policy

if __name__ == '__main__':

    sdf = prepare_indicators('data/bitstampUSD_1-min_data_2012-01-01_to_2019-08-12.csv')
    N = 500_000
    train_df = sdf[:N]

    train_env = BitcoinTradingEnv(train_df, lookback_window_size=60,
                                  commission=1e-4, initial_balance=1000,
                                  serial=False)
    
    input_dim, seq_length = train_env.observation_space.shape
    output_dim1 = train_env.action_space.nvec[0]
    output_dim2 = train_env.action_space.nvec[1]
    hidden_dim = 128
    lstm_layers = 2

    # choose device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #device = torch.device('cpu')
    print(f"Device used: {device}")

    policy_net = PolicyNetwork(input_dim, output_dim1, output_dim2, hidden_dim, n_layers=lstm_layers)
    # Loading the best model
    model_name = 'model/state_dict3.pt'
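
    # --- Added sketch, not from the original snippet ---
    # Presumed load step for the checkpoint named above; assumes PolicyNetwork
    # is a torch.nn.Module and that the .pt file holds a state_dict saved with
    # torch.save(policy_net.state_dict(), model_name).
    policy_net.load_state_dict(torch.load(model_name, map_location=device))
    policy_net.to(device)
    policy_net.eval()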
Example #5
# Hyperparameters
T_horizon      = 30
n_episodes     = 10000
print_interval = 2

config = {
    'lr':           0.0005,
    'gamma':        0.90,
    'lmbda':        0.95,
    'eps_clip':     0.1,
    'K_epoch':      3,
}

df = pd.read_csv('./data/1 Dec 2017 - 1 Dec 2018.csv')

test_env = BitcoinTradingEnv(df, serial=True)

print('observation space:', test_env.observation_space.shape)
print('action space:', test_env.action_space)

memory = Memory()

model = MODEL(c_in=test_env.observation_space.shape[0],
              c_out=test_env.action_space.n,
              seq_len=test_env.observation_space.shape[1])
model = model.to(device)

agent = PPO(model=model, memory=memory, config=config, device=device)

if os.path.exists('./save/model.m5'):
    agent.model.load_state_dict(torch.load('./save/model.m5'))
Example #6
print(params)

df = pd.read_csv('./data/coinbase_hourly.csv')
df = df.drop(['Symbol'], axis=1)
df = df.sort_values(['Date'])

test_len = int(len(df) * 0.2)
train_len = 100  # int(len(df)) - test_len

train_df = df[:train_len]
test_df = df[train_len:]

train_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(train_df,
                              n_forecasts=int(params['n_forecasts']),
                              confidence_interval=params['confidence_interval']
                              )
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam']
}

model = PPO2(MlpLstmPolicy,
             train_env,
Example #7
df = requestCandles(api, gran, from_, to, instr)

saveDf = True
filename = join('data', '{}.{}.out'.format(instr, gran))

if saveDf:
    df.to_csv(filename)

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len

train_df = df[:train_len]
test_df = df[train_len:]

# ====== ENVIRONMENT SETUP =======
trainEnv = DummyVecEnv([lambda: BitcoinTradingEnv(train_df)])

testEnv = DummyVecEnv([lambda: BitcoinTradingEnv(test_df)])

model_params = {
    'n_steps': 243,
    'gamma': 0.94715,
    'learning_rate': 0.00157,
    'ent_coef': 2.29869,
    'cliprange': 0.38388,
    'noptepochs': 35,
    'lam': 0.89837,
}

# This is stupid
if curr_idx == -1:
Example #8
    percentageToUse=mainparams.get('dataset_percentage'))

for td in testDirs:

    params = getConfiguration(join(td, 'config.yaml'))

    # ====== IMPORT MODEL ======
    modelToUse = selectFunctionAccordingToParams('model', params.get('model'))
    polictyToUse = selectFunctionAccordingToParams('policy',
                                                   params.get('policy'))

    # ====== ENVIRONMENT SETUP =======
    trainEnv = DummyVecEnv([
        lambda: BitcoinTradingEnv(train_df,
                                  reward_func=params.get('reward_strategy'),
                                  forecast_len=params.get('forecast_len'),
                                  confidence_interval=params.get(
                                      'confidence_interval'))
    ])

    testEnv = DummyVecEnv([
        lambda: BitcoinTradingEnv(test_df,
                                  reward_func=params.get('reward_strategy'),
                                  forecast_len=params.get('forecast_len'),
                                  confidence_interval=params.get(
                                      'confidence_interval'))
    ])

    boardDir = join(td, 'tensorboard')
    if not exists(boardDir):
        makedirs(boardDir)
Example #9
df = pd.read_csv('./data/coinbase_hourly.csv')
df = df.drop(['Symbol'], axis=1)
df = df.sort_values(['Date'])
df = add_indicators(df.reset_index())

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len

train_df = df[:train_len]
test_df = df[train_len:]

train_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(train_df,
                              reward_func="calmar",
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval']
                              )
])

test_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(test_df,
                              reward_func="calmar",
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval']
                              )
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
Example #10
df = pd.read_csv('./data/wdo_small.csv')
# df = df.drop(['Symbol'], axis=1)
df = df.sort_values(['Date'])
df = add_indicators(df.reset_index())

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len

test_df = df[train_len:]

profit_study = optuna.load_study(study_name='ppo2_profit',
                                 storage='sqlite:///params.db')
profit_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(
        test_df,
        reward_func="profit",
        forecast_len=int(profit_study.best_trial.params['forecast_len']),
        confidence_interval=profit_study.best_trial.params[
            'confidence_interval'])
])

sortino_study = optuna.load_study(study_name='ppo2_sortino',
                                  storage='sqlite:///params.db')
sortino_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(
        test_df,
        reward_func="profit",
        forecast_len=int(sortino_study.best_trial.params['forecast_len']),
        confidence_interval=sortino_study.best_trial.params[
            'confidence_interval'])
])
Example #11
    def train(self):
        if self.train_df is None:
            self.logger.info("Running built-in data preparation")
            self.prepare_data()
        else:
            self.logger.info("Using provided data (Length: %d)" %
                             len(self.train_df))

        study_name = 'ppo2_' + self.reward_strategy

        study = optuna.load_study(study_name=study_name,
                                  storage=self.params_db_file)
        params = study.best_trial.params

        train_env = DummyVecEnv([
            lambda: BitcoinTradingEnv(self.train_df,
                                      reward_func=self.reward_strategy,
                                      forecast_len=int(params['forecast_len']),
                                      confidence_interval=params[
                                          'confidence_interval'])
        ])

        test_env = DummyVecEnv([
            lambda: BitcoinTradingEnv(self.test_df,
                                      reward_func=self.reward_strategy,
                                      forecast_len=int(params['forecast_len']),
                                      confidence_interval=params[
                                          'confidence_interval'])
        ])

        model_params = self.model_params(params)

        model = PPO2(MlpLnLstmPolicy,
                     train_env,
                     verbose=0,
                     nminibatches=1,
                     tensorboard_log=os.path.join('.', 'tensorboard'),
                     **model_params)

        models_to_train = 1
        self.logger.info("Training {} model instances".format(models_to_train))

        # NOTE: unclear why more than one instance would be trained here
        for idx in range(models_to_train):
            self.logger.info('[%d] Training for: %d time steps', idx,
                             len(self.train_df))

            model.learn(total_timesteps=len(self.train_df))

            obs = test_env.reset()
            done, reward_sum = False, 0

            while not done:
                action, _states = model.predict(obs)
                obs, reward, done, info = test_env.step(action)
                reward_sum += reward

            self.logger.info('[%d] Total reward: %s (%s)', idx, reward_sum,
                             self.reward_strategy)
            model.save(
                os.path.join(
                    '.', 'agents',
                    'ppo2_' + self.reward_strategy + '_' + str(idx) + '.pkl'))

        self.logger.info("Trained {} model instances".format(models_to_train))
Example #12
params = study.best_trial.params

print("Testing PPO2 agent with params:", params)
print("Best trial:", study.best_trial.value)

df = pd.read_csv('./data/coinbase_hourly.csv')
df = df.drop(['Symbol'], axis=1)
df = df.sort_values(['Date'])
df = add_indicators(df.reset_index())

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len

test_df = df[train_len:]

test_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(test_df,
                              reward_func="sortino",
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval'])
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

model = PPO2.load('./agents/ppo2_sortino_3.pkl', env=test_env)

obs, done = test_env.reset(), False
while not done:
Example #13
import pandas as pd
import torch
from env.BitcoinTradingEnv import BitcoinTradingEnv
import numpy as np
import torch.optim as optim
from agent.ACER import ACER
from model.GRUFCN.models.RNN_FCN import MGRU_FCN as MODEL
from agent.ActorCritic import ActorCritic
from utils.ReplayMemory import ReplayBuffer
import gym
from torch.distributions.categorical import Categorical
import os

df = pd.read_csv('./data/1 Dec 2019 - 1 Dec 2020.csv')

test_env = BitcoinTradingEnv(df, serial=True)

print('observation space:', test_env.observation_space.shape)
print('action space:', test_env.action_space)

model = MODEL(c_in=test_env.observation_space.shape[0],
              c_out=test_env.action_space.n,
              seq_len=test_env.observation_space.shape[1])

#acer = ACER(model=model, memory=memory, config=acer_config)

if os.path.exists('./save/model.m5'):
    model.load_state_dict(torch.load('./save/model.m5'))

avg_t = 0
avg_r = 0
Example #14
def objective(trial):
    # Define what to optimize in environment
    envParams = {
        'reward_func': reward_strategy,
        'forecast_len': int(trial.suggest_loguniform('forecast_len', 1, 200)),
        'confidence_interval': trial.suggest_uniform('confidence_interval', 0.7, 0.99),
    }
    train_df, test_df = getDatasets(
        params.get('input_data_file'),
        percentageToUse=params.get('dataset_percentage'))
    trainEnv = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, **envParams)])
    testEnv = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, **envParams)])

    # Define what to optimize in agent
    agentParams = {
        'n_steps': int(trial.suggest_loguniform('n_steps', 16, 2048)),
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
        'cliprange': trial.suggest_uniform('cliprange', 0.1, 0.4),
        'noptepochs': int(trial.suggest_loguniform('noptepochs', 1, 48)),
        'lam': trial.suggest_uniform('lam', 0.8, 1.)
    }

    model = PPO2(MlpLnLstmPolicy,
                 trainEnv,
                 verbose=0,
                 nminibatches=1,
                 **agentParams)

    # Run optimizer
    last_reward = -np.finfo(np.float16).max
    evaluation_interval = int(len(train_df) / params.get('n_evaluations'))

    for eval_idx in range(params.get('n_evaluations')):
        try:
            model.learn(evaluation_interval)
        except AssertionError:
            raise

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        obs = testEnv.reset()
        while n_episodes < params.get('n_test_episodes'):
            action, _ = model.predict(obs)
            obs, reward, done, _ = testEnv.step(action)
            reward_sum += reward

            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = testEnv.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
Example #15
import pandas as pd

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import A2C

from env.BitcoinTradingEnv import BitcoinTradingEnv

df = pd.read_csv('./data/bitstamp.csv')
df = df.sort_values('Timestamp')

slice_point = int(len(df) - 50000)

train_df = df[:slice_point]
test_df = df[slice_point:]

train_env = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, serial=True)])

model = A2C(MlpPolicy, train_env, verbose=1, tensorboard_log="./tensorboard/")
model.learn(total_timesteps=200000)

test_env = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, serial=True)])

obs = test_env.reset()
for i in range(50000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = test_env.step(action)
    test_env.render(mode="system", title="BTC")

test_env.close()
Example #16
print("Training PPO2 agent with params:", params)
print("Best trial reward:", -1 * study.best_trial.value)

df = pd.read_csv(input_data_file)
df = df.drop(['Symbol'], axis=1)
df = df.sort_values(['Date'])
df = add_indicators(df.reset_index())

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len

train_df = df[:train_len]
test_df = df[train_len:]

train_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(train_df,
                              reward_func=reward_strategy,
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval'])
])

test_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(test_df,
                              reward_func=reward_strategy,
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval'])
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

if curr_idx == -1:
Example #17
df = df.sort_values(['Date'])
df = add_indicators(df.reset_index())

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len

train_df = df[:train_len]
test_df = df[train_len:]

# Enable multiprocess environment
n_cpu = 32

train_env = SubprocVecEnv([
    lambda: BitcoinTradingEnv(train_df,
                              reward_func=reward_strategy,
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval']
                              ) for i in range(n_cpu)
])

test_env = SubprocVecEnv([
    lambda: BitcoinTradingEnv(test_df,
                              reward_func=reward_strategy,
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval']
                              ) for i in range(n_cpu)
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],