def get_stock_data(start_date:str, end_date:str, stocks_tradable:List[str], tech_indicator_list:List[str]):
    """Download prices, add technical indicators and pad to a date/ticker grid.

    ``start_date`` and ``end_date`` must span the whole period, from the
    training window through validation to the test window.

    Returns:
        A tuple ``(frame, calendar_days)``. ``frame`` holds one row per
        (date, ticker) pair for every date present in the processed data,
        sorted by date then ticker, with missing values replaced by 0.
        ``calendar_days`` is the list of every calendar day (as strings)
        between the first and last processed date.
    """
    raw_prices = YahooDownloader(
        start_date=start_date,
        end_date=end_date,
        ticker_list=stocks_tradable,
    ).fetch_data()

    engineer = FeatureEngineer(
        use_technical_indicator=True,
        tech_indicator_list=tech_indicator_list,
        use_turbulence=False,
        user_defined_feature=False,
    )
    processed = engineer.preprocess_data(raw_prices)

    tickers = processed["tic"].unique().tolist()
    first_day = processed['date'].min()
    last_day = processed['date'].max()
    list_date = list(pd.date_range(first_day, last_day).astype(str))

    # Cartesian product of calendar days and tickers, left-joined onto the
    # processed rows so that absent (date, ticker) pairs show up as NaN.
    grid = pd.DataFrame(
        list(itertools.product(list_date, tickers)),
        columns=["date", "tic"],
    )
    padded = grid.merge(processed, on=["date", "tic"], how="left")

    # Keep only days that occur in the processed data, then order and fill.
    on_known_day = padded['date'].isin(processed['date'])
    padded = padded[on_known_day]
    padded = padded.sort_values(['date', 'tic'])
    padded = padded.fillna(0)
    return padded, list_date
def raw_data_preprocess(prp_data_path, df, beg_date, end_date, tech_id_list):
    """Return the feature-engineered frame, using a pickle at ``prp_data_path`` as cache.

    Args:
        prp_data_path: Path of the pickle cache. If it exists it is loaded and
            ``df`` is ignored; otherwise the result is computed and written there.
        df: Raw price frame handed to ``FeatureEngineer.preprocess_data``.
        beg_date: Inclusive lower bound applied to the ``date`` column.
        end_date: Exclusive upper bound applied to the ``date`` column.
        tech_id_list: Technical indicators passed to ``FeatureEngineer``.

    Returns:
        The processed DataFrame.

    Raises:
        AssertionError: If the frame's columns are not exactly the expected
            17 columns, in order.
    """
    if os.path.exists(prp_data_path):
        import pandas as pd

        # NOTE: unpickling executes arbitrary code; only point this at cache
        # files this project wrote itself.
        df = pd.read_pickle(prp_data_path)  # DataFrame of Pandas
    else:
        from finrl.preprocessing.preprocessors import FeatureEngineer

        fe = FeatureEngineer(
            use_technical_indicator=True,
            tech_indicator_list=tech_id_list,
            use_turbulence=True,
            user_defined_feature=False,
        )
        df = fe.preprocess_data(df)  # preprocess raw_df

        df = df[(df.date >= beg_date) & (df.date < end_date)]
        df = df.sort_values(["date", "tic"], ignore_index=True)
        # Rows that share a date share an index value (0, 1, 2, ... per date).
        df.index = df.date.factorize()[0]

        df.to_pickle(prp_data_path)

    print('| df.columns.values:', df.columns.values)

    expected_columns = [
        'date', 'open', 'high', 'low', 'close', 'volume', 'tic', 'day',
        'macd', 'boll_ub', 'boll_lb', 'rsi_30', 'cci_30', 'dx_30',
        'close_30_sma', 'close_60_sma', 'turbulence',
    ]
    # Compare as plain lists: an element-wise array comparison does not yield
    # a clean failure when the number of columns differs.
    actual_columns = list(df.columns.values)
    if actual_columns != expected_columns:
        raise AssertionError(
            f"unexpected columns {actual_columns}; expected {expected_columns}"
        )
    return df
def get_feature_engineered_df(df):
    """Run the configured feature engineering on a copy of ``df``.

    The engineer receives ``df.copy()`` and is configured with the technical
    indicators from ``config.TECHNICAL_INDICATORS_LIST`` and with turbulence
    enabled. Returns whatever ``preprocess_data()`` produces.
    """
    snapshot = df.copy()
    engineer = FeatureEngineer(
        snapshot,
        use_technical_indicator=True,
        tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
        use_turbulence=True,
        user_defined_feature=False,
    )
    return engineer.preprocess_data()
def get_yahoo_data(start, end):
    """Download Dow 30 prices, keep tickers quoted on every date, add features.

    Args:
        start: Start date passed to ``YahooDownloader``.
        end: End date passed to ``YahooDownloader``.

    Returns:
        The feature-engineered DataFrame sorted by ``date`` then ``tic`` with
        a fresh 0..n-1 index. Rows of tickers that are missing on at least one
        downloaded date are dropped before feature engineering.
    """
    df = YahooDownloader(start_date=start,
                         end_date=end,
                         ticker_list=config.DOW_30_TICKER).fetch_data()
    # sort_values returns a new frame; keep it.
    df = df.sort_values(['date', 'tic'], ignore_index=True)

    # Intersection of the ticker sets of all dates. A real set intersection
    # stays empty once it becomes empty instead of starting over.
    common = None
    for _, group in df.groupby('date'):
        quoted_today = set(group.tic.unique())
        common = quoted_today if common is None else common & quoted_today
    common = sorted(common) if common else []

    # Rebuild the frame date by date, restricted to the common tickers.
    frames = []
    for _, group in df.groupby('date'):
        kept = group[group.tic.isin(common)]
        kept = kept.sort_values(['date', 'tic'], ignore_index=True)
        # Each date must contribute exactly one row per common ticker.
        assert len(kept) == len(common)
        frames.append(kept)
    if frames:
        # DataFrame.append no longer exists in pandas 2.x; concat once.
        df = pd.concat(frames)
    else:
        df = pd.DataFrame(columns=list(df.columns))

    fe = FeatureEngineer(use_technical_indicator=True,
                         tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
                         use_turbulence=True,
                         user_defined_feature=False)
    processed = fe.preprocess_data(df)
    return processed.sort_values(['date', 'tic'], ignore_index=True)
def load_stock_trading_data():
    """Load (or build and cache) the Dow 30 data and split it into train/eval.

    Raw and processed frames are cached as pickles under ``./env/FinRL``; each
    stage is recomputed only when its cache file is missing.

    Returns:
        ``(train_df, eval_df)``: the ``data_split`` results for
        '2008-03-19'..'2016-01-01' and '2016-01-01'..'2021-01-01'.
    """
    from finrl.config import config

    cwd = './env/FinRL'
    raw_data_path = cwd + '/StockTradingEnv_raw_data.df'
    processed_data_path = cwd + '/StockTradingEnv_processed_data.df'
    os.makedirs(cwd, exist_ok=True)

    print("==============Start Fetching Data===========")
    if not os.path.exists(raw_data_path):
        from finrl.marketdata.yahoodownloader import YahooDownloader

        downloader = YahooDownloader(
            start_date=config.START_DATE,
            end_date=config.END_DATE,
            ticker_list=config.DOW_30_TICKER,
        )
        raw_df = downloader.fetch_data()
        raw_df.to_pickle(raw_data_path)
    else:
        raw_df = pd.read_pickle(raw_data_path)  # DataFrame of Pandas
        print('| raw_df.columns.values:', raw_df.columns.values)

    print("==============Start Feature Engineering===========")
    if not os.path.exists(processed_data_path):
        from finrl.preprocessing.preprocessors import FeatureEngineer

        engineer = FeatureEngineer(
            use_technical_indicator=True,
            tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
            use_turbulence=True,
            user_defined_feature=False,
        )
        processed_df = engineer.preprocess_data(raw_df)
        processed_df.to_pickle(processed_data_path)
    else:
        processed_df = pd.read_pickle(processed_data_path)  # DataFrame of Pandas
        print('| processed_df.columns.values:', processed_df.columns.values)

    # Training & Trading data split
    from finrl.preprocessing.data import data_split

    train_df = data_split(processed_df, '2008-03-19', '2016-01-01')
    eval_df = data_split(processed_df, '2016-01-01', '2021-01-01')
    return train_df, eval_df
def main():
    """Command-line entry point: train an agent or download and preprocess data.

    ``--mode train`` delegates to ``finrl.autotrain.training.train_one``.
    ``--mode download_data`` downloads the price history of every ticker
    listed in ``data/tom_dow_done_data.csv``, runs feature engineering and
    writes ``dow30_<timestamp>.csv`` into ``config.DATA_SAVE_DIR``.
    Any other mode does nothing.
    """
    initialize()
    parser = build_parser()
    options = parser.parse_args()

    if options.mode == "train":
        import finrl.autotrain.training

        finrl.autotrain.training.train_one()

    elif options.mode == "download_data":
        print('Download Data Begin')

        # The ticker universe comes from the local CSV.
        dftmp = pd.read_csv('data/tom_dow_done_data.csv', index_col=0)
        dow_30 = dftmp.tic.unique()

        # One frame per ticker, stacked with a single concat.
        frames = [si.get_data(ticker) for ticker in dow_30]
        df = pd.concat(frames)
        df = df.reset_index()
        df = df.rename(columns={'index': 'date', 'ticker': 'tic'})

        fe = FeatureEngineer(use_technical_indicator=True,
                             use_turbulence=False,
                             user_defined_feature=False)
        df = fe.preprocess_data(df)

        now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        df.to_csv(config.DATA_SAVE_DIR + "/" + "dow30_" + now + ".csv",
                  index=False)
        print('Download Complete')
def prepare_data() -> (dict, pd.DataFrame):
    """Return the environment keyword arguments and the processed data frame.

    The processed frame is taken from ``load_from_cache()`` when available;
    otherwise it is downloaded for ``config.CURRENT_TICKER``, feature
    engineered (turbulence disabled) and handed to ``save``.
    """
    processed = load_from_cache()
    if processed is None:
        print("==============Start Fetching Data===========")
        downloader = YahooDownloader(
            start_date=config.START_DATE,
            end_date=config.END_DATE,
            ticker_list=config.CURRENT_TICKER,
        )
        raw = downloader.fetch_data()

        print("==============Start Feature Engineering===========")
        engineer = FeatureEngineer(
            use_technical_indicator=True,
            tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
            use_turbulence=False,
            user_defined_feature=False,
        )
        processed = engineer.preprocess_data(raw)
        save(processed)

    # calculate state action space
    n_stocks = len(processed.tic.unique())
    n_indicator_slots = len(config.TECHNICAL_INDICATORS_LIST) * n_stocks * 2
    state_space = 2 * n_stocks + n_indicator_slots

    env_kwargs = dict(
        hmax=100,
        initial_amount=1000000,
        buy_cost_pct=0.01,
        sell_cost_pct=0.01,
        state_space=state_space,
        stock_dim=n_stocks,
        tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
        action_space=n_stocks,
        reward_scaling=1e-4,
    )
    return env_kwargs, processed
print(tech_indicator_list)

# <a id='3.2'></a>
# ## 4.2 Perform Feature Engineering

# In[14]:

# Add the technical indicators listed above; turbulence and user-defined
# features are switched off.
fe = FeatureEngineer(
    use_technical_indicator=True,
    tech_indicator_list=tech_indicator_list,
    use_turbulence=False,
    user_defined_feature=False,
)
data_df = fe.preprocess_data(data_df)

# In[15]:

data_df.head()

# <a id='4'></a>
# # Part 5. Build Environment
# Considering the stochastic and interactive nature of automated stock
# trading, a financial task is modeled as a **Markov Decision Process (MDP)**
# problem. Training consists of observing the stock price change, taking an
# action, calculating the reward, and letting the agent adjust its strategy
# accordingly. By interacting with the environment, the trading agent derives
# a trading strategy that maximizes rewards over time.
#
# Our trading environments, based on the OpenAI Gym framework, simulate live
# stock markets with real market data according to the principle of
# time-driven simulation.
#
# The action space describes the actions through which the agent interacts
# with the environment. Normally, an action a takes one of three values:
# {-1, 0, 1}, where -1, 0, 1 represent selling, holding, and buying one share.
# An action can also be carried out on multiple shares. We use an action space
# {-k,…,-1, 0, 1, …, k}, where k denotes the number of shares to buy and -k
# denotes the number of shares to sell. For example, "Buy 10 shares of AAPL"
# or "Sell 10 shares of AAPL" are 10 or -10, respectively. The continuous
# action space needs to be normalized to [-1, 1], since the policy is defined
# on a Gaussian distribution, which needs to be normalized and symmetric.
def train_one():
    """Train a SAC agent on the Dow 30, trade the hold-out window and save results.

    Steps: download prices for ``config.DOW_30_TICKER``, add technical
    indicators and turbulence, split at ``config.START_TRADE_DATE``, train SAC
    for 80000 timesteps, run the prediction on the trade split, and write the
    account values, actions and performance statistics as time-stamped CSV
    files under ``config.RESULTS_DIR``.
    """
    print("==============Start Fetching Data===========")
    downloader = YahooDownloader(
        start_date=config.START_DATE,
        end_date=config.END_DATE,
        ticker_list=config.DOW_30_TICKER,
    )
    df = downloader.fetch_data()

    print("==============Start Feature Engineering===========")
    engineer = FeatureEngineer(
        use_technical_indicator=True,
        tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
        use_turbulence=True,
        user_defined_feature=False,
    )
    processed = engineer.preprocess_data(df)

    # Training & Trading data split
    train = data_split(processed, config.START_DATE, config.START_TRADE_DATE)
    trade = data_split(processed, config.START_TRADE_DATE, config.END_DATE)

    # calculate state action space
    n_stocks = len(train.tic.unique())
    n_indicators = len(config.TECHNICAL_INDICATORS_LIST)
    state_space = 1 + 2 * n_stocks + n_indicators * n_stocks

    env_kwargs = dict(
        hmax=100,
        initial_amount=1000000,
        buy_cost_pct=0.001,
        sell_cost_pct=0.001,
        state_space=state_space,
        stock_dim=n_stocks,
        tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
        action_space=n_stocks,
        reward_scaling=1e-4,
    )

    e_train_gym = StockTradingEnv(df=train, **env_kwargs)
    e_trade_gym = StockTradingEnv(df=trade, turbulence_threshold=250, **env_kwargs)
    env_train, _ = e_train_gym.get_sb_env()
    env_trade, obs_trade = e_trade_gym.get_sb_env()

    agent = DRLAgent(env=env_train)

    print("==============Model Training===========")
    # The timestamp is taken before training so all output files share it.
    now = datetime.datetime.now().strftime("%Y%m%d-%Hh%M")

    model_sac = agent.get_model("sac")
    trained_sac = agent.train_model(
        model=model_sac,
        tb_log_name="sac",
        total_timesteps=80000,
    )

    print("==============Start Trading===========")
    df_account_value, df_actions = DRLAgent.DRL_prediction(
        model=trained_sac,
        test_data=trade,
        test_env=env_trade,
        test_obs=obs_trade,
    )
    results_prefix = "./" + config.RESULTS_DIR
    file_suffix = now + ".csv"
    df_account_value.to_csv(results_prefix + "/df_account_value_" + file_suffix)
    df_actions.to_csv(results_prefix + "/df_actions_" + file_suffix)

    print("==============Get Backtest Results===========")
    perf_stats_all = pd.DataFrame(BackTestStats(df_account_value))
    perf_stats_all.to_csv(results_prefix + "/perf_stats_all_" + file_suffix)
def _build_and_cache_dataset(stock_tickers, data_path):
    """Download 2000-2021 prices for ``stock_tickers``, add indicators, write a CSV."""
    print('Getting Data: ')
    df = YahooDownloader(start_date='2000-01-01',
                         end_date='2021-01-01',
                         ticker_list=stock_tickers).fetch_data()
    fe = FeatureEngineer(
        use_technical_indicator=True,
        tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
        use_turbulence=True,
        user_defined_feature=False)
    print('Adding Indicators')
    processed = fe.preprocess_data(df)
    # NOTE(review): the frame written here is the engineer's output as-is,
    # i.e. without padding missing (date, ticker) pairs with zeros. Confirm
    # that this is the dataset downstream consumers expect.
    processed.to_csv(data_path, index=False)


def get_dataset(datadir, data_type, start_date, end_date):
    """Load the ``data_type`` dataset from ``datadir`` and cut it to a date range.

    For 'dow29' and 'nas29' a missing ``<data_type>.csv`` is downloaded from
    Yahoo Finance, feature engineered and cached first. Other supported types
    must already be present in ``datadir``.

    Args:
        datadir: Directory that holds ``<data_type>.csv``.
        data_type: One of ``config.SUPPORTED_DATA``.
        start_date: Start date handed to ``data_split``.
        end_date: End date handed to ``data_split``.

    Returns:
        The result of ``data_split(full_df, start_date, end_date)``.

    Raises:
        ValueError: If ``data_type`` is unsupported, or its CSV is missing and
            cannot be generated automatically.
    """
    if data_type not in config.SUPPORTED_DATA:
        raise ValueError('Market type not supported')

    data_path = os.path.join(datadir, data_type + '.csv')

    if not os.path.exists(data_path):
        # If we don't have the data, dow/nasdaq data can be downloaded.
        if data_type == 'dow29':
            _build_and_cache_dataset(config.DOW_30_TICKER_MINUS_VISA, data_path)
        elif data_type == 'nas29':
            _build_and_cache_dataset(config.NAS_29_TICKER, data_path)
        elif data_type == 'dow290':
            raise ValueError('Need to add dow29 with zeros crossings to data directory')
        elif data_type == 'dow29w0':
            raise ValueError('Need to add dow29 with OUT zeros crossings to data directory')
        else:
            raise ValueError('Need to add crypto data to data directory')

    # Load and subset data
    full_df = pd.read_csv(data_path)
    max_date = max(full_df['date'])
    min_date = min(full_df['date'])

    # Dates are compared as read from the CSV; a request outside the
    # available range only triggers a warning.
    if not (min_date <= start_date):
        warnings.warn('Earliest possible start date is {}: You have chosen {}. The later date will be used'.format(min_date, start_date))
    if not (max_date >= end_date):
        warnings.warn('Latest possible end date is {}: You have chosen {}. The earlier date will be used'.format(max_date, end_date))

    return data_split(full_df, start_date, end_date)
def train_one():
    """Train (or load) a SAC agent for FXAIX, trade the hold-out window and plot it.

    The user is asked on stdin whether to train; answering ``1`` trains a new
    model for 8000 timesteps and saves it, anything else loads the model at
    '../models/sac_80k_msft_working'. Account values, actions and performance
    statistics are written as CSV files under ``../<config.RESULTS_DIR>`` and
    a two-panel figure is saved to its ``plots`` sub-directory.
    """
    print("==============Start Fetching Data===========")
    df = YahooDownloader(
        start_date=config.START_DATE,
        end_date=config.END_DATE,
        ticker_list=['FXAIX'],
    ).fetch_data()
    # Ticker label used in every output file name and plot title.
    ticker = df.tic[0]

    print("==============Start Feature Engineering===========")
    fe = FeatureEngineer(
        use_technical_indicator=True,
        tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
        use_turbulence=True,
        user_defined_feature=False,
    )
    processed = fe.preprocess_data(df)

    # Training & Trading data split
    train = data_split(processed, config.START_DATE, config.START_TRADE_DATE)
    trade = data_split(processed, config.START_TRADE_DATE, config.END_DATE)

    # calculate state action space
    stock_dimension = len(train.tic.unique())
    state_space = (1 + 2 * stock_dimension +
                   len(config.TECHNICAL_INDICATORS_LIST) * stock_dimension)

    env_kwargs = {
        "hmax": 100,
        "initial_amount": 1000000,
        "buy_cost_pct": 0.001,
        "sell_cost_pct": 0.001,
        "state_space": state_space,
        "stock_dim": stock_dimension,
        "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST,
        "action_space": stock_dimension,
        "reward_scaling": 1e-4
    }

    e_train_gym = StockTradingEnv(df=train, **env_kwargs)
    e_trade_gym = StockTradingEnv(df=trade, turbulence_threshold=250, **env_kwargs)
    env_train, _ = e_train_gym.get_sb_env()
    env_trade, obs_trade = e_trade_gym.get_sb_env()

    agent = DRLAgent(env=env_train)

    print("==============Model Training===========")
    now = datetime.datetime.now().strftime("%Y%m%d-%Hh%M")

    user_input = input("train model? 1 train 0 don't train")
    # input() returns a string, so compare against the text "1".
    if user_input.strip() == "1":
        model_sac = agent.get_model("sac")
        trained_sac = agent.train_model(model=model_sac,
                                        tb_log_name="sac",
                                        total_timesteps=8000)
        trained_sac.save("../models/sac_8k" + ticker + "_frl")
    else:
        trained_sac = SAC.load('../models/sac_80k_msft_working')

    print("==============Start Trading===========")
    df_account_value, df_actions = DRLAgent.DRL_prediction(
        trained_sac, e_trade_gym)
    results_dir = "../" + config.RESULTS_DIR
    df_account_value.to_csv(results_dir + "/SAC_df_account_value_" +
                            ticker + "_" + now + ".csv")
    df_actions.to_csv(results_dir + "/SAC_df_actions_" + ticker + "_" +
                      now + ".csv")

    # print("==============Get Backtest Results===========")
    perf_stats_all = backtest_stats(df_account_value)
    perf_stats_all = pd.DataFrame(perf_stats_all)
    perf_stats_all.to_csv(results_dir + "/SAC_perf_stats_all_" + ticker +
                          "_" + now + ".csv")

    # plot acc value: the account-value curve is drawn as line segments that
    # are colored by the action value attached to them.
    actions = df_actions['actions']
    x = np.arange(0, df_account_value['account_value'].shape[0])
    y = df_account_value['account_value']

    points = np.array([x, y]).T.reshape(-1, 1, 2)
    segments = np.concatenate([points[:-1], points[1:]], axis=1)

    fig, axs = plt.subplots(2, 1, sharex=True, sharey=False)

    # Use a boundary norm: actions below -0.1 map to red, between -0.1 and
    # 0.1 to green, above 0.1 to blue.
    cmap = ListedColormap(['r', 'g', 'b'])
    norm = BoundaryNorm([-100, -0.1, 0.1, 100], cmap.N)
    lc = LineCollection(segments, cmap=cmap, norm=norm)
    lc.set_array(actions)
    lc.set_linewidth(2)
    axs[0].add_collection(lc)
    axs[1].set_xlabel('Trading Day (' + 'From ' + config.START_TRADE_DATE +
                      " to " + config.END_DATE + ')')
    axs[0].set_ylabel('Account Value (10000 of USD)')
    axs[0].set_title("Trading Test on " + ticker)

    axs[0].set_xlim(x.min(), x.max())
    axs[0].set_ylim(y.min(), y.max())

    custom_lines = [
        Line2D([0], [0], color=cmap(0.), lw=4),
        Line2D([0], [0], color=cmap(.5), lw=4),
        Line2D([0], [0], color=cmap(1.), lw=4)
    ]
    axs[0].legend(custom_lines, ['Sell', 'Hold', 'Buy'])

    # plot stock value (drawn through pyplot's current axes)
    tx = np.arange(0, df_account_value['account_value'].shape[0])
    ty = trade['close']
    plt.ylabel('Price (USD)')
    plt.title(ticker + " Closing Price")
    plt.plot(tx, ty)

    plt.savefig(results_dir + "/plots/"
                "SAC_plot_" + ticker + "_" + now + ".png")
def train_one():
    """Train a SAC agent on zero-padded Dow 30 data and back-test it.

    The processed data is expanded to one row per (date, ticker) pair, with
    missing values set to 0, before it is split at
    ``config.START_TRADE_DATE``. SAC is trained for 80000 timesteps; account
    values, actions and performance statistics are written as time-stamped
    CSV files under ``config.RESULTS_DIR``.
    """
    print("==============Start Fetching Data===========")
    downloader = YahooDownloader(
        start_date=config.START_DATE,
        end_date=config.END_DATE,
        ticker_list=config.DOW_30_TICKER,
    )
    df = downloader.fetch_data()

    print("==============Start Feature Engineering===========")
    engineer = FeatureEngineer(
        use_technical_indicator=True,
        tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
        use_turbulence=True,
        user_defined_feature=False,
    )
    processed = engineer.preprocess_data(df)

    # Build the full calendar-day x ticker grid and left-join the data on it.
    tickers = processed["tic"].unique().tolist()
    first_day = processed['date'].min()
    last_day = processed['date'].max()
    calendar_days = list(pd.date_range(first_day, last_day).astype(str))
    grid = pd.DataFrame(
        list(itertools.product(calendar_days, tickers)),
        columns=["date", "tic"],
    )
    processed_full = grid.merge(processed, on=["date", "tic"], how="left")

    # Drop days that never occur in the processed data, then order and fill.
    known_day = processed_full['date'].isin(processed['date'])
    processed_full = processed_full[known_day]
    processed_full = processed_full.sort_values(['date', 'tic'])
    processed_full = processed_full.fillna(0)

    # Training & Trading data split
    train = data_split(processed_full, config.START_DATE,
                       config.START_TRADE_DATE)
    trade = data_split(processed_full, config.START_TRADE_DATE,
                       config.END_DATE)

    # calculate state action space
    n_stocks = len(train.tic.unique())
    n_indicators = len(config.TECHNICAL_INDICATORS_LIST)
    state_space = 1 + 2 * n_stocks + n_indicators * n_stocks

    env_kwargs = dict(
        hmax=100,
        initial_amount=1000000,
        buy_cost_pct=0.001,
        sell_cost_pct=0.001,
        state_space=state_space,
        stock_dim=n_stocks,
        tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
        action_space=n_stocks,
        reward_scaling=1e-4,
    )

    e_train_gym = StockTradingEnv(df=train, **env_kwargs)
    env_train, _ = e_train_gym.get_sb_env()

    agent = DRLAgent(env=env_train)

    print("==============Model Training===========")
    # Taken before training so that every output file shares one timestamp.
    now = datetime.datetime.now().strftime("%Y%m%d-%Hh%M")

    model_sac = agent.get_model("sac")
    trained_sac = agent.train_model(
        model=model_sac,
        tb_log_name="sac",
        total_timesteps=80000,
    )

    print("==============Start Trading===========")
    e_trade_gym = StockTradingEnv(df=trade,
                                  turbulence_threshold=250,
                                  **env_kwargs)
    df_account_value, df_actions = DRLAgent.DRL_prediction(
        model=trained_sac, environment=e_trade_gym)

    results_prefix = "./" + config.RESULTS_DIR
    file_suffix = now + ".csv"
    df_account_value.to_csv(results_prefix + "/df_account_value_" + file_suffix)
    df_actions.to_csv(results_prefix + "/df_actions_" + file_suffix)

    print("==============Get Backtest Results===========")
    perf_stats_all = pd.DataFrame(backtest_stats(df_account_value))
    perf_stats_all.to_csv(results_prefix + "/perf_stats_all_" + file_suffix)
def train_one(fetch=False):
    """Train a SAC agent on the crypto data set, trade the hold-out window, report.

    Args:
        fetch: When true the data is obtained with ``fetch_and_store()``,
            otherwise it is read with ``load()``.

    Writes account values, actions, transactions, positions and performance
    statistics as CSV files under ``config.RESULTS_DIR`` and finally calls
    ``backtest_plot``.

    Raises:
        AssertionError: If the dates do not all carry the same number of
            ticker rows.
    """
    if fetch:
        df = fetch_and_store()
    else:
        df = load()

    # Every date must have the same number of ticker rows.
    counts = df[['date', 'tic']].groupby(['date']).count().tic
    assert counts.min() == counts.max()

    print("==============Start Feature Engineering===========")
    fe = FeatureEngineer(
        use_technical_indicator=True,
        tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
        use_turbulence=True,
        user_defined_feature=False,
    )
    processed = fe.preprocess_data(df)

    # Training & Trading data split
    start_date, trade_date, end_date = calculate_split(df, start=config.START_DATE)
    print(start_date, trade_date, end_date)
    train = data_split(processed, start_date, trade_date)
    trade = data_split(processed, trade_date, end_date)

    print(
        f'\n******\nRunning from {start_date} to {end_date} for:\n{", ".join(config.CRYPTO_TICKER)}\n******\n'
    )

    # calculate state action space
    stock_dimension = len(train.tic.unique())
    state_space = (1 + (2 * stock_dimension) +
                   (len(config.TECHNICAL_INDICATORS_LIST) * stock_dimension))

    env_kwargs = {
        "hmax": 100,
        "initial_amount": 100000,
        "buy_cost_pct": 0.0026,
        "sell_cost_pct": 0.0026,
        "state_space": state_space,
        "stock_dim": stock_dimension,
        "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST,
        "action_space": stock_dimension,
        "reward_scaling": 1e-4
    }

    e_train_gym = StockTradingEnv(df=train, **env_kwargs)
    e_trade_gym = StockTradingEnv(df=trade,
                                  turbulence_threshold=250,
                                  make_plots=True,
                                  **env_kwargs)
    env_train, _ = e_train_gym.get_sb_env()
    env_trade, obs_trade = e_trade_gym.get_sb_env()

    agent = DRLAgent(env=env_train)

    print("==============Model Training===========")
    now = datetime.datetime.now().strftime(config.DATETIME_FMT)

    model_sac = agent.get_model("sac")
    trained_sac = agent.train_model(
        model=model_sac,
        tb_log_name="sac",
        total_timesteps=80000)

    print("==============Start Trading===========")
    df_account_value, df_actions = DRLAgent.DRL_prediction(
        trained_sac, e_trade_gym)
    df_account_value.to_csv(
        f"./{config.RESULTS_DIR}/df_account_value_{now}.csv")
    df_actions.to_csv(f"./{config.RESULTS_DIR}/df_actions_{now}.csv")

    df_txns = pd.DataFrame(e_trade_gym.transactions,
                           columns=['date', 'amount', 'price', 'symbol'])
    df_txns = df_txns.set_index(pd.DatetimeIndex(df_txns['date'], tz=pytz.utc))
    df_txns.to_csv(f'./{config.RESULTS_DIR}/df_txns_{now}.csv')

    df_positions = pd.DataFrame(e_trade_gym.positions,
                                columns=['date', 'cash'] + config.CRYPTO_TICKER)
    df_positions = df_positions.set_index(
        pd.DatetimeIndex(df_positions['date'],
                         tz=pytz.utc)).drop(columns=['date'])
    # Cast every remaining column (cash and one per ticker) to float64. The
    # cast frame has to replace the whole frame; it cannot be stored in a
    # single column.
    df_positions = df_positions.astype(np.float64)
    df_positions.to_csv(f'./{config.RESULTS_DIR}/df_positions_{now}.csv')

    print("==============Get Backtest Results===========")
    perf_stats_all = backtest_stats(df_account_value,
                                    transactions=df_txns,
                                    positions=df_positions)
    perf_stats_all = pd.DataFrame(perf_stats_all)
    perf_stats_all.to_csv(f"./{config.RESULTS_DIR}/perf_stats_all_{now}.csv")

    backtest_plot(df_account_value,
                  baseline_start=trade_date,
                  baseline_end=end_date,
                  positions=df_positions,
                  transactions=df_txns)
def generate_data(rollouts, data_dir, noise_type): # pylint: disable=R0914
    """Generate random-policy rollouts on the AAPL trading environment.

    Args:
        rollouts: Number of rollouts to record.
        data_dir: Existing directory that receives ``rollout_<i>.npz`` files.
        noise_type: ``'white'`` samples every action independently from the
            action space; ``'brown'`` uses ``sample_continuous_policy``.

    Each saved file holds the arrays ``observations``, ``rewards``,
    ``actions`` (the full pre-sampled action sequence) and ``terminals``.

    Raises:
        AssertionError: If ``data_dir`` does not exist.
        ValueError: If ``noise_type`` is neither ``'white'`` nor ``'brown'``.
    """
    assert exists(data_dir), "The data directory does not exist..."
    # Reject an unknown noise type before any download or training-env work.
    if noise_type not in ('white', 'brown'):
        raise ValueError(
            "noise_type must be 'white' or 'brown', got {!r}".format(noise_type))

    df = YahooDownloader(start_date='2009-01-01',
                         end_date='2021-01-01',
                         ticker_list=['AAPL']).fetch_data()
    # sort_values returns a new frame; keep it.
    df = df.sort_values(['date', 'tic'], ignore_index=True)

    fe = FeatureEngineer(
        use_technical_indicator=True,
        tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
        use_turbulence=True,
        user_defined_feature=False)
    processed = fe.preprocess_data(df)

    # Pad to one row per (date, ticker) pair on every date present in the
    # processed data; missing values become 0.
    list_ticker = processed["tic"].unique().tolist()
    list_date = list(pd.date_range(processed['date'].min(),
                                   processed['date'].max()).astype(str))
    combination = list(itertools.product(list_date, list_ticker))
    processed_full = pd.DataFrame(combination, columns=["date", "tic"]).merge(
        processed, on=["date", "tic"], how="left")
    processed_full = processed_full[processed_full['date'].isin(processed['date'])]
    processed_full = processed_full.fillna(0)
    processed_full = processed_full.sort_values(['date', 'tic'], ignore_index=True)

    train = data_split(processed_full, '2009-01-01', '2019-01-01')

    stock_dimension = len(train.tic.unique())
    state_space = (1 + 2 * stock_dimension +
                   len(config.TECHNICAL_INDICATORS_LIST) * stock_dimension)

    env_kwargs = {
        "hmax": 100,
        "initial_amount": 1000000,
        "transaction_cost_pct": 0.001,
        "state_space": state_space,
        "stock_dim": stock_dimension,
        "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST,
        "action_space": stock_dimension,
        "reward_scaling": 1e-4
    }

    e_train_gym = StockTradingEnv(df=train, **env_kwargs)
    env, _ = e_train_gym.get_sb_env()

    seq_len = 10000

    for i in range(rollouts):
        env.reset()
        # The whole action sequence is sampled up front.
        if noise_type == 'white':
            a_rollout = [env.action_space.sample() for _ in range(seq_len)]
        else:  # 'brown'
            a_rollout = sample_continuous_policy(env.action_space, seq_len, 1. / 50)

        s_rollout = []
        r_rollout = []
        d_rollout = []

        t = 0
        while True:
            action = a_rollout[t]
            t += 1

            s, r, done, _ = env.step(action)
            s_rollout += [s]
            r_rollout += [r]
            d_rollout += [done]
            if done:
                print("> End of rollout {}, {} frames...".format(i, len(s_rollout)))
                np.savez(join(data_dir, 'rollout_{}'.format(i)),
                         observations=np.array(s_rollout),
                         rewards=np.array(r_rollout),
                         actions=np.array(a_rollout),
                         terminals=np.array(d_rollout))
                break
def main():
    """Build the training and testing pickles from a saved CSV data set.

    Reads ``<config.DATA_SAVE_DIR>/<options.name>.csv``, adds the configured
    technical indicators and turbulence, splits at
    ``config.START_TRADE_DATE`` and pickles the two splits to ``training.txt``
    and ``testing.txt`` in ``config.DATA_SAVE_DIR``. Existing output files are
    removed first.
    """
    options = build_parser().parse_args()

    # Basic setup
    # Disable warnings
    warnings.filterwarnings('ignore')

    save_dir = "./" + config.DATA_SAVE_DIR

    # Load the saved data in a pandas DataFrame:
    data_frame = pd.read_csv(save_dir + "/" + options.name + ".csv")

    print("Data Frame shape is: ", data_frame.shape)
    print("Data Frame format is following: \n\n", data_frame.head())

    ## the stockstats technical indicator column names live in config.py
    tech_indicator_list = config.TECHNICAL_INDICATORS_LIST
    print("Technical Indicators that are going to be calculated: ",
          tech_indicator_list)

    feature_engineering = FeatureEngineer(
        use_technical_indicator=True,
        tech_indicator_list=tech_indicator_list,
        use_turbulence=True,
        user_defined_feature=False,
    )
    processed = feature_engineering.preprocess_data(data_frame)

    preview = processed.sort_values(['date', 'tic'], ignore_index=True)
    print(preview.head(10))

    training_set = data_split(processed, config.START_DATE,
                              config.START_TRADE_DATE)
    testing_set = data_split(processed, config.START_TRADE_DATE,
                             config.END_DATE)

    print("Size of training set: ", len(training_set))
    print("Size of testing set: ", len(testing_set))

    print("Training set format:\n\n", training_set.head())
    print("Testing set format: \n\n", testing_set.head())

    stock_dimension = len(training_set.tic.unique())
    state_space = (1 + 2 * stock_dimension +
                   len(tech_indicator_list) * stock_dimension)
    print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")

    ##
    ## Save data to file, both training and trading
    ##
    path_training = save_dir + "/training.txt"
    path_testing = save_dir + "/testing.txt"

    stale_files = (
        (path_training,
         "The training data file deleted",
         "The training data file does not exist"),
        (path_testing,
         "The testing data file deleted",
         "The testing data file does not exist"),
    )
    for path, deleted_message, missing_message in stale_files:
        if os.path.exists(path):
            os.remove(path)
            print(deleted_message)
        else:
            print(missing_message)

    for path, dataset in ((path_training, training_set),
                          (path_testing, testing_set)):
        with open(path, "wb") as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)

    print(
        "Successfuly completed the task of creation of test and training data files."
    )
def get_initial_data(numerical_df, sentiment_df, use_turbulence=False):
    """Feature-engineer the numerical data and attach the sentiment columns.

    Args:
        numerical_df: Price frame handed to ``FeatureEngineer.preprocess_data``.
        sentiment_df: Frame left-joined on the ``date`` and ``tic`` columns.
        use_turbulence: Forwarded to ``FeatureEngineer``.

    Returns:
        The merged DataFrame with every missing value replaced by 0, so
        (date, tic) pairs without a sentiment row carry zeros.
    """
    fe = FeatureEngineer(use_turbulence=use_turbulence)
    numerical_df = fe.preprocess_data(numerical_df)
    df = numerical_df.merge(sentiment_df, on=["date", "tic"], how="left")
    # fillna returns a new frame; return that one.
    return df.fillna(0)
# Download the Dow 30 price history for the configured date range.
df = YahooDownloader(
    start_date=config.START_DATE,
    end_date=config.END_DATE,
    ticker_list=config.DOW_30_TICKER,
).fetch_data()

# # Part 4: Preprocess Data

# In[9]:

fe = FeatureEngineer(
    use_technical_indicator=True,
    tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
    use_turbulence=True,
    user_defined_feature=False,
)
processed = fe.preprocess_data(df)

# %% Show turbulence
# If this errors, open the VSCode Settings (JSON) and set
# "terminal.integrated.inheritEnv" to true.
import matplotlib.pyplot as plt
import pandas as pd

# Plot turbulence against the close price for JPM only. Both names are bound
# to the same filtered frame, so `df` no longer holds the raw download.
plotdf = processed[processed['tic'] == 'JPM']
df = plotdf
df.plot(x="date", y=["turbulence", "close"])
plt.show()

# In[10]:

# Derived columns: log of (volume * close), and the open-to-close move
# relative to the close.
processed['log_volume'] = np.log(processed.volume * processed.close)
processed['change'] = (processed.close - processed.open) / processed.close
date = [base + timedelta(days=x) for x in range(len(data_df))] data_df['date'] = date ## we store the stockstats technical indicator column names in config.py tech_indicator_list = [ 'macd', 'macds', 'macdh', 'kdjk', 'kdjd', 'close_5_sma', 'close_10_sma', 'close_20_sma', 'close_60_sma' ] print(tech_indicator_list) fe = FeatureEngineer(use_technical_indicator=True, tech_indicator_list=tech_indicator_list, use_turbulence=False, user_defined_feature=False) data_df = fe.preprocess_data(data_df) #Spliting training and testing data train = data_df #change stock dimension when more than one stock for trading stock_dimension = 1 state_space = 1 + 2 * stock_dimension + len( tech_indicator_list) * stock_dimension print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}") env_kwargs = { "hmax": 1, "initial_amount": 100000, "buy_cost_pct": 0, "sell_cost_pct": 0,