def test_hour(tencent):
    """Cumulate the first 50 rows with a '1h' time frame and check the last row.

    The last row of the cumulated frame is handed to
    ``expect_cumulated_result`` together with the 50-row slice it was
    built from.
    """
    head = tencent.iloc[:50]
    hourly = StockDataFrame(head, date_col=TIME_KEY, time_frame='1h')
    last_row = hourly.cumulate().iloc[-1]
    expect_cumulated_result(last_row, head)
def test_cum_append_many_from_empty(tencent):
    """Append LENGTH rows at once to an empty '5m' frame via ``cum_append``.

    The result is verified by ``expect_cumulated`` against the full
    ``tencent`` fixture.
    """
    empty = StockDataFrame(date_col=TIME_KEY, time_frame='5m')
    rows = tencent.iloc[:LENGTH]
    appended = empty.cum_append(rows)
    expect_cumulated(tencent, appended, LENGTH)
def get_tencent():
    """Load the CSV named by the module-level ``csv`` into a StockDataFrame.

    The frame is built with ``date_column='time_key'``.
    """
    raw = read_csv(csv)
    return StockDataFrame(raw, date_column='time_key')
def test_main():
    """Smoke test: constructing a StockDataFrame with no arguments must not raise."""
    # Construction alone is the check; the instance itself is not inspected.
    _ = StockDataFrame()
def prepareData(symbol, dataObtainer, startDate, endDate):
    """Build normalised indicator windows for ``symbol``.

    The raw data is averaged on a 3-hour grid from ``startDate`` (inclusive)
    to ``endDate`` (exclusive). Technical indicators are derived from the
    averaged close prices, and the series are then cut into windows of
    120 samples (15 days * 8 points per day), advancing one day at a time.

    :param symbol: passed to ``dataObtainer.getHistoricalDataAsDataframe``.
    :param dataObtainer: object providing ``getHistoricalDataAsDataframe``;
        the returned frame must have "Timestamp", "Close" and "Volume"
        columns.
    :param startDate: first grid point (compared with "Timestamp" by ``==``).
    :param endDate: the grid stops before this point.
    :return: list of entries, each
        ``[close, volume, rsi, rsi2, rsi3, ma, ma2, bollUpper, bollLower]``
        where every element is a list with one value per sample.
    :raises IndexError: if the grid is non-empty but no grid point at all
        could be found in the data.
    """
    import math

    dataTimeInterval = timedelta(hours=3)
    datapointsPerDay = 8
    numberOfSamples = 15 * 8

    df = dataObtainer.getHistoricalDataAsDataframe(symbol)

    def firstIndexAt(moment):
        """Return the first index label whose Timestamp equals moment, or None."""
        matches = df.index[df["Timestamp"] == moment].tolist()
        return matches[0] if matches else None

    def carryForward(*seriesGroup):
        """Repeat the last value of each series (None if it has no value yet)."""
        for series in seriesGroup:
            series.append(series[-1] if series else None)

    def backfillLeadingGaps(series):
        """Replace leading None placeholders with the first real value."""
        firstKnown = next((v for v in series if v is not None), None)
        if firstKnown is None:
            if series:
                raise IndexError(
                    "no data found between startDate and endDate; "
                    "nothing to carry forward")
            return
        for position, value in enumerate(series):
            if value is not None:
                break
            series[position] = firstKnown

    def nanToZero(values):
        """Replace NaN entries by 0."""
        return [0 if math.isnan(x) else x for x in values]

    def scaleBy(values, reference):
        """Divide values by reference; a zero reference leaves them unscaled."""
        if reference == 0:
            return list(values)
        return [v / reference for v in values]

    # We gather all of the means for our inputs.
    closeMeans = []
    volumeMeans = []
    date = startDate
    while date < endDate:
        print("Processing", date, "/", endDate)
        startIndex = firstIndexAt(date)
        endIndex = None
        if startIndex is not None:
            endIndex = firstIndexAt(date + dataTimeInterval)
        if endIndex is None:
            # Missing data: reuse the previous point. A gap at the very
            # start has no previous point, so it is back-filled after the
            # loop instead of raising on an empty list.
            carryForward(closeMeans, volumeMeans)
        else:
            # NOTE(review): index labels are used as positions for iloc;
            # this assumes a default integer index - confirm.
            data = df.iloc[startIndex:endIndex]
            closeMeans.append(data["Close"].mean())
            volumeMeans.append(data["Volume"].mean())
        date += dataTimeInterval

    backfillLeadingGaps(closeMeans)
    backfillLeadingGaps(volumeMeans)

    stock = StockDataFrame({'close': closeMeans})

    # The standard RSI is 14 day; with 8 points per day that is 14 * 8 = 112.
    rsis = nanToZero((stock["rsi:112"] / 100).tolist())
    rsis2 = nanToZero((stock["rsi:56"] / 100).tolist())
    rsis3 = nanToZero((stock["rsi:28"] / 100).tolist())
    mas = nanToZero(stock["macd:96,208,72"].tolist())
    mas2 = nanToZero(stock["macd:48,104,36"].tolist())
    bollUppers = nanToZero(stock["boll.upper:160"].tolist())
    bollLowers = nanToZero(stock["boll.lower:160"].tolist())

    entryAmount = int(len(closeMeans) - numberOfSamples - 1)
    # The first MACD series is scaled by its maximum over the WHOLE series
    # (not per window), so it is computed once up front.
    maxMA = max(mas) if mas else 0

    formattedData = []
    for i in range(0, len(closeMeans) - numberOfSamples, datapointsPerDay):
        # entryAmount is 0 when exactly one window fits; avoid 0 / 0.
        percent = i / entryAmount * 100 if entryAmount else 0.0
        print("Percent of entries created: " + str(percent) + "%")

        window = slice(i, i + numberOfSamples)
        close = closeMeans[window]
        volume = volumeMeans[window]
        rsi = rsis[window]
        rsi2 = rsis2[window]
        rsi3 = rsis3[window]
        # NOTE(review): ma2 is passed through without scaling, unlike ma.
        ma2 = mas2[window]
        ma = [(ratio + 1) / 2 for ratio in scaleBy(mas[window], maxMA)]

        # Indicator warm-up NaNs were replaced by 0, so an early window can
        # be all zeros; scaleBy() keeps that from dividing by zero.
        bollUpper = bollUppers[window]
        bollUpper = scaleBy(bollUpper, max(bollUpper))
        bollLower = bollLowers[window]
        bollLower = scaleBy(bollLower, max(bollLower))
        close = scaleBy(close, max(close))
        volume = scaleBy(volume, max(volume))

        formattedData.append(
            [close, volume, rsi, rsi2, rsi3, ma, ma2, bollUpper, bollLower])

    return formattedData
def test_directive_stringify(stock: StockDataFrame):
    """'boll' must stringify to 'boll:20,close' via both entry points.

    Checks the ``directive_stringify`` method of the frame first, then the
    free function of the same name.
    """
    expected = 'boll:20,close'

    via_method = stock.directive_stringify('boll')
    assert via_method == expected

    via_function = directive_stringify('boll')
    assert via_function == expected
auto_adjust = True, # download pre/post regular market hours data # (optional, default is False) prepost = True, # use threads for mass downloading? (True/False/Integer) # (optional, default is True) threads = True, # proxy URL scheme use use when downloading? # (optional, default is None) proxy = None ) stock=StockDataFrame(data) # print(stock) stock.alias('open','Open') stock.alias('high','High') stock.alias('low','Low') stock.alias('close','Close') print(stock) cross_up_upper = stock['high'].copy() # `cross_up_upper` is the series of high prices each of which cross up the upper bollinger band. cross_up_upper[ ~ stock['column:high > boll.upper'] ] = np.nan # Set some items of the series to `np.nan` so that mplfinance will not draw markers for those items. cross_down_lower = stock['low'].copy()
def createDataset(self, symbol: str, startDate, endDate,
                  useAllIndicators=True, isAugmenting=False,
                  timePeriodForOutputs=24):
    """
    Creates a dataset and stores it in ``self.inputData`` (features) and
    ``self.outputData`` (labels). Nothing is returned.

    Please make sure that the start and end dates are the beginnings of
    days. Both are localized to the "Etc/GMT-0" zone, and the start is
    moved back by DAYS_IN_AN_INPUT + 60 days so indicators have history.

    Reads ``self.dataObtainer``, ``self._dataTimeInterval``,
    ``self._numberOfSamples``, ``self._datapointsPerDay`` and
    ``self.dayByDay``.

    :param symbol: e.g. "BTCUSDT"
    :param startDate: e.g. datetime(year=2020, month=1, day=1)
    :param endDate: e.g. datetime(year=2020, month=2, day=1)
    :param useAllIndicators: if False, only uses the minimum indicators
        (close, volume, rsi, ema, mfi).
    :param isAugmenting: used by createAugmentedDataset when augmenting;
        scales the input means by a slowly varying random factor.
    :param timePeriodForOutputs: if set to 24, this will generate the
        labels (percentiles) for the next 24 hours after the 15-day period
        that appears in the input.
    :raises IndexError: if the time grid is non-empty but one of the
        collected series has no real value at all.
    """
    import math

    # These are time-related variables.
    zone = pytz.timezone("Etc/GMT-0")
    # We need to go back a little earlier to generate indicators such as RSI.
    startDate -= timedelta(days=DAYS_IN_AN_INPUT + 60)
    endDate = zone.localize(endDate)
    startDate = zone.localize(startDate)

    # We will be collecting our final features and labels in here:
    self.inputData = []
    self.outputData = []

    # This dataframe has all the raw data we need to generate the dataset.
    df = self.dataObtainer.getHistoricalDataAsDataframe(symbol)

    def firstIndexAt(moment):
        """Return the first index label whose Timestamp equals moment, or None."""
        matches = df.index[df["Timestamp"] == moment].tolist()
        return matches[0] if matches else None

    def rowsBetween(startMoment, endMoment):
        """Return the rows from startMoment up to endMoment, or None if either is absent."""
        startIndex = firstIndexAt(startMoment)
        if startIndex is None:
            return None
        endIndex = firstIndexAt(endMoment)
        if endIndex is None:
            return None
        # NOTE(review): index labels are used as positions for iloc; this
        # assumes a default integer index - confirm.
        return df.iloc[startIndex:endIndex]

    def carryForward(*seriesGroup):
        """Repeat the last value of each series (None if it has no value yet)."""
        for series in seriesGroup:
            series.append(series[-1] if series else None)

    def backfillLeadingGaps(series):
        """Replace leading None placeholders with the first real value."""
        firstKnown = next((v for v in series if v is not None), None)
        if firstKnown is None:
            if series:
                raise IndexError(
                    "no data found in the requested range; "
                    "nothing to carry forward")
            return
        for position, value in enumerate(series):
            if value is not None:
                break
            series[position] = firstKnown

    def nanToZero(values):
        """Replace NaN entries by 0."""
        return [0 if math.isnan(x) else x for x in values]

    def timeGrid():
        """Yield every input time point from startDate up to (excluding) endDate."""
        moment = startDate
        while moment < endDate:
            yield moment
            moment += self._dataTimeInterval

    # First, we will gather all of the means for our inputs...
    closeMeans = []
    volumeMeans = []
    # ... also, we will gather the outputs, which represent the
    # distributions of the next day prices.
    percentileLevels = (0.15, 0.25, 0.35, 0.65, 0.75, 0.85)
    outputPercentiles = [[] for _ in percentileLevels]
    outputMedians = []
    # We will use these to normalize our data by dividing by the mean
    # price/volume of the last (latest/most recent) day in our input.
    priceMeansToDivideLabelsBy = []
    volumeMeansToDivideLabelsBy = []

    inputSeries = [closeMeans, volumeMeans]
    outputSeries = [outputMedians] + outputPercentiles
    labelSeries = [priceMeansToDivideLabelsBy, volumeMeansToDivideLabelsBy]

    # For augmentation (drawn unconditionally, as before, so the random
    # stream is consumed the same way whether or not we augment):
    phaseShift = uniform(0, np.pi * 2)
    count = 0

    # Now we will be collecting the input prices, input volumes, and output
    # percentiles. Every series receives exactly one value per time point so
    # that they all stay index-aligned.
    for date in timeGrid():
        print("Processing", date, "/", endDate)

        # The input point: the mean price and volume over one time interval.
        # If it cannot be found we may be missing some data (possibly an
        # exchange outage); in this case we just reuse the previous data.
        inputRows = rowsBetween(date, date + self._dataTimeInterval)
        if inputRows is None:
            carryForward(*inputSeries, *outputSeries, *labelSeries)
            continue

        if isAugmenting:
            x = phaseShift + count
            augmentation = 1 + np.sin(x) * uniform(0.02, 0.04)
            closeMeans.append(inputRows["Close"].mean() * augmentation)
            volumeMeans.append(inputRows["Volume"].mean() * augmentation)
            count += uniform(0.3, 0.6)
            if count > 2 * np.pi:
                count = 0
        else:
            closeMeans.append(inputRows["Close"].mean())
            volumeMeans.append(inputRows["Volume"].mean())

        # The output associated with an entry that begins at this point:
        # percentiles of the close price over the period that follows the
        # input window.
        outputStart = date + timedelta(days=DAYS_IN_AN_INPUT)
        outputRows = rowsBetween(
            outputStart, outputStart + timedelta(hours=timePeriodForOutputs))
        if outputRows is None:
            carryForward(*outputSeries, *labelSeries)
            continue

        outputCloses = outputRows["Close"]
        outputMedians.append(outputCloses.median())
        for level, series in zip(percentileLevels, outputPercentiles):
            series.append(outputCloses.quantile(level))

        # Lastly, the last input day's means, which we use to normalize.
        lastDayRows = rowsBetween(
            date + timedelta(days=DAYS_IN_AN_INPUT - 1),
            date + timedelta(days=DAYS_IN_AN_INPUT))
        if lastDayRows is None:
            carryForward(*labelSeries)
            continue

        priceMeansToDivideLabelsBy.append(lastDayRows["Close"].mean())
        volumeMeansToDivideLabelsBy.append(lastDayRows["Volume"].mean())

    # A gap at the very start has no earlier value to carry forward (this
    # used to raise IndexError on an empty list); such leading gaps are
    # filled with the first value that was actually found.
    for series in (*inputSeries, *outputSeries, *labelSeries):
        backfillLeadingGaps(series)

    # Now that the loop above collected data for inputs and outputs, we
    # need to generate technical indicators as additional input features.
    # We seem to be getting good performance if we only use close, volume,
    # rsi, ema and mfi, but we also have some other indicators to play
    # around with.
    stock = StockDataFrame({
        "close": closeMeans,
        "volume": volumeMeans
    })

    # The standard RSI is 14 day. Note that if our time interval is 3 hrs,
    # there are 8 data points in a day. Thus, a 14 day RSI is a 112-RSI
    # because 14 * 8 = 112.
    rsis = (stock["rsi:112"] / 100).tolist()
    rsis2 = (stock["rsi:14"] / 100).tolist()
    emas = (stock["ema:21"]).tolist()
    macds = stock["macd:96,208"].tolist()
    macds2 = stock["macd:24,52"].tolist()
    bollUppers = stock["boll.upper:160"].tolist()
    bollLowers = stock["boll.lower:160"].tolist()

    from ta.volume import MFIIndicator
    # Only mean prices are available here, so the close series is passed
    # for all three of the indicator's price arguments.
    moneyFlowIndex = MFIIndicator(stock["close"], stock["close"],
                                  stock["close"], stock["volume"], window=14)
    mfis = (moneyFlowIndex.money_flow_index().divide(100)).to_list()

    # This gets rid of NANs in our indicators (just in case).
    rsis = nanToZero(rsis)
    rsis2 = nanToZero(rsis2)
    emas = nanToZero(emas)
    macds = nanToZero(macds)
    macds2 = nanToZero(macds2)
    bollUppers = nanToZero(bollUppers)
    bollLowers = nanToZero(bollLowers)
    mfis = nanToZero(mfis)

    # Now we will generate our final inputs and outputs!
    numberOfSamples = self._numberOfSamples
    entryAmount = int(len(closeMeans) - numberOfSamples - 1)
    advanceAmount = self._datapointsPerDay if self.dayByDay else 1

    def fixWithin0And1(x):
        return min(max(x, 0.0), 1.0)

    def halved(values, reference):
        """Clamp value / reference / 2 into [0, 1] (0.5 means equal to reference)."""
        return [fixWithin0And1(v / reference / 2) for v in values]

    def centred(values, reference):
        """Like halved(), shifted by 0.5 so that a value of zero maps to 0.5."""
        return [fixWithin0And1(v / reference / 2 + 0.5) for v in values]

    # Label order within one output row.
    orderedOutputs = (outputPercentiles[:3] + [outputMedians]
                      + outputPercentiles[3:])

    # The first 60 days are skipped: they only exist to warm up indicators.
    for i in range(60 * self._datapointsPerDay, entryAmount, advanceAmount):
        print("Percent of entries created: "
              + str(i / entryAmount * 100) + "%")
        yesterdayCloseMean = priceMeansToDivideLabelsBy[i]
        yesterdayVolumeMean = volumeMeansToDivideLabelsBy[i]

        # This gets the input features for this dataset entry.
        window = slice(i, i + numberOfSamples)
        rsi = rsis[window]
        rsi2 = rsis2[window]
        mfi = mfis[window]
        ema = halved(emas[window], yesterdayCloseMean)
        macd = centred(macds[window], yesterdayCloseMean)
        macd2 = centred(macds2[window], yesterdayCloseMean)
        bollUpper = halved(bollUppers[window], yesterdayCloseMean)
        bollLower = halved(bollLowers[window], yesterdayCloseMean)
        close = halved(closeMeans[window], yesterdayCloseMean)
        volume = halved(volumeMeans[window], yesterdayVolumeMean)

        # Finally, we add the entry to the dataset.
        if useAllIndicators:
            self.inputData.append([close, volume, rsi, rsi2, ema, macd,
                                   macd2, bollUpper, bollLower, mfi])
        else:
            self.inputData.append([close, volume, rsi, ema, mfi])

        # This normalizes our labels. 0.5 means that the percentile is the
        # same as the last day's mean. 1.0 means that the percentile is
        # twice the value of the last day's mean. We normalize in this way
        # so that we can use the sigmoid activation function for the
        # outputs. Unlike the inputs, the labels are not clamped.
        self.outputData.append([
            series[i] / yesterdayCloseMean / 2 for series in orderedOutputs
        ])
# Exploratory script: builds a StockDataFrame-based table for AAPL through the
# helper modules imported below, drops incomplete columns, keeps the numeric
# ones and finally calls plt.show().
import yfinance as yf
import pandas as pd
from stock_pandas import StockDataFrame
import generate_dataframe as gd
import stock_dataframe_to_stdf as sdts
import stock_pandas_to_stdf as spts
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): presumably ticker, period and interval - confirm against
# generate_dataframe.generate_df.
data = gd.generate_df("AAPL","1y","1d")
columns = ['open','close','high','low','volume','amount']

# First processing pass (stock_dataframe_to_stdf helpers), converted back to a
# plain frame with to_df.
stockdf = sdts.generate_stdf(data)
stock = sdts.data_processing(columns,stockdf)
df = sdts.to_df(stock)

# Second pass: wrap the result in a StockDataFrame, run the
# stock_pandas_to_stdf processing on it, and convert back again.
stockdf = spts.data_processing(StockDataFrame(df))
df = sdts.to_df(stockdf)

# dropping columns with at least one nan value
temp_df = df.dropna(axis=1)
print(temp_df.head())

# Keep only the columns whose dtype is one of these integer/float types.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_df = temp_df.select_dtypes(include=numerics)
# print(numeric_df.shape)
# numeric_df.drop(["Date"],axis=1)
# print(numeric_df.head())

import seaborn as sns
# The correlation heatmap below is disabled, so no plotting call is active
# before plt.show() in this script.
# sns.heatmap(numeric_df.corr())
plt.show()