def getDataForNeuralNetworkModel(standardScaler=True, squashHoliday=False, dropDescription=True):
    """Prepare the metro-traffic train/test split for the neural network model.

    The CSV is read, the categorical columns are one-hot encoded, the frame is
    passed through ``cleanupMetroTrafficDups`` (``keep='last'``) and
    ``updateMetroTrafficData`` (``reindex=False``, ``temp='F'``), unused
    columns are dropped, four numeric columns are scaled and the result is
    split with ``splitMetroTrafficData`` (``intensity=True``,
    ``approach='random'``).

    Args:
        standardScaler: When true the features are scaled with
            ``StandardScaler``, otherwise with ``MinMaxScaler``.
        squashHoliday: When true every ``holiday`` value other than
            ``'None'`` is replaced by ``'Holiday'`` before encoding.
        dropDescription: When true ``weather_description`` is dropped;
            when false it is one-hot encoded with the other categoricals.

    Returns:
        The four values produced by ``splitMetroTrafficData``, in the same
        order, with 1 subtracted from the third and fourth.
    """
    traffic = readMetroTrafficCSV()

    if squashHoliday:
        # Collapse all named holidays into a single 'Holiday' category.
        collapseHoliday = np.vectorize(lambda h: 'None' if h == 'None' else 'Holiday')
        traffic.holiday = collapseHoliday(traffic.holiday)

    # weather_description is either encoded or dropped, never both.
    oneHotColumns = ['holiday', 'weather_main']
    unusedColumns = ['date_time', 'rain_1h', 'snow_1h']
    if dropDescription:
        unusedColumns.append('weather_description')
    else:
        oneHotColumns.append('weather_description')

    oneHotEncoder = data_utils.DataEncoder(oneHotColumns, oneHotEncoding=True)
    traffic = oneHotEncoder.encode(traffic)
    traffic = cleanupMetroTrafficDups(traffic, keep='last')
    traffic = updateMetroTrafficData(traffic, reindex=False, temp='F')
    traffic = traffic.drop(columns=unusedColumns)

    # NOTE(review): the scaler is fit on the full frame before the split, so
    # rows that end up in the test set contribute to the scaling statistics.
    scalerClass = StandardScaler if standardScaler else MinMaxScaler
    scaler = scalerClass()
    numericColumns = ['week_day', 'hour', 'temp', 'clouds_all']
    scaler.fit(traffic[numericColumns])
    traffic[numericColumns] = scaler.transform(traffic[numericColumns])

    trainX, testX, trainY, testY = splitMetroTrafficData(
        traffic, intensity=True, approach='random')
    # Presumably shifts 1-based intensity classes to 0-based -- TODO confirm.
    return trainX, testX, trainY - 1, testY - 1
def getDowJonesData(stockClusters=0, stockScale=False, scale=None, components=None, window=None):
    """Load the Dow Jones data set and return a preprocessed train/test split.

    Args:
        stockClusters: When greater than 0, a ``KMeans`` model with that many
            clusters is fit on the training features; the cluster assignment
            is added as a ``cluster`` column and one-hot encoded.
        stockScale: When true the data is transformed with
            ``data_utils.GroupMinMaxScaler`` grouped on ``'stock'`` (keeping
            ``'quarter'``) before splitting.
        scale: Forwarded to ``data_utils.createPipeline``.
        components: Forwarded to ``data_utils.createPipeline``; also decides
            ``copyColumns`` (true only when ``components`` is ``None``).
        window: When truthy, every returned frame is replaced by its rolling
            mean over ``window`` rows with NaN rows dropped.

    Returns:
        ``(xl, xt, yl, yt)`` as produced by ``splitDowJonesData`` after the
        steps above, each with its index reset.
    """
    data = readDowJonesCSV()
    addDowJonesDerivedData(data)

    selectedColumns = [
        'quarter',
        'stock',
        'volume',
        'percent_change_price',
        'percent_change_high',
        'percent_change_low',
        'days_to_next_dividend',
        'percent_return_next_dividend',
        'percent_change_next_weeks_price',
    ]
    data = data[selectedColumns].copy()

    if stockScale:
        groupScaler = data_utils.GroupMinMaxScaler('stock', keepColumns=['quarter'])
        data = groupScaler.fit(data).transform(data)

    xl, xt, yl, yt = splitDowJonesData(data)

    # 'quarter' and 'stock' are dropped from the features after the split.
    for features in (xl, xt):
        features.drop(columns=['quarter', 'stock'], inplace=True)

    pipeline = data_utils.createPipeline(xl, scale=scale, components=components)
    if pipeline is not None:
        keepColumnNames = components is None
        xl, xt = data_utils.preprocessData(pipeline, xl, xt, copyColumns=keepColumnNames)

    if stockClusters > 0:
        # Fit on the training features only, then label both sets.
        clusterModel = KMeans(n_clusters=stockClusters).fit(xl)
        xl['cluster'] = clusterModel.labels_
        xt['cluster'] = clusterModel.predict(xt)
        clusterEncoder = data_utils.DataEncoder(columns=['cluster'], oneHotEncoding=True)
        xl = clusterEncoder.encode(xl)
        xt = clusterEncoder.encode(xt)

    if window:
        # NOTE(review): each frame is smoothed and NaN-dropped independently;
        # features and targets stay row-aligned only if the sole NaNs are the
        # leading rows introduced by the rolling window -- confirm.
        xl, xt, yl, yt = [
            frame.rolling(window).mean().dropna() for frame in (xl, xt, yl, yt)
        ]

    for frame in (xl, xt, yl, yt):
        frame.reset_index(drop=True, inplace=True)

    return xl, xt, yl, yt
def testDataEncoder(self):
    """Check DataEncoder without one-hot encoding: integer codes and label lookup."""
    source = pd.DataFrame({'A': ['11', '11', '22'], 'B': ['33', '44', '55']})
    encoder = data_utils.DataEncoder(['A', 'B'])

    self.assertEqual(encoder.getColumns(), ['A', 'B'])
    self.assertFalse(encoder.isOneHotEncoding())

    actual = encoder.encode(source)
    expected = pd.DataFrame({'A': [0, 0, 1], 'B': [0, 1, 2]})
    self.assertTrue(aequal(actual.values, expected.values))

    # (column, code, expected label); the unknown column 'C' yields ''.
    labelCases = [
        ('A', 0, '11'),
        ('A', 1, '22'),
        ('B', 0, '33'),
        ('B', 1, '44'),
        ('B', 2, '55'),
        ('C', 0, ''),
    ]
    for column, code, label in labelCases:
        self.assertEqual(encoder.getLabel(column, code), label)
def getMetroTrafficData(dupsKeep='last', gapsAction='fill', gapsSubAction=None, dateTimeIndex=False, temp=None):
    """Load the metro-traffic CSV with encoded categoricals and optional cleanup.

    Args:
        dupsKeep: Passed as ``keep`` to ``cleanupMetroTrafficDups``; that
            step is skipped when ``None``.
        gapsAction: Passed as ``action`` to ``cleanupMetroTrafficGaps``; that
            step is skipped when ``None``.
        gapsSubAction: Passed as ``subAction`` to ``cleanupMetroTrafficGaps``.
        dateTimeIndex: Passed as ``reindex`` to ``updateMetroTrafficData``.
        temp: Passed as ``temp`` to ``updateMetroTrafficData``.

    Returns:
        The frame returned by ``updateMetroTrafficData``.
    """
    traffic = readMetroTrafficCSV()

    # The categorical columns are encoded with oneHotEncoding=False.
    categoricalColumns = ['holiday', 'weather_main', 'weather_description']
    labelEncoder = data_utils.DataEncoder(categoricalColumns, oneHotEncoding=False)
    traffic = labelEncoder.encode(traffic)

    if dupsKeep is not None:
        traffic = cleanupMetroTrafficDups(traffic, keep=dupsKeep)

    if gapsAction is not None:
        traffic = cleanupMetroTrafficGaps(
            traffic, action=gapsAction, subAction=gapsSubAction)

    return updateMetroTrafficData(traffic, reindex=dateTimeIndex, temp=temp)
def testDataEncoderOneHotEncoding(self):
    """Check DataEncoder with one-hot encoding: indicator values and empty labels."""
    source = pd.DataFrame({'A': ['11', '11', '22'], 'B': ['33', '44', '55']})
    encoder = data_utils.DataEncoder(['A', 'B'], oneHotEncoding=True)

    self.assertEqual(encoder.getColumns(), ['A', 'B'])
    self.assertTrue(encoder.isOneHotEncoding())

    actual = encoder.encode(source)
    expected = pd.DataFrame({
        'A_11': [1, 1, 0],
        'A_22': [0, 0, 1],
        'B_33': [1, 0, 0],
        'B_44': [0, 1, 0],
        'B_55': [0, 0, 1],
    })
    self.assertTrue(aequal(actual.values, expected.values))

    # With one-hot encoding every label lookup is expected to return ''.
    lookups = [('A', 0), ('A', 1), ('B', 0), ('B', 1), ('C', 0)]
    for column, code in lookups:
        self.assertEqual(encoder.getLabel(column, code), '')