__author__ = 'Trevor "Autogen" Grant'
"""
This is soooooo lazy.
"""
# Smoke test for m6_local.functions.dates_one_hot: load the test CSV, parse
# its DATE column, one-hot encode it, and compare the result's shape against
# the hard-coded expectation below. The outcome is only printed; nothing is
# raised on failure.

import pandas as pd

# Single-argument print(...) produces the same output on Python 2 and 3.
print("loading test data")
# pd.DataFrame.from_csv is deprecated (removed in pandas 1.0); read_csv with
# index_col=None is the equivalent call for this usage.
data = pd.read_csv("input_data/testing_soups.csv", index_col=None)

print("testing dates_one_hot")
from m6_local.functions import dates_one_hot

date_series = pd.to_datetime(data['DATE'])
one_hot_dates = dates_one_hot(date_series)

# The check expects the encoded result to have shape (100, 16).
if one_hot_dates.shape == (100, 16):
    print("looks good")
else:
    print("DOOD, this is broken!!!!")
# Build the sparse feature matrix X for the soups sales data: load the CSV,
# keep rows with positive ACT_SALES_QTY, one-hot encode store, item and date,
# and horizontally stack the encoded blocks.
from scipy.sparse import hstack
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
)
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer the
# model_selection module and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from m6_local.functions import dates_one_hot

# pd.DataFrame.from_csv is deprecated (removed in pandas 1.0); read_csv with
# index_col=None is the equivalent call for this usage.
# NOTE(review): the path is hard-coded to one developer's machine.
data = pd.read_csv("C:/Users/tgrant/Documents/oos_pred_poc/soups.csv", index_col=None)
# Keep only rows with a positive sales quantity.
data = data[data['ACT_SALES_QTY'] > 0]
print("data loaded and filtered")

# Previously: data['DATE'] = pd.to_datetime(data['DATE'])
# need to handle dates the honest way
# The parsed dates go into a new column, leaving the raw DATE column intact.
data['fixed_dates'] = pd.to_datetime(data['DATE'])
print('updated dates')

one_hot_store_ids = pd.get_dummies(data['STORE_ID'], sparse=True)
one_hot_item_ids = pd.get_dummies(data['ITEM_ID'], sparse=True)
one_hot_date = dates_one_hot(data['fixed_dates'])
print('created one-hots')

# Previously: one_hot_dates = pd.get_dummies(data['DATE'], sparse=True)
# still need some other stuff - eg promo, act price/reg price
# need to normalize act_sales
# split training set
# Ratio of planned price to regular price; not yet part of X (see note below).
discount = data['PLAN_PRICE'] / data['REG_PRICE']

# need discount and data['ON_PROMOTION']
X = hstack([one_hot_date, one_hot_item_ids, one_hot_store_ids])