def __do_one_hot_encodings(self):
    df_train, cv = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
    df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
    df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
    enc = OneHotEncoder(sparse=False)
    cross_feature_dict = self.__get_label_encode_dict()
    to_be_encoded = []
    for _, new_feature_name in cross_feature_dict.iteritems():
        to_be_encoded.append(new_feature_name)
    # fit on all data sources
    to_be_stacked_df = pd.concat([df_train[to_be_encoded],
                                  df_testset1[to_be_encoded],
                                  df_testset2[to_be_encoded]], axis=0)
    enc.fit(to_be_stacked_df)
    enc, to_be_encoded = self.__filter_too_big_onehot_encoding(
        enc, to_be_encoded, df_train, df_testset1, df_testset2)
    # transform each data source separately
    self.res_data_dict[g_singletonDataFilePath.getTrainDir()] = self.__do_one_hot_encoding(df_train, enc, to_be_encoded), cv
    self.res_data_dict[g_singletonDataFilePath.getTest1Dir()] = self.__do_one_hot_encoding(df_testset1, enc, to_be_encoded)
    self.res_data_dict[g_singletonDataFilePath.getTest2Dir()] = self.__do_one_hot_encoding(df_testset2, enc, to_be_encoded)
    return

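# A minimal sketch (toy data, not part of the original pipeline) of why the encoder is
# fit on train + test1 + test2 stacked together: fitting on the union of category values
# guarantees every split is transformed into the same column layout. The sparse=False
# keyword matches the older scikit-learn API used above.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train = pd.DataFrame({'weekday': [0, 1, 2]})
test = pd.DataFrame({'weekday': [2, 3]})            # value 3 never occurs in train
enc = OneHotEncoder(sparse=False)
enc.fit(pd.concat([train, test], axis=0))           # learn categories from all splits
train_oh = enc.transform(train)                     # shape (3, 4)
test_oh = enc.transform(test)                       # shape (2, 4), same columns as train
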
def __init__(self):
    ExploreOrder.__init__(self)
    self.gapdf = self.load_gapdf(g_singletonDataFilePath.getTrainDir())
    # self.gap_time_dict = self.gapdf.groupby('time_slotid')['gap'].sum().to_dict()
    self.weathdf = self.load_weatherdf(g_singletonDataFilePath.getTrainDir())
    # self.trafficdf = self.load_trafficdf(g_singletonDataFilePath.getTrainDir())
    # self.gapDict = self.loadGapDict(g_singletonDataFilePath.getTrainDir() + 'temp/gap.csv.dict.pickle')
    return

def disp_gap_bytraffic(self):
    df = self.gapdf
    data_dir = g_singletonDataFilePath.getTrainDir()
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevtraffic.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        temp_df = dumpload.load()
    else:
        traffic_dict = self.get_traffic_dict(data_dir)
        temp_df = self.X_y_Df[['start_district_id', 'time_slotid']].apply(
            self.find_prev_traffic, axis=1, traffic_dict=traffic_dict, pre_num=3)
        dumpload.dump(temp_df)
    df = pd.concat([df, temp_df], axis=1)
    by_traffic = df.groupby('traffic1')
    x = []
    y = []
    for name, group in by_traffic:
        x.append(name)
        y.append(group['gap'].mean())
    plt.scatter(x, y)
    return

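# DumpLoad is the project's own pickle-cache helper (isExisiting/load/dump). A generic
# equivalent of the same compute-once-then-reuse pattern used here and in the functions
# below, written with only the standard library, would look like this:
import os
import pickle

def cached(path, compute):
    """Return the pickled result at `path` if present, otherwise compute, cache and return it."""
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    res = compute()
    with open(path, 'wb') as f:
        pickle.dump(res, f)
    return res
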
def get_history_data_dict(self):
    """
    Index for quick lookup.
    key:   ('start_district_id', 'time_id')
    value: 'gap'
    The data includes rows from train, test1 and test2.
    """
    t0 = time()
    filename = "../data_preprocessed/" + 'traintest_history_data.dict.pickle'
    dumpload = DumpLoad(filename)
    if dumpload.isExisiting():
        return dumpload.load()
    test1data_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTest1Dir())
    test2data_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTest2Dir())
    traindata_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTrainDir())
    df = pd.concat([traindata_df, test1data_df, test2data_df], axis=0)
    self.__fileter_earlier_date(df)
    res_dict = self.__generate_dict(df)
    dumpload.dump(res_dict)
    print "dump history data dict:", round(time() - t0, 3), "s"
    return res_dict

def __unittest(self):
    # self.combine_all_csv(g_singletonDataFilePath.getTrainDir() + 'weather_data/temp/', 'weather_', 'weather.csv')
    # self.save_one_csv(g_singletonDataFilePath.getTrainDir() + 'traffic_data/traffic_data_2016-01-04')
    # weatherdf = self.load_weatherdf(g_singletonDataFilePath.getTrainDir())
    data_dir = g_singletonDataFilePath.getTrainDir()
    traffic_dict = self.get_traffic_dict(data_dir)
    assert [0, 0, 0] == self.find_prev_traffic(pd.Series([1, '2016-01-01-2']), traffic_dict=traffic_dict, pre_num=3).tolist()
    assert [2246, 2081] == self.find_prev_traffic(pd.Series([1, '2016-01-01-9']), traffic_dict=traffic_dict, pre_num=2).tolist()

    data_dir = g_singletonDataFilePath.getTest1Dir()
    traffic_dict = self.get_traffic_dict(data_dir)
    assert [346, 424, 0] == self.find_prev_traffic(pd.Series([66, '2016-01-30-141']), traffic_dict=traffic_dict, pre_num=3).tolist()
    assert [501, 484, 447] == self.find_prev_traffic(pd.Series([66, '2016-01-30-70']), traffic_dict=traffic_dict, pre_num=3).tolist()
    assert [772, 802, 775] == self.find_prev_traffic(pd.Series([57, '2016-01-24-58']), traffic_dict=traffic_dict, pre_num=3).tolist()
    print 'passed unit test'
    return

def getFeaturesLabel(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    self.__do_prepare_data()
    df, cv = self.res_data_dict[data_dir]
    return df[self.get_used_features()], df[self.usedLabel], cv

def __init__(self):
    ExploreOrder.__init__(self)
    self.gap_testdf = self.load_gapdf(g_singletonDataFilePath.getTest1Dir())
    self.gap_traindf = self.load_gapdf(g_singletonDataFilePath.getTrainDir())
    self.gap_traindf.describe()
    self.gap_testdf.describe()
    return

def __save_final_data(self):
    df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
    df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
    df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
    df_train.to_csv('temp/df_train_final.csv')
    df_testset1.to_csv('temp/df_testset1_final.csv')
    df_testset2.to_csv('temp/df_testset2_final.csv')
    return

def gapdistricution(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    self.gapdf = self.load_gapdf(data_dir)
    print self.gapdf['gap'].describe()
    # sns.distplot(self.gapdf['gap'], kde=False, bins=100)
    self.gapdf['gap'].plot(kind='hist', bins=200)
    plt.xlabel('Gaps')
    plt.title('Histogram of Gaps')
    return

def traffic_districution(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    df = self.load_trafficdf(data_dir)
    print df['traffic'].describe()
    # sns.distplot(self.gapdf['gap'], kde=False, bins=100)
    df['traffic'].plot(kind='hist', bins=100)
    plt.xlabel('Traffic')
    plt.title('Histogram of Traffic')
    return

def weather_distribution(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    self.gapdf = self.load_weatherdf(data_dir)
    print self.gapdf['weather'].describe()
    # sns.distplot(self.gapdf['gap'], kde=False, bins=100)
    sns.countplot(x="weather", data=self.gapdf, palette="Greens_d")
    plt.title('Countplot of Weather')
    # self.gapdf['weather'].plot(kind='bar')
    # plt.xlabel('Weather')
    # plt.title('Histogram of Weather')
    return

def get_train_validationset(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    self.__do_prepare_data()
    df, cv = self.res_data_dict[data_dir]
    folds = []
    for train_index, test_index in cv:
        folds.append((train_index, test_index))
    train_index = folds[self.train_validation_foldid][0]
    test_index = folds[self.train_validation_foldid][1]
    X_train = df.iloc[train_index][self.get_used_features()]
    y_train = df.iloc[train_index][self.usedLabel]
    X_test = df.iloc[test_index][self.get_used_features()]
    y_test = df.iloc[test_index][self.usedLabel]
    return X_train, y_train, X_test, y_test

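# Minimal sketch (toy data) of the fold-indexing pattern above: cv yields positional
# index lists, and df.iloc[...] picks the matching rows for each fold.
import pandas as pd

df = pd.DataFrame({'feature': [10, 20, 30, 40], 'gap': [1, 0, 2, 1]})
cv = [([0, 1], [2, 3]), ([0, 1, 2], [3])]           # two (train, validation) folds
train_index, test_index = cv[0]
X_train, y_train = df.iloc[train_index][['feature']], df.iloc[train_index]['gap']
X_test, y_test = df.iloc[test_index][['feature']], df.iloc[test_index]['gap']
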
def __get_feature_label(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    self.X_y_Df = self.load_gapdf(data_dir)
    self.__engineer_feature(data_dir)
    if self.holdout_split == HoldoutSplitMethod.kFOLD_FORWARD_CHAINING:
        cv = self.get_kfold_forward_chaining(self.X_y_Df)
    elif self.holdout_split == HoldoutSplitMethod.KFOLD_BYDATE:
        cv = self.get_kfold_bydate(self.X_y_Df)
    else:
        cv = self.get_imitate_testset2(self.X_y_Df, split_method=self.holdout_split)
    self.res_data_dict[data_dir] = self.X_y_Df, cv
    return

def __do_label_encoding(self):
    df_train, _ = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
    df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
    df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
    le = LabelEncoder()
    cross_feature_dict = self.__get_label_encode_dict()
    for _, new_feature_name in cross_feature_dict.iteritems():
        to_be_stacked = [df_train[new_feature_name],
                         df_testset1[new_feature_name],
                         df_testset2[new_feature_name]]
        le.fit(pd.concat(to_be_stacked, axis=0))
        df_train[new_feature_name] = le.transform(df_train[new_feature_name])
        df_testset1[new_feature_name] = le.transform(df_testset1[new_feature_name])
        df_testset2[new_feature_name] = le.transform(df_testset2[new_feature_name])
    return

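# Same idea as the one-hot step: fitting the LabelEncoder on the concatenation of all
# three splits keeps the value-to-integer mapping identical across train, test1 and
# test2 (toy values below, not from the real feature set).
import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(pd.concat([pd.Series(['1_8', '2_8']), pd.Series(['2_8', '3_9'])], axis=0))
print le.transform(['1_8', '2_8', '3_9']).tolist()   # [0, 1, 2]
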
def __unittest(self):
    # self.combine_all_csv(g_singletonDataFilePath.getTrainDir() + 'weather_data/temp/', 'weather_', 'weather.csv')
    # self.save_one_csv(g_singletonDataFilePath.getTrainDir() + 'weather_data/weather_data_2016-01-02')
    # weatherdf = self.load_weatherdf(g_singletonDataFilePath.getTrainDir())
    weather_dict = self.get_weather_dict(g_singletonDataFilePath.getTrainDir())
    assert 2 == self.find_prev_weather_mode('2016-01-01-1', weather_dict=weather_dict)[0]
    assert 2 == self.find_prev_weather_mode('2016-01-21-144', weather_dict=weather_dict)[0]
    # assert 2 == self.find_prev_weather_mode('2016-01-21-115', weather_dict=weather_dict)[0]
    assert 2 == self.find_prev_weather_mode('2016-01-21-114', weather_dict=weather_dict)[0]
    print 'passed unit test'
    return

def unitTest(self):
    # test cases for find_prev_gap
    data_dir = g_singletonDataFilePath.getTrainDir()
    gap_dict = self.get_gap_dict(data_dir)
    assert [3096, 1698, 318, 33, 0, 0] == self.find_prev_gap(pd.Series([51, '2016-01-01-5']), pre_num=6, gap_dict=gap_dict).tolist()
    assert [0, 0, 0] == self.find_prev_gap(pd.Series([45, '2016-01-16-2']), pre_num=3, gap_dict=gap_dict).tolist()
    assert [24, 26, 37] == self.find_prev_gap(pd.Series([53, '2016-01-04-56']), pre_num=3, gap_dict=gap_dict).tolist()

    data_dir = g_singletonDataFilePath.getTest1Dir()
    gap_dict = self.get_gap_dict(data_dir)
    assert [0, 1, 0] == self.find_prev_gap(pd.Series([54, '2016-01-24-81']), pre_num=3, gap_dict=gap_dict).tolist()
    assert [6, 4, 0] == self.find_prev_gap(pd.Series([7, '2016-01-30-141']), pre_num=3, gap_dict=gap_dict).tolist()
    assert [0, 0] == self.find_prev_gap(pd.Series([7, '2016-01-30-138']), pre_num=2, gap_dict=gap_dict).tolist()
    assert [0, 0, 0] == self.find_prev_gap(pd.Series([7, '2016-01-30-139']), pre_num=3, gap_dict=gap_dict).tolist()
    assert [0, 0, 1] == self.find_prev_gap(pd.Series([50, '2016-01-30-143']), pre_num=3, gap_dict=gap_dict).tolist()
    assert [245, 282, 0] == self.find_prev_gap(pd.Series([51, '2016-01-22-141']), pre_num=3, gap_dict=gap_dict).tolist()

    gap_meanmedian_dict = self.get_gap_meanmedian_dict()
    self.find_gap_meanmedian(pd.Series([5, 55]), gap_meanmedian_dict=gap_meanmedian_dict)
    print("unit test passed")
    return

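# Reading the assertions above: time_slotid strings follow the pattern
# 'YYYY-MM-DD-<slot number>', and find_prev_gap appears to return the gap counts of the
# pre_num slots immediately preceding the given slot, most recent first, with 0 used
# when an earlier slot has no record. A hypothetical call, under that reading:
# self.find_prev_gap(pd.Series([51, '2016-01-01-5']), pre_num=3, gap_dict=gap_dict)
# -> gaps for district 51 in slots 4, 3 and 2 of 2016-01-01.
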
def get_gap_meanmedian_dict(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    filename = data_dir + 'order_data/temp/gap_meanmedian.dict.pickle'
    dumpload = DumpLoad(filename)
    if dumpload.isExisiting():
        return dumpload.load()
    resDict = {}
    df = self.load_gapdf(data_dir)
    grps = df.groupby(['start_district_id', 'time_id'])
    for name, row in grps:
        resDict[name] = row['gap'].tolist()
        # resDict[name] = [i for i in row['gap'].tolist() if i != 0]
    dumpload.dump(resDict)
    return resDict

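# Tiny illustration (made-up frame) of the groupby above: each
# (start_district_id, time_id) key ends up mapped to the list of gap values seen in that slot.
import pandas as pd

df = pd.DataFrame({'start_district_id': [1, 1, 2],
                   'time_id': [5, 5, 5],
                   'gap': [10, 12, 3]})
res = {name: grp['gap'].tolist() for name, grp in df.groupby(['start_district_id', 'time_id'])}
# res == {(1, 5): [10, 12], (2, 5): [3]}
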
def disp_gap_by_district_type(self):
    df = self.gapdf
    data_dir = g_singletonDataFilePath.getTrainDir()
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_poi.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        temp_df = dumpload.load()
    else:
        poi_dict = self.get_district_type_dict()
        temp_df = self.X_y_Df[['start_district_id']].apply(self.find_poi, axis=1, poi_dict=poi_dict)
        dumpload.dump(temp_df)
    df = pd.concat([df, temp_df], axis=1)
    dt_list = self.get_district_type_list()
    size = len(dt_list)
    col_len = 4
    row_len = 7
    # col_len = row_len = int(math.ceil(math.sqrt(size)))
    # count = 1
    _, axarr = plt.subplots(row_len, col_len, sharex=True, sharey=True)
    for row in range(row_len):
        for col in range(col_len):
            index = row * col_len + col
            if index >= size:
                break
            item = dt_list[index]
            axarr[row, col].scatter(df[item], df['gap'])
            axarr[row, col].set_ylabel('Gap')
            axarr[row, col].set_xlabel(item)
            # axarr[row, col].set_title('POI/Gap Correlation')
    # for item in dt_list:
    #     plt.subplot(row_len, col_len, count)
    #     plt.scatter(df[item], df['gap'])
    #     plt.ylabel('Gap')
    #     plt.xlabel('POI')
    #     count += 1
    # plt.title('POI/Gap Correlation')
    return

def disp_gap_byweather(self):
    df = self.gapdf
    data_dir = g_singletonDataFilePath.getTrainDir()
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevweather.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        temp_df = dumpload.load()
    else:
        weather_dict = self.get_weather_dict(data_dir)
        temp_df = self.X_y_Df['time_slotid'].apply(self.find_prev_weather_mode, weather_dict=weather_dict)
        dumpload.dump(temp_df)
    df = pd.concat([df, temp_df], axis=1)
    gaps_mean = df.groupby('preweather')['gap'].mean()
    gaps_mean.plot(kind='bar')
    plt.ylabel('Mean of gap')
    plt.xlabel('Weather')
    plt.title('Weather/Gap Correlation')
    return

    res.append(item)
    # training 1-19, validation 20-21
    # item = self.__get_train_validation_indexes(df, '2016-01-01', 19, split_method), \
    #        self.__get_train_validation_indexes(df, '2016-01-20', 2)
    # res.append(item)
    #
    # # training 1-20, validation 21
    # item = self.__get_train_validation_indexes(df, '2016-01-01', 20, split_method), \
    #        self.__get_train_validation_indexes(df, '2016-01-21', 1)
    # res.append(item)
    return res

def __get_train_validation_indexes(self, df, start_date, days_num,
                                   split_method=HoldoutSplitMethod.IMITTATE_TEST2_MIN):
    dates = self.__get_date(start_date, days_num, days_step=1)
    slots = self.__get_slots(split_method)
    dates_slots = self.__get_date_slots(dates, slots)
    indexes = self.__get_df_indexes(df, dates_slots)
    return indexes

def run(self, df):
    self.__unit_test()
    # self.get_kfold_bydate(df)
    # self.get_kfold_forward_chaining(df)
    return

if __name__ == "__main__":
    obj = SplitTrainValidation()
    from preparedata import PrepareData
    from utility.datafilepath import g_singletonDataFilePath

    pre = PrepareData()
    pre.X_y_Df = pre.load_gapdf(g_singletonDataFilePath.getTrainDir())
    # __engineer_feature is name-mangled inside PrepareData, so it has to be reached
    # through its mangled name when called from module level.
    pre._PrepareData__engineer_feature(g_singletonDataFilePath.getTrainDir())
    obj.run(pre.X_y_Df)

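# Generic illustration of forward-chaining cross-validation (the idea behind
# kFOLD_FORWARD_CHAINING above; this is not the project's exact date/slot logic).
# Each fold trains on all earlier days and validates on the following day, so the
# validation period always lies strictly after the training period.
def forward_chaining_folds(day_indexes):
    """day_indexes: list of index lists, one per day, in chronological order."""
    folds = []
    for i in range(1, len(day_indexes)):
        train = [idx for day in day_indexes[:i] for idx in day]
        validation = day_indexes[i]
        folds.append((train, validation))
    return folds

# Example: 4 days with 2 rows each -> 3 folds, each validating on the next day.
# forward_chaining_folds([[0, 1], [2, 3], [4, 5], [6, 7]])
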