# Common imports used throughout this module. datetime_utils, evaluation,
# and the fused_model_* modules are project-local.
import datetime
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import datetime_utils
import evaluation
import fused_model_augment
import fused_model_with_latent_features


def __init__(self, demo_raw, train_start_time='2014-02-01', train_end_time='2018-10-31',
             test_start_time='2018-11-01 00:00:00', test_end_time='2019-05-01 23:00:00'):
    # self.raw_df = raw_df
    # demographic data [32, 32, 14]
    self.demo_raw = demo_raw
    self.train_start_time = train_start_time
    # self.train_end_time = '2018-03-31'
    self.train_end_time = train_end_time
    # set train/test set
    self.test_start_time = test_start_time
    self.test_end_time = test_end_time
    # prediction window: use one week's data to predict the next hour
    self.window = datetime.timedelta(hours=24 * 7)
    self.step = datetime.timedelta(hours=1)
    # predict_start_time should be '2018-04-08 00:00:00':
    # e.g. use '2018-04-01 00:00:00' -> '2018-04-07 23:00:00' (168 timestamps in total)
    # to predict '2018-04-08 00:00:00'.
    # Note that test_start_time + window = predict_start_time,
    # e.g. '2018-04-01 00:00:00' + 168-hour window = '2018-04-08 00:00:00'.
    # This is calculated by time interval; there is a 1-hour shift between
    # timestamp and time interval.
    self.predict_start_time = datetime_utils.str_to_datetime(self.test_start_time) + self.window
    # predict_end_time = test_end_time = '2018-04-30 23:00:00'
    self.predict_end_time = datetime_utils.str_to_datetime(self.test_end_time)
    # if window = 7 days and test_end_time = '2018-04-30 23:00:00',
    # then actual_end_time = 04/23 23:00
    self.actual_end_time = self.predict_end_time - self.window
    # 41616
    self.train_hours = datetime_utils.get_total_hour_range(self.train_start_time,
                                                           self.train_end_time)
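
# NOTE (editor sketch): datetime_utils is an external helper module. The two
# functions used above are assumed to behave roughly as below; this is an
# illustrative reimplementation inferred from the call sites, not the actual
# module.
import datetime as _dt

def _sketch_str_to_datetime(s):
    # parse '2018-11-01 00:00:00' or '2018-11-01' into a datetime
    fmt = '%Y-%m-%d %H:%M:%S' if ' ' in s else '%Y-%m-%d'
    return _dt.datetime.strptime(s, fmt)

def _sketch_get_total_hour_range(start, end):
    # number of hourly timestamps from start 00:00 through end 23:00, inclusive
    delta = _sketch_str_to_datetime(end) - _sketch_str_to_datetime(start)
    return (delta.days + 1) * 24

# sanity check: '2014-02-01'..'2018-10-31' spans 1734 days -> 41616 hours,
# matching the '# 41616' comment in __init__ above
assert _sketch_get_total_hour_range('2014-02-01', '2018-10-31') == 41616
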
def train_test_split(self, raw_seq_arr):
    train_hours = datetime_utils.get_total_hour_range(self.train_start_time,
                                                      self.train_end_time)
    # train_arr = raw_seq_arr[:, :train_hours, :, :]
    # test_arr = raw_seq_arr[:, train_hours:, :, :]
    train_arr = raw_seq_arr[:, :train_hours]
    test_arr = raw_seq_arr[:, train_hours:]
    return train_arr, test_arr
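
# NOTE (editor sketch): generate_fixlen_timeseries is defined elsewhere in
# this class. From the shape comments in the main() variants below
# ((9504, 32, 20) in -> (169, 9336, 32, 20) out, with TIMESTEPS = 168), it is
# assumed to slide a window of 168 input steps plus 1 target step over the
# time axis with stride 1. A minimal NumPy equivalent of that assumption:
import numpy as np

def _sketch_generate_fixlen_timeseries(arr, timesteps=168):
    # arr: (T, ...) hourly tensor. Returns (timesteps + 1, T - timesteps, ...),
    # where [:, i] holds hours i .. i + timesteps (168 inputs plus the target).
    windows = [arr[i:i + timesteps + 1] for i in range(arr.shape[0] - timesteps)]
    return np.stack(windows, axis=1)

# usage sketch: a year-plus of hourly 32x20 grids
# _sketch_generate_fixlen_timeseries(np.zeros((9504, 32, 20))).shape
# -> (169, 9336, 32, 20); train_test_split above then cuts axis 1 at train_hours
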
# ----- variant 1: train the fused model with groupwise latent features -----
def main():
    args = parse_args()
    # lamda = args.lamda
    # beta = args.beta
    # use_1d_fea = bool(args.use_1d_fea)
    # use_2d_fea = bool(args.use_2d_fea)
    # fairloss = args.fairloss
    # multivar = bool(args.multivar)
    # 1d/2d features are disabled in this variant, but the Austin branch below
    # still checks these flags, so give them safe defaults
    use_1d_fea = False
    use_2d_fea = False
    suffix = args.suffix
    # the following arguments are for resuming training
    resume_training = args.resume_training
    train_dir = args.train_dir
    checkpoint = args.checkpoint
    place = args.place
    epoch = args.epoch
    learning_rate = args.learning_rate
    encoding_dir = args.encoding_dir

    # print("received arguments: lamda: ", lamda)
    # print("received arguments: beta: ", beta)
    # print("use_1d_fea: ", use_1d_fea)
    # print("use_2d_fea: ", use_2d_fea)
    # print("fairloss: ", fairloss)
    # print("multivar: ", multivar)
    print("resume_training: ", resume_training)
    print("training dir path: ", train_dir)
    print("checkpoint: ", checkpoint)
    print("place: ", place)
    print("epochs to train: ", epoch)
    print("start learning rate: ", learning_rate)

    if checkpoint is not None:
        checkpoint = train_dir + checkpoint
        print('pick up checkpoint: ', checkpoint)

    if place == "Seattle":
        print('load data for Seattle...')
        globals()['TRAINING_STEPS'] = epoch
        globals()['LEARNING_RATE'] = learning_rate
        print('TRAINING_STEPS: ', TRAINING_STEPS)
        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col=0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv('lime_whole_grid_32_20_hourly_1000_171001-181031.csv', index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)
        # a set of region codes (e.g. 10_10) that intersect with the city
        intersect_pos = pd.read_csv('../auxillary_data/intersect_pos_32_20.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data; should use 2018 data
        demo_raw = pd.read_csv(
            '../auxillary_data/whole_grid_32_20_demo_1000_intersect_geodf_2018_corrected.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw)
        # ignore non-intersection cells in test_df; this is for evaluation
        test_df_cut = train_obj.test_df.loc[:, train_obj.test_df.columns.isin(list(intersect_pos_set))]
        # generate binary demo feature according to 2018 city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set)

        if os.path.isfile('bikedata_32_20_171001-181031.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load('bikedata_32_20_171001-181031.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('bikedata_32_20_171001-181031.npy', rawdata_arr)

        print('generating fixed window length training and testing sequences...')
        # raw_seq_arr.shape: (169, 9336, 32, 20)
        raw_seq_arr = train_obj.generate_fixlen_timeseries(rawdata_arr)
        train_arr, test_arr = train_obj.train_test_split(raw_seq_arr)
        print('input train_arr shape: ', train_arr.shape)
        print('input test_arr shape: ', test_arr.shape)
        # train_hours: 8084
        train_hours = datetime_utils.get_total_hour_range(train_obj.train_start_time,
                                                          train_obj.train_end_time)
        total_length = raw_seq_arr.shape[1]  # 9336
        test_len = total_length - train_hours  # 1296
        # 32112
        start_train_hour = datetime_utils.get_total_hour_range('2014-02-01', '2017-09-30')
        # 40152
        end_train_hour = datetime_utils.get_total_hour_range('2014-02-01', '2018-08-31')

        # --------------------------------------------------------------
        print('loading latent representation')
        latent_rep_path = '/home/ubuntu/CTensor/' + encoding_dir + 'latent_rep/final_lat_rep.npy'
        latent_rep = np.load(latent_rep_path)
        print('latent_rep.shape: ', latent_rep.shape)  # should be [42240, 32, 20, 3]
        latent_rep = latent_rep.reshape((45960, 32, 20, 5))
        latent_series = latent_rep[start_train_hour:end_train_hour + test_len + TIMESTEPS, :, :, :]

        ################# add groupwise latent representations ##############
        groupwise_latent_rep_path = '/home/ubuntu/CTensor/autoencoder_alltoall_groupwise_denoise/groupwise_tensors/'
        weather_latent_rep = np.load(groupwise_latent_rep_path + 'weather_grp.npy')
        economics_latent_rep = np.load(groupwise_latent_rep_path + 'economics_grp.npy')
        transportation_latent_rep = np.load(groupwise_latent_rep_path + 'transportation_grp.npy')
        public_service_latent_rep = np.load(groupwise_latent_rep_path + 'public_service_grp.npy')
        print('weather_latent_rep.shape: ', weather_latent_rep.shape)  # should be [42240, 32, 20, 3]
        group_latent_rep = np.concatenate([
            weather_latent_rep, economics_latent_rep,
            transportation_latent_rep, public_service_latent_rep
        ], axis=-1)
        group_latent_series = group_latent_rep[start_train_hour:end_train_hour + test_len + TIMESTEPS, :, :, :]

        ####### add groupwise with ALL2ALL latent representation ###############
        latent_series = np.concatenate([group_latent_series, latent_series], axis=-1)
        # if only groupwise features are used, overwrite with the groupwise series
        latent_series = group_latent_series
        dim = latent_series.shape[-1]
        print('latent_series.shape: ', latent_series.shape)

        latent_seq_arr = train_obj.generate_fixlen_timeseries(latent_series)
        print('input latent_seq_arr shape: ', latent_seq_arr.shape)
        train_latent_arr, test_latent_arr = train_obj.train_test_split(latent_seq_arr)
        print('input train_latent_arr shape: ', train_latent_arr.shape)
        print('input test_latent_arr shape: ', test_latent_arr.shape)
        # ---------------------------------------------------------------

    elif place == "Austin":
        print('load data for Austin...')
        globals()['HEIGHT'] = 28
        globals()['WIDTH'] = 28
        globals()['TIMESTEPS'] = 168
        globals()['BIKE_CHANNEL'] = 1
        globals()['NUM_2D_FEA'] = 3  # street count / street len / poi count
        globals()['NUM_1D_FEA'] = 3
        globals()['BATCH_SIZE'] = 32
        globals()['TRAINING_STEPS'] = epoch
        # globals()['LEARNING_RATE'] = 0.003
        globals()['LEARNING_RATE'] = learning_rate
        print('global HEIGHT: ', HEIGHT)

        train_start_time = '2016-08-01'
        train_end_time = '2017-02-28'
        test_start_time = '2017-03-01 00:00:00'
        test_end_time = '2017-04-13 23:00:00'
        print('train_start_time for Austin: ', train_start_time)
        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col=0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv('../rideaustin/rideaustin_grided_hourly_2000_20160801-20170413.csv',
                              index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)
        # a set of region codes (e.g. 10_10) that intersect with the city
        intersect_pos = pd.read_csv('../rideaustin/austin_intersect_pos_28_28.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data; should use 2017 data
        demo_raw = pd.read_csv(
            '../rideaustin/austin_demo_data/austin_28_28_demo_2000_intersect_geodf_2017.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw, train_start_time, train_end_time,
                          test_start_time, test_end_time)
        # ignore non-intersection cells in test_df; this is for evaluation
        test_df_cut = train_obj.test_df.loc[:, train_obj.test_df.columns.isin(list(intersect_pos_set))]
        # generate binary demo feature according to 2017 Austin city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set, 70.2222, 8.7057,
                                            32.6351, 42.0087, 6.453)

        # load 2d and 1d features
        if use_2d_fea:
            print("use 2d feature")
            # landuse arr: 28 x 28 x 1
            landuse_arr = np.load('../feature_transform/austin_landuse_arr.npy')
            street_arr = np.load('../feature_transform/austin_street_arr.npy')
            # concatenate 2d data
            data_2d = np.concatenate([landuse_arr, street_arr], axis=2)
        else:
            print('ignore 2d data')
            data_2d = None

        if use_1d_fea:
            # weather: (1, 1, 6144, 3)
            weather_arr = np.load('../feature_transform/austin_weather_arr_1by1bytime.npy')
            weather_arr = weather_arr[0, 0, :, :]  # [6144, 3]
            # construct training / testing data for 1d data
            print('generating fixed window length training and testing sequences for 1d data')
            raw_seq_arr_1d = train_obj.generate_fixlen_timeseries(weather_arr)
            # test_series_1d.shape -> (169, 1296, 3)
            train_arr_1d, test_arr_1d = train_obj.train_test_split(raw_seq_arr_1d)
        else:
            print('ignore 1d data')
            train_arr_1d = None
            test_arr_1d = None

        if os.path.isfile('../rideaustin/austin_28_20160801-20170413.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load('../rideaustin/austin_28_20160801-20170413.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('../rideaustin/austin_28_20160801-20170413.npy', rawdata_arr)
    else:
        print("Please input correct city name")

    ####################### city ignorant treatment ################
    # lamda = 0
    # if a training dir is specified to resume training, save_path is the
    # same dir as train_dir; otherwise, create a new dir for training
    # NOTE: dim is set only in the Seattle branch above
    if suffix == '':
        save_path = './bike_groupwise_model_' + str(dim) + '/'
    else:
        save_path = './bike_groupwise_model_' + str(dim) + '_' + suffix + '/'
    if train_dir:
        save_path = train_dir
    print("training dir: ", train_dir)
    print("save_path: ", save_path)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # save demographic array
    # if os.path.isfile(save_path + 'demo_arr_32_20.npy'):
    #     print('loading demographic data array...')
    #     demo_arr = np.load(save_path + 'demo_arr_32_20.npy')
    # else:
    # generate mask arr for city boundary
    demo_mask_arr = train_obj.demo_mask()
    # generate demographic data in array format
    print('generating demo_arr array')
    demo_arr = train_obj.selected_demo_to_tensor()
    if not os.path.isfile(save_path + str(place) + '_demo_arr_' + str(HEIGHT) + '.npy'):
        np.save(save_path + str(place) + '_demo_arr_' + str(HEIGHT) + '.npy', demo_arr)

    # calculate statistics for demo
    # pop_df, pop_ratio_df = train_obj.generate_pop_df()
    # pop_df.to_csv(save_path + 'pop_df.csv')
    # pop_ratio_df.to_csv(save_path + 'pop_ratio_df.csv')
    #
    # demo_pop: if IFG, RFG, or equal mean, use normalized pop;
    # if pairwise, use non-normalized pop
    # if fairloss == "pairwise":
    #     # demo_pop = demo_arr[:, :, 1]  # normalized pop
    #     demo_pop = demo_arr[:, :, 0]  # pop; use pop for pairwise loss
    # else:
    #     demo_pop = demo_arr[:, :, 1]  # normalized pop
    # demo_pop = np.expand_dims(demo_pop, axis=2)
    # print('demo_pop.shape: ', demo_pop.shape)

    timer = str(time.time())
    if not resume_training:
        # Model fusion without fairness
        print('Train Model fusion without fairness')
        conv3d_predicted = fused_model_with_latent_features.Conv3D(
            train_obj, train_arr, test_arr, intersect_pos_set,
            # multi_demo_sensitive, demo_pop, multi_pop_g1, multi_pop_g2,
            # multi_grid_g1, multi_grid_g2, fairloss,
            # train_arr_1d, test_arr_1d, data_2d,
            train_latent_arr, test_latent_arr,
            demo_mask_arr, save_path,
            HEIGHT, WIDTH, TIMESTEPS, BIKE_CHANNEL,
            NUM_2D_FEA, NUM_1D_FEA, BATCH_SIZE,
            TRAINING_STEPS, LEARNING_RATE).conv3d_predicted
    else:
        # resume training
        print('resume training from: ', train_dir)
        conv3d_predicted = fused_model_with_latent_features.Conv3D(
            train_obj, train_arr, test_arr, intersect_pos_set,
            # train_arr_1d, test_arr_1d, data_2d,
            train_latent_arr, test_latent_arr,
            demo_mask_arr, train_dir,
            HEIGHT, WIDTH, TIMESTEPS, BIKE_CHANNEL,
            NUM_2D_FEA, NUM_1D_FEA, BATCH_SIZE,
            TRAINING_STEPS, LEARNING_RATE,
            False, checkpoint, True, train_dir).conv3d_predicted

    conv3d_predicted.index = pd.to_datetime(conv3d_predicted.index)
    conv3d_predicted.to_csv(save_path + 'fused_model_pred_' + timer + '.csv')
    # convlstm_predicted = pd.read_csv(save_path + 'convlstm_predicted.csv', index_col=0)
    # convlstm_predicted.index = pd.to_datetime(convlstm_predicted.index)
    eval_obj4 = evaluation.evaluation(test_df_cut, conv3d_predicted, train_obj.demo_raw)
    diff_df = eval_obj4.group_difference()
    diff_df.to_csv(save_path + str(place) + '_evaluation.csv')
    finegrain_diff_df = eval_obj4.individual_difference()
    finegrain_diff_df.to_csv(save_path + 'IFG_eval.csv')
    print('rmse for conv3d: ', eval_obj4.rmse_val)
    print('mae for conv3d: ', eval_obj4.mae_val)
    print('mape for conv3d: ', eval_obj4.mape_val)

    # plot train/test accuracy
    train_test = pd.read_csv(save_path + 'ecoch_res_df_' + '.csv')
    train_test = train_test.loc[:, ~train_test.columns.str.contains('^Unnamed')]
    total_loss = train_test[['train_loss', 'test_loss']].plot()
    plt.savefig(save_path + 'total_loss_finish.png')
    acc_loss = train_test[['train_acc', 'test_acc']].plot()
    plt.savefig(save_path + 'acc_loss_finish.png')
    # fair_loss = train_test[['train_fair', 'test_fair']].plot()
    # plt.savefig(save_path + 'fair_loss_finish.png')
    plt.close()

    txt_name = save_path + 'latent_fea_df_' + timer + '.txt'
    with open(txt_name, 'w') as the_file:
        the_file.write('Only account for grids that intersect with city boundary \n')
        # the_file.write('lamda\n')
        # the_file.write(str(lamda) + '\n')
        # the_file.write('beta\n')
        # the_file.write(str(beta) + '\n')
        the_file.write('dim\n')
        the_file.write(str(dim) + '\n')
        the_file.write('latent_rep_path\n')
        the_file.write(str(latent_rep_path) + '\n')
        # the_file.write('use_1d_fea\n')
        # the_file.write(str(use_1d_fea) + '\n')
        # the_file.write('use_2d_fea\n')
        # the_file.write(str(use_2d_fea) + '\n')
        # the_file.write('fairloss\n')
        # the_file.write(str(fairloss) + '\n')
        # the_file.write('multivar\n')
        # the_file.write(str(multivar) + '\n')
        the_file.write('learning rate\n')
        the_file.write(str(LEARNING_RATE) + '\n')
        the_file.write('rmse for conv3d\n')
        the_file.write(str(eval_obj4.rmse_val) + '\n')
        the_file.write('mae for conv3d\n')
        the_file.write(str(eval_obj4.mae_val) + '\n')
        the_file.write('mape for conv3d\n')
        the_file.write(str(eval_obj4.mape_val) + '\n')
        the_file.close()
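
# NOTE (editor sketch): the latent-series slice in main() above is meant to
# line up the 2014-02-01..2019-05-01 feature tensors with the
# 2017-10-01..2018-10-31 bikeshare tensor. Spelled out with the values from
# the comments above (using _sketch_get_total_hour_range from the top of this
# file):
_start_train_hour = _sketch_get_total_hour_range('2014-02-01', '2017-09-30')  # 32112
_end_train_hour = _sketch_get_total_hour_range('2014-02-01', '2018-08-31')    # 40152
_test_len, _timesteps = 1296, 168
# slice length = 40152 + 1296 + 168 - 32112 = 9504 hours, exactly the length
# of the 171001-181031 bikeshare array
assert _end_train_hour + _test_len + _timesteps - _start_train_hour == 9504
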
# ----- variant 2: train the fused model with original 1d/2d/3d features -----
def main():
    args = parse_args()
    lamda = args.lamda
    beta = args.beta
    use_1d_fea = bool(args.use_1d_fea)
    use_2d_fea = bool(args.use_2d_fea)
    fairloss = args.fairloss
    multivar = bool(args.multivar)
    suffix = args.suffix
    # the following arguments are for resuming training
    resume_training = args.resume_training
    train_dir = args.train_dir
    checkpoint = args.checkpoint
    place = args.place
    epoch = args.epoch
    learning_rate = args.learning_rate

    print("received arguments: lamda: ", lamda)
    print("received arguments: beta: ", beta)
    print("use_1d_fea: ", use_1d_fea)
    print("use_2d_fea: ", use_2d_fea)
    print("fairloss: ", fairloss)
    print("multivar: ", multivar)
    print("resume_training: ", resume_training)
    print("training dir path: ", train_dir)
    print("checkpoint: ", checkpoint)
    print("place: ", place)
    print("epochs to train: ", epoch)
    print("start learning rate: ", learning_rate)

    if checkpoint is not None:
        checkpoint = train_dir + checkpoint
        print('pick up checkpoint: ', checkpoint)

    if place == "Seattle":
        print('load data for Seattle...')
        globals()['TRAINING_STEPS'] = epoch
        globals()['LEARNING_RATE'] = learning_rate
        print('TRAINING_STEPS: ', TRAINING_STEPS)
        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col=0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv('lime_whole_grid_32_20_hourly_1000_171001-181031.csv', index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)
        # a set of region codes (e.g. 10_10) that intersect with the city
        intersect_pos = pd.read_csv('../auxillary_data/intersect_pos_32_20.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data; should use 2018 data
        demo_raw = pd.read_csv(
            '../auxillary_data/whole_grid_32_20_demo_1000_intersect_geodf_2018_corrected.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw)
        # ignore non-intersection cells in test_df; this is for evaluation
        test_df_cut = train_obj.test_df.loc[:, train_obj.test_df.columns.isin(list(intersect_pos_set))]
        # generate binary demo feature according to 2018 city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set)

        if os.path.isfile('bikedata_32_20_171001-181031.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load('bikedata_32_20_171001-181031.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('bikedata_32_20_171001-181031.npy', rawdata_arr)

        ################# LOAD DATA ######################
        # ---- reading data ---------------------#
        print('Reading 1d, 2d, and 3d data')
        path_1d = '../data_processing/1d_source_data/'
        path_2d = '../data_processing/2d_source_data/'
        path_3d = '../data_processing/3d_source_data/'
        # 1d
        weather_arr = np.load(path_1d + 'weather_arr_20140201_20190501.npy')
        airquality_arr = np.load(path_1d + 'air_quality_arr_20140201_20190501.npy')
        print('weather_arr.shape: ', weather_arr.shape)
        print('airquality_arr.shape: ', airquality_arr.shape)
        weather_arr = weather_arr[0, 0, :, :]
        airquality_arr = airquality_arr[0, 0, :, :]
        # 2d
        house_price_arr = np.load(path_2d + 'house_price.npy')
        POI_business_arr = np.load(path_2d + 'POI_business.npy')
        POI_food_arr = np.load(path_2d + 'POI_food.npy')
        POI_government_arr = np.load(path_2d + 'POI_government.npy')
        POI_hospitals_arr = np.load(path_2d + 'POI_hospitals.npy')
        POI_publicservices_arr = np.load(path_2d + 'POI_publicservices.npy')
        POI_recreation_arr = np.load(path_2d + 'POI_recreation.npy')
        POI_school_arr = np.load(path_2d + 'POI_school.npy')
        POI_transportation_arr = np.load(path_2d + 'POI_transportation.npy')
        seattle_street_arr = np.load(path_2d + 'seattle_street.npy')
        total_flow_count_arr = np.load(path_2d + 'total_flow_count.npy')
        transit_routes_arr = np.load(path_2d + 'transit_routes.npy')
        transit_signals_arr = np.load(path_2d + 'transit_signals.npy')
        transit_stop_arr = np.load(path_2d + 'transit_stop.npy')
        slope_arr = np.load(path_2d + 'slope_arr.npy')
        bikelane_arr = np.load(path_2d + 'bikelane_arr.npy')
        print('transit_routes_arr.shape: ', transit_routes_arr.shape)
        print('POI_recreation_arr.shape: ', POI_recreation_arr.shape)
        # 3d
        building_permit_arr = np.load(path_3d + 'building_permit_arr_20140201_20190501_python3.npy')
        collisions_arr = np.load(path_3d + 'collisions_arr_20140201_20190501_python3.npy')
        crime_arr = np.load(path_3d + 'crime_arr_20140201_20190501_python3.npy')
        seattle911calls_arr = np.load(path_3d + 'seattle911calls_arr_20140201_20190501.npy')
        print('building_permit_arr.shape:', building_permit_arr.shape)
        print('collisions_arr.shape: ', collisions_arr.shape)
        print('crime_arr.shape: ', crime_arr.shape)
        print('seattle911calls_arr.shape: ', seattle911calls_arr.shape)
        # building permits and collisions are daily; repeat each day 24 times
        # to get hourly sequences
        building_permit_arr_seq_extend = np.repeat(building_permit_arr, 24, axis=0)
        collisions_arr_seq_extend = np.repeat(collisions_arr, 24, axis=0)
        print('building_permit_arr_seq_extend.shape: ', building_permit_arr_seq_extend.shape)

        # train_hours: 8084
        print('generating fixed window length training and testing sequences...')
        raw_seq_arr = train_obj.generate_fixlen_timeseries(rawdata_arr)
        train_arr, test_arr = train_obj.train_test_split(raw_seq_arr)
        print('input train_arr shape: ', train_arr.shape)
        train_hours = datetime_utils.get_total_hour_range(train_obj.train_start_time,
                                                          train_obj.train_end_time)
        total_length = raw_seq_arr.shape[1]  # 9336
        test_len = total_length - train_hours  # 1296
        # 32112
        start_train_hour = datetime_utils.get_total_hour_range('2014-02-01', '2017-09-30')
        # 40152
        end_train_hour = datetime_utils.get_total_hour_range('2014-02-01', '2018-08-31')
        start_idx = start_train_hour
        end_idx = end_train_hour + test_len + TIMESTEPS

        # construct dictionary
        print('use dictionary to organize data')
        rawdata_1d_dict = {
            'precipitation': np.expand_dims(weather_arr[start_idx:end_idx, 0], axis=1),
            'temperature': np.expand_dims(weather_arr[start_idx:end_idx, 1], axis=1),
            'pressure': np.expand_dims(weather_arr[start_idx:end_idx, 2], axis=1),
            'airquality': airquality_arr[start_idx:end_idx, :],
        }
        rawdata_2d_dict = {
            'house_price': house_price_arr,
            'POI_business': POI_business_arr,
            'POI_food': POI_food_arr,
            'POI_government': POI_government_arr,
            'POI_hospitals': POI_hospitals_arr,
            'POI_publicservices': POI_publicservices_arr,
            'POI_recreation': POI_recreation_arr,
            'POI_school': POI_school_arr,
            'POI_transportation': POI_transportation_arr,
            'seattle_street': seattle_street_arr,
            'total_flow_count': total_flow_count_arr,
            'transit_routes': transit_routes_arr,
            'transit_signals': transit_signals_arr,
            'transit_stop': transit_stop_arr,
            'slope': slope_arr,
            'bikelane': bikelane_arr,
        }
        rawdata_3d_dict = {
            'building_permit': np.expand_dims(building_permit_arr_seq_extend[start_idx:end_idx, :, :], axis=3),
            'collisions': np.expand_dims(collisions_arr_seq_extend[start_idx:end_idx, :, :], axis=3),  # (7, 45840, 32, 20)
            'seattle911calls': np.expand_dims(seattle911calls_arr[start_idx:end_idx, :, :], axis=3)  # (45984, 32, 20)
        }

        ################ original code ###########################
        # load 2d and 1d features
        if use_2d_fea:
            print("use 2d feature")
            # bikelane_arr = np.load('../feature_transform/bikelane_arr.npy')
            # slope_arr = np.load('../feature_transform/slope_arr.npy')
            # concatenate 2d data
            # data_2d = np.concatenate([slope_arr, bikelane_arr], axis=2)
            data_2d = np.concatenate(list(rawdata_2d_dict.values()), axis=2)
        else:
            print('ignore 2d data')
            data_2d = None

        if use_1d_fea:
            # weather: (1, 1, 9504, 3) or (9504, 3)
            # weather_arr = np.load('../feature_transform/weather_arr_1by1by9504.npy')
            # weather_arr = weather_arr[0, 0, :, :]  # [9504, 3]
            # construct training / testing data for 1d data
            print('generating fixed window length training and testing sequences for 1d data')
            # raw_seq_arr_1d = train_obj.generate_fixlen_timeseries(weather_arr)
            # train_arr_1d, test_arr_1d = train_obj.train_test_split(raw_seq_arr_1d)
            data_1d = np.concatenate(list(rawdata_1d_dict.values()), axis=1)
            raw_seq_arr_1d = train_obj.generate_fixlen_timeseries(data_1d)
            train_arr_1d, test_arr_1d = train_obj.train_test_split(raw_seq_arr_1d)
        else:
            print('ignore 1d data')
            train_arr_1d = None
            test_arr_1d = None

        #### add 3D data #############################################
        data_3d = np.concatenate(list(rawdata_3d_dict.values()), axis=3)
        print('data_3d.shape: ', data_3d.shape)
        # fea_seq_arr_3d = train_obj.generate_fixlen_timeseries(data_3d)
        # fea_train_arr_3d, fea_test_arr_3d = train_obj.train_test_split(fea_seq_arr_3d)
        # print('fea_train_arr_3d.shape: ', fea_train_arr_3d.shape)  # (169, 8040, 32, 20, 3)
        # print('train_arr.shape: ', train_arr.shape)  # (169, 8040, 32, 20)
        # concatenate with bikeshare data
        # train_arr = np.expand_dims(train_arr, axis=4)
        # test_arr = np.expand_dims(test_arr, axis=4)
        # train_arr = np.concatenate([train_arr, fea_train_arr_3d], axis=4)
        # test_arr = np.concatenate([test_arr, fea_test_arr_3d], axis=4)
        # print('train_arr.shape: ', train_arr.shape)

        # guard the None cases so running with either flag off does not crash
        if data_2d is not None:
            globals()['NUM_2D_FEA'] = data_2d.shape[-1]
        if train_arr_1d is not None:
            globals()['NUM_1D_FEA'] = train_arr_1d.shape[-1]
        globals()['NUM_3D_FEA'] = data_3d.shape[-1]

    elif place == "Austin":
        print('load data for Austin...')
        globals()['HEIGHT'] = 28
        globals()['WIDTH'] = 28
        globals()['TIMESTEPS'] = 168
        globals()['BIKE_CHANNEL'] = 1
        globals()['NUM_2D_FEA'] = 3  # street count / street len / poi count
        globals()['NUM_1D_FEA'] = 3
        globals()['BATCH_SIZE'] = 32
        globals()['TRAINING_STEPS'] = epoch
        # globals()['LEARNING_RATE'] = 0.003
        globals()['LEARNING_RATE'] = learning_rate
        print('global HEIGHT: ', HEIGHT)

        train_start_time = '2016-08-01'
        train_end_time = '2017-02-28'
        test_start_time = '2017-03-01 00:00:00'
        test_end_time = '2017-04-13 23:00:00'
        print('train_start_time for Austin: ', train_start_time)
        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col=0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv('../rideaustin/rideaustin_grided_hourly_2000_20160801-20170413.csv',
                              index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)
        # a set of region codes (e.g. 10_10) that intersect with the city
        intersect_pos = pd.read_csv('../rideaustin/austin_intersect_pos_28_28.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data; should use 2017 data
        demo_raw = pd.read_csv(
            '../rideaustin/austin_demo_data/austin_28_28_demo_2000_intersect_geodf_2017.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw, train_start_time, train_end_time,
                          test_start_time, test_end_time)
        # ignore non-intersection cells in test_df; this is for evaluation
        test_df_cut = train_obj.test_df.loc[:, train_obj.test_df.columns.isin(list(intersect_pos_set))]
        # generate binary demo feature according to 2017 Austin city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set, 70.2222, 8.7057,
                                            32.6351, 42.0087, 6.453)

        # load 2d and 1d features
        if use_2d_fea:
            print("use 2d feature")
            # landuse arr: 28 x 28 x 1
            landuse_arr = np.load('../feature_transform/austin_landuse_arr.npy')
            street_arr = np.load('../feature_transform/austin_street_arr.npy')
            # concatenate 2d data
            data_2d = np.concatenate([landuse_arr, street_arr], axis=2)
        else:
            print('ignore 2d data')
            data_2d = None

        if use_1d_fea:
            # weather: (1, 1, 6144, 3)
            weather_arr = np.load('../feature_transform/austin_weather_arr_1by1bytime.npy')
            weather_arr = weather_arr[0, 0, :, :]  # [6144, 3]
            # construct training / testing data for 1d data
            print('generating fixed window length training and testing sequences for 1d data')
            raw_seq_arr_1d = train_obj.generate_fixlen_timeseries(weather_arr)
            # test_series_1d.shape -> (169, 1296, 3)
            train_arr_1d, test_arr_1d = train_obj.train_test_split(raw_seq_arr_1d)
        else:
            print('ignore 1d data')
            train_arr_1d = None
            test_arr_1d = None

        if os.path.isfile('../rideaustin/austin_28_20160801-20170413.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load('../rideaustin/austin_28_20160801-20170413.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('../rideaustin/austin_28_20160801-20170413.npy', rawdata_arr)
    else:
        print("Please input correct city name")

    ####################### city ignorant treatment ################
    # lamda = 0
    # if a training dir is specified to resume training, save_path is the
    # same dir as train_dir; otherwise, create a new dir for training
    if suffix == '':
        save_path = ('./fusion_model_originalfeatures_' + str(place) + '_' +
                     str(fairloss) + '_' + str(use_1d_fea) + '_' +
                     str(use_2d_fea) + '_' + str(multivar) + '/')
    else:
        save_path = ('./fusion_model_originalfeatures_' + str(place) + '_' +
                     str(fairloss) + '_' + str(use_1d_fea) + '_' +
                     str(use_2d_fea) + '_' + str(multivar) + '_' + suffix + '/')
    if train_dir:
        save_path = train_dir
    print("training dir: ", train_dir)
    print("save_path: ", save_path)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # generate mask arr for city boundary
    demo_mask_arr = train_obj.demo_mask()
    # generate demographic data in array format
    print('generating demo_arr array')
    demo_arr = train_obj.selected_demo_to_tensor()
    if not os.path.isfile(save_path + str(place) + '_demo_arr_' + str(HEIGHT) + '.npy'):
        np.save(save_path + str(place) + '_demo_arr_' + str(HEIGHT) + '.npy', demo_arr)

    timer = str(time.time())
    if not resume_training:
        # Model fusion without fairness
        print('Train Model fusion without fairness')
        conv3d_predicted = fused_model_augment.Conv3D(
            train_obj, train_arr, test_arr, intersect_pos_set,
            # multi_demo_sensitive, demo_pop, multi_pop_g1, multi_pop_g2,
            # multi_grid_g1, multi_grid_g2, fairloss,
            train_arr_1d, test_arr_1d, data_2d, data_3d,
            demo_mask_arr, save_path,
            HEIGHT, WIDTH, TIMESTEPS, BIKE_CHANNEL,
            NUM_3D_FEA, NUM_2D_FEA, NUM_1D_FEA,
            BATCH_SIZE, TRAINING_STEPS, LEARNING_RATE).conv3d_predicted
    else:
        # resume training
        print('resume training from: ', train_dir)
        conv3d_predicted = fused_model_augment.Conv3D(
            train_obj, train_arr, test_arr, intersect_pos_set,
            # multi_demo_sensitive, demo_pop, multi_pop_g1, multi_pop_g2,
            # multi_grid_g1, multi_grid_g2, fairloss,
            train_arr_1d, test_arr_1d, data_2d, data_3d,
            demo_mask_arr, train_dir,
            HEIGHT, WIDTH, TIMESTEPS, BIKE_CHANNEL,
            NUM_3D_FEA, NUM_2D_FEA, NUM_1D_FEA,
            BATCH_SIZE, TRAINING_STEPS, LEARNING_RATE,
            False, checkpoint, True, train_dir).conv3d_predicted

    conv3d_predicted.index = pd.to_datetime(conv3d_predicted.index)
    conv3d_predicted.to_csv(save_path + 'fused_model_pred_' + timer + '.csv')
    # convlstm_predicted = pd.read_csv(save_path + 'convlstm_predicted.csv', index_col=0)
    # convlstm_predicted.index = pd.to_datetime(convlstm_predicted.index)
    eval_obj4 = evaluation.evaluation(test_df_cut, conv3d_predicted, train_obj.demo_raw)
    diff_df = eval_obj4.group_difference()
    diff_df.to_csv(save_path + str(place) + '_evaluation.csv')
    finegrain_diff_df = eval_obj4.individual_difference()
    finegrain_diff_df.to_csv(save_path + 'IFG_eval.csv')
    print('rmse for conv3d: ', eval_obj4.rmse_val)
    print('mae for conv3d: ', eval_obj4.mae_val)

    # plot train/test accuracy
    train_test = pd.read_csv(save_path + 'ecoch_res_df_' + '.csv')
    train_test = train_test.loc[:, ~train_test.columns.str.contains('^Unnamed')]
    total_loss = train_test[['train_loss', 'test_loss']].plot()
    plt.savefig(save_path + 'total_loss_finish.png')
    acc_loss = train_test[['train_acc', 'test_acc']].plot()
    plt.savefig(save_path + 'acc_loss_finish.png')
    # fair_loss = train_test[['train_fair', 'test_fair']].plot()
    # plt.savefig(save_path + 'fair_loss_finish.png')
    plt.close()

    txt_name = save_path + 'fused_model_df_' + str(beta) + '_' + timer + '.txt'
    with open(txt_name, 'w') as the_file:
        the_file.write('Only account for grids that intersect with city boundary \n')
        the_file.write('place\n')
        the_file.write(str(place) + '\n')
        the_file.write('use_1d_fea\n')
        the_file.write(str(use_1d_fea) + '\n')
        the_file.write('use_2d_fea\n')
        the_file.write(str(use_2d_fea) + '\n')
        the_file.write('learning rate\n')
        the_file.write(str(LEARNING_RATE) + '\n')
        the_file.write('rmse for conv3d\n')
        the_file.write(str(eval_obj4.rmse_val) + '\n')
        the_file.write('mae for conv3d\n')
        the_file.write(str(eval_obj4.mae_val) + '\n')
        the_file.close()
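
# NOTE (editor sketch): evaluation.evaluation is an external module. The
# rmse_val / mae_val / mape_val attributes read above are assumed to be the
# usual error metrics between the cut test frame and the predictions; an
# illustrative NumPy version (the real class's alignment and masking of
# non-intersecting cells may differ):
import numpy as np

def _sketch_error_metrics(y_true, y_pred, eps=1.0):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    err = y_pred - y_true
    rmse = np.sqrt(np.mean(err ** 2))
    mae = np.mean(np.abs(err))
    # eps guards against zero trip counts in the MAPE denominator
    mape = np.mean(np.abs(err) / np.maximum(np.abs(y_true), eps))
    return rmse, mae, mape
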
# ----- variant 3: train the fused model with intermediate feature-map groups -----
def main():
    args = parse_args()
    # lamda = args.lamda
    # beta = args.beta
    # use_1d_fea = bool(args.use_1d_fea)
    # use_2d_fea = bool(args.use_2d_fea)
    # fairloss = args.fairloss
    # multivar = bool(args.multivar)
    # 1d/2d features are disabled in this variant, but the Austin branch below
    # still checks these flags, so give them safe defaults
    use_1d_fea = False
    use_2d_fea = False
    suffix = args.suffix
    # the following arguments are for resuming training
    resume_training = args.resume_training
    train_dir = args.train_dir
    checkpoint = args.checkpoint
    place = args.place
    epoch = args.epoch
    learning_rate = args.learning_rate

    # print("received arguments: lamda: ", lamda)
    # print("received arguments: beta: ", beta)
    # print("use_1d_fea: ", use_1d_fea)
    # print("use_2d_fea: ", use_2d_fea)
    # print("fairloss: ", fairloss)
    # print("multivar: ", multivar)
    print("resume_training: ", resume_training)
    print("training dir path: ", train_dir)
    print("checkpoint: ", checkpoint)
    print("place: ", place)
    print("epochs to train: ", epoch)
    print("start learning rate: ", learning_rate)

    if checkpoint is not None:
        checkpoint = train_dir + checkpoint
        print('pick up checkpoint: ', checkpoint)

    if place == "Seattle":
        print('load data for Seattle...')
        globals()['TRAINING_STEPS'] = epoch
        globals()['LEARNING_RATE'] = learning_rate
        print('TRAINING_STEPS: ', TRAINING_STEPS)
        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col=0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv('lime_whole_grid_32_20_hourly_1000_171001-181031.csv', index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)
        # a set of region codes (e.g. 10_10) that intersect with the city
        intersect_pos = pd.read_csv('../auxillary_data/intersect_pos_32_20.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data; should use 2018 data
        demo_raw = pd.read_csv(
            '../auxillary_data/whole_grid_32_20_demo_1000_intersect_geodf_2018_corrected.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw)
        # ignore non-intersection cells in test_df; this is for evaluation
        test_df_cut = train_obj.test_df.loc[:, train_obj.test_df.columns.isin(list(intersect_pos_set))]
        # generate binary demo feature according to 2018 city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set)

        if os.path.isfile('bikedata_32_20_171001-181031.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load('bikedata_32_20_171001-181031.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('bikedata_32_20_171001-181031.npy', rawdata_arr)

        print('generating fixed window length training and testing sequences...')
        # raw_seq_arr.shape: (169, 9336, 32, 20)
        raw_seq_arr = train_obj.generate_fixlen_timeseries(rawdata_arr)
        train_arr, test_arr = train_obj.train_test_split(raw_seq_arr)
        print('input train_arr shape: ', train_arr.shape)
        # train_hours: 8084
        train_hours = datetime_utils.get_total_hour_range(train_obj.train_start_time,
                                                          train_obj.train_end_time)
        total_length = raw_seq_arr.shape[1]  # 9336
        test_len = total_length - train_hours  # 1296
        # 32112
        start_train_hour = datetime_utils.get_total_hour_range('2014-02-01', '2017-09-30')
        # 40152
        end_train_hour = datetime_utils.get_total_hour_range('2014-02-01', '2018-08-31')

        # --------------------------------------------------------------
        print('loading latent representation')
        # latent_rep_path = '/home/ubuntu/CTensor/predictions/autoencoder_v1_Seattle/inference/infer_latent_representation.npy'
        # latent_rep_path = '/home/ubuntu/CTensor/autoencoder_alltoall/autoencoder_v2_dim1_epoch15/train_lat_rep.npy'
        # latent_rep_path = '/home/ubuntu/CTensor/toy_examples/toy_autoencoder_v6_cos_dim3_trail/inference/encoded_list'
        latent_rep_path = '/home/ubuntu/CTensor/autoencoder_featuremap_grouping/autoencoder_v6_cos_dim5_firstlevel_from50_ag_1d2dcalc_n7/inference/encoded_list'
        with open(latent_rep_path, 'rb') as file:
            encoded_list = pickle.load(file)
        print(len(encoded_list[0]))

        # rearrange encoded_list
        # original nesting: [# of batches][# of datasets] -> [batch_size, ...]
        # rearrange into:   [# of datasets][# of batches] -> [batch_size, ...]
        # (a self-contained sketch of this rearrangement is at the end of this file)
        encoded_list_rearrange = [[None for j in range(len(encoded_list))]
                                  for i in range(len(encoded_list[0]))]
        for i, batch in enumerate(encoded_list):
            for j, ds in enumerate(batch):
                encoded_list_rearrange[j][i] = ds
        encoded_list_rearrange_concat = [
            np.concatenate(batch, axis=0) for batch in encoded_list_rearrange
        ]
        print('encoded_list_rearrange_concat[0].shape',
              encoded_list_rearrange_concat[0].shape)

        keys_list = []
        n_groups = 7
        for i in range(1, n_groups + 1):
            keys_list.append('group_' + str(i))
        feature_map_dict = dict(zip(keys_list, encoded_list_rearrange_concat))

        # get the groups that we need (group 1, group 4, and group 5) and concat
        '''
        grouping_dict = {
            'group_1': ['weather', 'house_price', 'POI_business', 'POI_food',
                        'POI_government', 'POI_publicservices', 'POI_recreation',
                        'POI_transportation', 'transit_routes'],
            'group_2': ['airquality'],
            'group_3': ['POI_hospitals', 'building_permit', 'collisions', 'seattle911calls'],
            'group_4': ['POI_school', 'slope'],
            'group_5': ['seattle_street', 'bikelane'],
            'group_6': ['total_flow_count'],
            'group_7': ['transit_signals', 'transit_stop'],
        }
        '''
        temp_arr1 = feature_map_dict['group_1']  # (train_hours, 32, 20, dim)
        temp_arr2 = feature_map_dict['group_4']
        temp_arr3 = feature_map_dict['group_5']
        latent_rep = np.concatenate([temp_arr1, temp_arr2, temp_arr3], axis=-1)
        # latent_rep = temp_arr1
        # latent_rep = np.load(latent_rep_path)
        # (41616, 1, 32, 20, 1) for v1, (41616, 32, 20, 1) for v2
        print('latent_rep.shape: ', latent_rep.shape)
        # latent_rep = latent_rep.reshape((41616, 32, 20, 5))
        latent_train_series = latent_rep[start_train_hour:end_train_hour, :, :, :]
        latent_test_series = latent_rep[end_train_hour:end_train_hour + test_len, :, :, :]
        # latent_train_series = np.squeeze(latent_train_series, axis=1)
        # latent_test_series = np.squeeze(latent_test_series, axis=1)
        print('latent_test_series.shape: ', latent_test_series.shape)
        dim = latent_test_series.shape[-1]
        # ---------------------------------------------------------------

    elif place == "Austin":
        print('load data for Austin...')
        globals()['HEIGHT'] = 28
        globals()['WIDTH'] = 28
        globals()['TIMESTEPS'] = 168
        globals()['BIKE_CHANNEL'] = 1
        globals()['NUM_2D_FEA'] = 3  # street count / street len / poi count
        globals()['NUM_1D_FEA'] = 3
        globals()['BATCH_SIZE'] = 32
        globals()['TRAINING_STEPS'] = epoch
        # globals()['LEARNING_RATE'] = 0.003
        globals()['LEARNING_RATE'] = learning_rate
        print('global HEIGHT: ', HEIGHT)

        train_start_time = '2016-08-01'
        train_end_time = '2017-02-28'
        test_start_time = '2017-03-01 00:00:00'
        test_end_time = '2017-04-13 23:00:00'
        print('train_start_time for Austin: ', train_start_time)
        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col=0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv('../rideaustin/rideaustin_grided_hourly_2000_20160801-20170413.csv',
                              index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)
        # a set of region codes (e.g. 10_10) that intersect with the city
        intersect_pos = pd.read_csv('../rideaustin/austin_intersect_pos_28_28.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data; should use 2017 data
        demo_raw = pd.read_csv(
            '../rideaustin/austin_demo_data/austin_28_28_demo_2000_intersect_geodf_2017.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw, train_start_time, train_end_time,
                          test_start_time, test_end_time)
        # ignore non-intersection cells in test_df; this is for evaluation
        test_df_cut = train_obj.test_df.loc[:, train_obj.test_df.columns.isin(list(intersect_pos_set))]
        # generate binary demo feature according to 2017 Austin city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set, 70.2222, 8.7057,
                                            32.6351, 42.0087, 6.453)

        # load 2d and 1d features
        if use_2d_fea:
            print("use 2d feature")
            # landuse arr: 28 x 28 x 1
            landuse_arr = np.load('../feature_transform/austin_landuse_arr.npy')
            street_arr = np.load('../feature_transform/austin_street_arr.npy')
            # concatenate 2d data
            data_2d = np.concatenate([landuse_arr, street_arr], axis=2)
        else:
            print('ignore 2d data')
            data_2d = None

        if use_1d_fea:
            # weather: (1, 1, 6144, 3)
            weather_arr = np.load('../feature_transform/austin_weather_arr_1by1bytime.npy')
            weather_arr = weather_arr[0, 0, :, :]  # [6144, 3]
            # construct training / testing data for 1d data
            print('generating fixed window length training and testing sequences for 1d data')
            raw_seq_arr_1d = train_obj.generate_fixlen_timeseries(weather_arr)
            # test_series_1d.shape -> (169, 1296, 3)
            train_arr_1d, test_arr_1d = train_obj.train_test_split(raw_seq_arr_1d)
        else:
            print('ignore 1d data')
            train_arr_1d = None
            test_arr_1d = None

        if os.path.isfile('../rideaustin/austin_28_20160801-20170413.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load('../rideaustin/austin_28_20160801-20170413.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('../rideaustin/austin_28_20160801-20170413.npy', rawdata_arr)
    else:
        print("Please input correct city name")

    ####################### city ignorant treatment ################
    # lamda = 0
    # if a training dir is specified to resume training, save_path is the
    # same dir as train_dir; otherwise, create a new dir for training
    # NOTE: dim is set only in the Seattle branch above
    if suffix == '':
        save_path = './bike_intermediatefea_model_' + str(dim) + '/'
    else:
        save_path = './bike_intermediatefea_model_' + str(dim) + '_' + suffix + '/'
    if train_dir:
        save_path = train_dir
    print("training dir: ", train_dir)
    print("save_path: ", save_path)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # save demographic array
    # if os.path.isfile(save_path + 'demo_arr_32_20.npy'):
    #     print('loading demographic data array...')
    #     demo_arr = np.load(save_path + 'demo_arr_32_20.npy')
    # else:
    # generate mask arr for city boundary
    demo_mask_arr = train_obj.demo_mask()
    # generate demographic data in array format
    print('generating demo_arr array')
    demo_arr = train_obj.selected_demo_to_tensor()
    if not os.path.isfile(save_path + str(place) + '_demo_arr_' + str(HEIGHT) + '.npy'):
        np.save(save_path + str(place) + '_demo_arr_' + str(HEIGHT) + '.npy', demo_arr)

    # calculate statistics for demo
    # pop_df, pop_ratio_df = train_obj.generate_pop_df()
    # pop_df.to_csv(save_path + 'pop_df.csv')
    # pop_ratio_df.to_csv(save_path + 'pop_ratio_df.csv')
    #
    # demo_pop: if IFG, RFG, or equal mean, use normalized pop;
    # if pairwise, use non-normalized pop
    # if fairloss == "pairwise":
    #     # demo_pop = demo_arr[:, :, 1]  # normalized pop
    #     demo_pop = demo_arr[:, :, 0]  # pop; use pop for pairwise loss
    # else:
    #     demo_pop = demo_arr[:, :, 1]  # normalized pop
    # demo_pop = np.expand_dims(demo_pop, axis=2)
    # print('demo_pop.shape: ', demo_pop.shape)

    # demo sensitive
    '''
    ['pop', 'normalized_pop', 'bi_caucasian', 'bi_age', 'bi_high_incm',
     'bi_edu_univ', 'bi_nocar_hh', 'white_pop', 'age65_under', 'edu_uni']
    '''
    # demo_sensitive = demo_arr[:, :, 2]  # caucasian
    # demo_sensitive = np.expand_dims(demo_sensitive, axis=2)

    # normalized population of each group
    '''
    caucasian    non_caucasian
    senior       young
    high_incm    low_incm
    high_edu     low_edu
    fewer_car    more_car
    '''
    # pop_g1 = pop_df['caucasian'].values[1]
    # pop_g2 = pop_df['non_caucasian'].values[1]
    #
    # if fairloss == 'RFG':  # metric 1: region-based
    #     if multivar:
    #         print('MULTIVAR')
    #         fea_dim = [2, 3, 5]  # caucasian, age, edu_univ
    #         multi_pop_g1 = [pop_df['caucasian'].values[1], pop_df['young'].values[1], pop_df['high_edu'].values[1]]
    #         multi_pop_g2 = [pop_df['non_caucasian'].values[1], pop_df['senior'].values[1], pop_df['low_edu'].values[1]]
    #     else:  # single var
    #         fea_dim = [2]  # binary caucasian
    #         multi_pop_g1 = [pop_df['caucasian'].values[1]]
    #         multi_pop_g2 = [pop_df['non_caucasian'].values[1]]
    #     multi_demo_sensitive = demo_arr[:, :, fea_dim]  # caucasian
    # elif fairloss == "IFG":
    #     if multivar:
    #         print('MULTIVAR')
    #         fea_dim = [7, 8, 9]  # multivar
    #     else:
    #         fea_dim = [7]  # white percent
    #     multi_demo_sensitive = demo_arr[:, :, fea_dim]  # caucasian
    #     multi_pop_g1 = [pop_df['caucasian'].values[1], pop_df['young'].values[1], pop_df['high_edu'].values[1]]
    #     multi_pop_g2 = [pop_df['non_caucasian'].values[1], pop_df['senior'].values[1], pop_df['low_edu'].values[1]]
    # elif fairloss == "equalmean":
    #     fea_dim = [2]  # binary caucasian
    #     multi_demo_sensitive = demo_arr[:, :, fea_dim]  # caucasian
    #     multi_pop_g1 = [pop_df['caucasian'].values[1]]
    #     multi_pop_g2 = [pop_df['non_caucasian'].values[1]]
    #     multi_grid_g1 = [pop_df['caucasian'].values[0]]
    #     multi_grid_g2 = [pop_df['non_caucasian'].values[0]]
    # elif fairloss == "pairwise":
    #     multi_pop_g1 = [pop_df['caucasian'].values[1]]
    #     multi_pop_g2 = [pop_df['non_caucasian'].values[1]]
    #     fea_dim = [2]  # binary caucasian
    #     multi_demo_sensitive = demo_arr[:, :, fea_dim]  # caucasian
    #     multi_grid_g1 = [pop_df['caucasian'].values[0]]  # only for equal mean
    #     multi_grid_g2 = [pop_df['non_caucasian'].values[0]]

    # multi-var fairness input
    # fea_dim = [2, 3, 5]  # caucasian, age, edu_univ
    # fea_dim = [7]  # white percent
    # multi_demo_sensitive = demo_arr[:, :, fea_dim]  # caucasian
    # multi_pop_g1 = [pop_df['caucasian'].values[1], pop_df['young'].values[1], pop_df['high_edu'].values[1]]
    # multi_pop_g2 = [pop_df['non_caucasian'].values[1], pop_df['senior'].values[1], pop_df['low_edu'].values[1]]

    timer = str(time.time())
    if not resume_training:
        # Model fusion without fairness
        print('Train Model fusion without fairness')
        conv3d_predicted = fused_model_with_latent_features.Conv3D(
            train_obj, train_arr, test_arr, intersect_pos_set,
            # multi_demo_sensitive, demo_pop, multi_pop_g1, multi_pop_g2,
            # multi_grid_g1, multi_grid_g2, fairloss,
            # train_arr_1d, test_arr_1d, data_2d,
            latent_train_series, latent_test_series,
            demo_mask_arr, save_path,
            HEIGHT, WIDTH, TIMESTEPS, BIKE_CHANNEL,
            NUM_2D_FEA, NUM_1D_FEA, BATCH_SIZE,
            TRAINING_STEPS, LEARNING_RATE).conv3d_predicted
    else:
        # resume training
        print('resume training from: ', train_dir)
        conv3d_predicted = fused_model_with_latent_features.Conv3D(
            train_obj, train_arr, test_arr, intersect_pos_set,
            # train_arr_1d, test_arr_1d, data_2d,
            latent_train_series, latent_test_series,
            demo_mask_arr, train_dir,
            HEIGHT, WIDTH, TIMESTEPS, BIKE_CHANNEL,
            NUM_2D_FEA, NUM_1D_FEA, BATCH_SIZE,
            TRAINING_STEPS, LEARNING_RATE,
            False, checkpoint, True, train_dir).conv3d_predicted

    conv3d_predicted.index = pd.to_datetime(conv3d_predicted.index)
    conv3d_predicted.to_csv(save_path + 'fused_model_pred_' + timer + '.csv')
    # convlstm_predicted = pd.read_csv(save_path + 'convlstm_predicted.csv', index_col=0)
    # convlstm_predicted.index = pd.to_datetime(convlstm_predicted.index)
    eval_obj4 = evaluation.evaluation(test_df_cut, conv3d_predicted, train_obj.demo_raw)
    diff_df = eval_obj4.group_difference()
    diff_df.to_csv(save_path + str(place) + '_evaluation.csv')
    finegrain_diff_df = eval_obj4.individual_difference()
    finegrain_diff_df.to_csv(save_path + 'IFG_eval.csv')
    print('rmse for conv3d: ', eval_obj4.rmse_val)
    print('mae for conv3d: ', eval_obj4.mae_val)
    print('mape for conv3d: ', eval_obj4.mape_val)

    # plot train/test accuracy
    train_test = pd.read_csv(save_path + 'ecoch_res_df_' + '.csv')
    train_test = train_test.loc[:, ~train_test.columns.str.contains('^Unnamed')]
    total_loss = train_test[['train_loss', 'test_loss']].plot()
    plt.savefig(save_path + 'total_loss_finish.png')
    acc_loss = train_test[['train_acc', 'test_acc']].plot()
    plt.savefig(save_path + 'acc_loss_finish.png')
    # fair_loss = train_test[['train_fair', 'test_fair']].plot()
    # plt.savefig(save_path + 'fair_loss_finish.png')
    plt.close()

    txt_name = save_path + 'latent_fea_df_' + timer + '.txt'
    with open(txt_name, 'w') as the_file:
        the_file.write('Only account for grids that intersect with city boundary \n')
        # the_file.write('lamda\n')
        # the_file.write(str(lamda) + '\n')
        # the_file.write('beta\n')
        # the_file.write(str(beta) + '\n')
        the_file.write('dim\n')
        the_file.write(str(dim) + '\n')
        # the_file.write('use_1d_fea\n')
        # the_file.write(str(use_1d_fea) + '\n')
        # the_file.write('use_2d_fea\n')
        # the_file.write(str(use_2d_fea) + '\n')
        # the_file.write('fairloss\n')
        # the_file.write(str(fairloss) + '\n')
        # the_file.write('multivar\n')
        # the_file.write(str(multivar) + '\n')
        the_file.write('learning rate\n')
        the_file.write(str(LEARNING_RATE) + '\n')
        the_file.write('rmse for conv3d\n')
        the_file.write(str(eval_obj4.rmse_val) + '\n')
        the_file.write('mae for conv3d\n')
        the_file.write(str(eval_obj4.mae_val) + '\n')
        the_file.write('mape for conv3d\n')
        the_file.write(str(eval_obj4.mape_val) + '\n')
        the_file.close()
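
# NOTE (editor sketch): self-contained version of the encoded_list
# rearrangement used in the variant-3 main() above (referenced there). The
# pickled list is indexed [batch][dataset] -> array of shape (batch_size, ...);
# training wants [dataset] -> one array of shape (total_hours, ...). The shapes
# below are illustrative, not taken from the real encoded_list file.
import numpy as np

def _sketch_rearrange_encoded_list(encoded_list):
    n_batches = len(encoded_list)
    n_datasets = len(encoded_list[0])
    # transpose the nesting: [batch][dataset] -> [dataset][batch]
    rearranged = [[encoded_list[i][j] for i in range(n_batches)]
                  for j in range(n_datasets)]
    # concatenate each dataset's batches along the time axis
    return [np.concatenate(batches, axis=0) for batches in rearranged]

# usage sketch: 3 batches, 7 dataset groups, batch_size 4, grid 32x20, dim 5
_demo = [[np.zeros((4, 32, 20, 5)) for _ in range(7)] for _ in range(3)]
assert _sketch_rearrange_encoded_list(_demo)[0].shape == (12, 32, 20, 5)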