Example #1
 def __init__(self, raw_df, demo_raw,
              train_start_time='2014-02-01', train_end_time='2018-10-31',
              test_start_time='2018-11-01 00:00:00', test_end_time='2019-05-01 23:00:00'):
     self.raw_df = raw_df
     # demographic data [32, 32, 14]
     self.demo_raw = demo_raw
     self.train_start_time = train_start_time
     #self.train_end_time = '2018-03-31'
     self.train_end_time = train_end_time
     # set train/test set
     self.test_start_time = test_start_time
     self.test_end_time = test_end_time
     # prediction window: use one week's data to predict next hour
     self.window = datetime.timedelta(hours=24 * 7)
     self.step = datetime.timedelta(hours=1)
     # example: if test_start_time = '2018-04-01 00:00:00', the 168 time stamps
     # '2018-04-01 00:00:00' -> '2018-04-07 23:00:00' are used to predict '2018-04-08 00:00:00',
     # i.e. predict_start_time = test_start_time + window
     # ('2018-04-01 00:00:00' + 168 hour window = '2018-04-08 00:00:00')
     # this is calculated by time interval; there is a 1 hour shift between timestamp and time interval
     self.predict_start_time = datetime_utils.str_to_datetime(self.test_start_time) + self.window
     # predict_end_time = test_end_time = '2018-04-30 23:00:00'
     self.predict_end_time = datetime_utils.str_to_datetime(self.test_end_time)
     # if window = 7 days, test_end_time  = '2018-04-30 23:00:00', actual_end_time =  04/23 - 23:00
     self.actual_end_time = self.predict_end_time - self.window
     # 41616
     self.train_hours = datetime_utils.get_total_hour_range(self.train_start_time, self.train_end_time)
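
# A minimal standalone sketch (an illustration, not part of the original class) of the
# window arithmetic described above, using the standard datetime module in place of
# datetime_utils:
import datetime

window = datetime.timedelta(hours=24 * 7)           # one-week look-back window
test_start = datetime.datetime(2018, 4, 1, 0, 0)    # example test_start_time
predict_start = test_start + window                 # first hour that gets predicted
assert predict_start == datetime.datetime(2018, 4, 8, 0, 0)
# the 168 hourly stamps 2018-04-01 00:00 .. 2018-04-07 23:00 are the inputs
# used to predict 2018-04-08 00:00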
Example #2
 def train_test_split(self, raw_seq_arr):
     train_hours = datetime_utils.get_total_hour_range(self.train_start_time, self.train_end_time)
     # train_arr = raw_seq_arr[:, :train_hours, :, :]
     # test_arr = raw_seq_arr[:, train_hours:, :, :]
     train_arr = raw_seq_arr[:, :train_hours]
     test_arr = raw_seq_arr[:, train_hours:]
     return train_arr, test_arr
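
# A small sketch (toy shapes, not the real data) of what train_test_split does: it cuts
# the fixed-window sequence array along its second (hour) axis at train_hours.
import numpy as np

train_hours = 8                              # toy value; the real run derives it from the date range
raw_seq_arr = np.zeros((5, 12, 2, 2))        # real shape is roughly (169, 9336, 32, 20)
train_arr = raw_seq_arr[:, :train_hours]
test_arr = raw_seq_arr[:, train_hours:]
print(train_arr.shape)   # (5, 8, 2, 2)
print(test_arr.shape)    # (5, 4, 2, 2)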
Example #3
def main():
    args = parse_args()
    # lamda = args.lamda
    # beta = args.beta
    # # use_1d_fea = bool(args.use_1d_fea)
    # # use_2d_fea = bool(args.use_2d_fea)
    # fairloss = args.fairloss
    # multivar=  bool(args.multivar)
    suffix = args.suffix

    # the following arguments are for resuming training
    resume_training = args.resume_training
    train_dir = args.train_dir
    checkpoint = args.checkpoint
    place = args.place
    epoch = args.epoch
    learning_rate = args.learning_rate
    encoding_dir = args.encoding_dir

    # print("received arguments: lamda: ",lamda)
    # print("received arguments: beta: ",beta)

    # print("use_1d_fea: ", use_1d_fea)
    # print("use_2d_fea: ", use_2d_fea)
    # print("fairloss: ", fairloss)
    # print("multivar: ", multivar)
    print("resume_training: ", resume_training)
    print("training dir path: ", train_dir)
    print("checkpoint: ", checkpoint)
    print("place: ", place)
    print("epochs to train: ", epoch)
    print("start learning rate: ", learning_rate)

    if checkpoint is not None:
        checkpoint = train_dir + checkpoint
        print('pick up checkpoint: ', checkpoint)

    if place == "Seattle":
        print('load data for Seattle...')
        globals()['TRAINING_STEPS'] = epoch
        globals()['LEARNING_RATE'] = learning_rate
        print('TRAINING_STEPS: ', TRAINING_STEPS)

        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col = 0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv(
            'lime_whole_grid_32_20_hourly_1000_171001-181031.csv', index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)
        # a set of region codes (e.g.: 10_10) that intersect with the city
        intersect_pos = pd.read_csv(
            '../auxillary_data/intersect_pos_32_20.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data
        # should use 2018 data
        demo_raw = pd.read_csv(
            '../auxillary_data/whole_grid_32_20_demo_1000_intersect_geodf_2018_corrected.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw)
        #ignore non-intersection cells in test_df
        # this is for evaluation
        test_df_cut = train_obj.test_df.loc[:,
                                            train_obj.test_df.columns.
                                            isin(list(intersect_pos_set))]
        # generate binary demo feature according to 2018 city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set)

        if os.path.isfile('bikedata_32_20_171001-181031.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load('bikedata_32_20_171001-181031.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('bikedata_32_20_171001-181031.npy', rawdata_arr)

        print(
            'generating fixed window length training and testing sequences...')
        # raw_seq_arr.shape (169, 9336, 32, 20)
        raw_seq_arr = train_obj.generate_fixlen_timeseries(rawdata_arr)
        train_arr, test_arr = train_obj.train_test_split(raw_seq_arr)
        print('input train_arr shape: ', train_arr.shape)
        print('input test_arr shape: ', test_arr.shape)

        # train_hours: 8084
        train_hours = datetime_utils.get_total_hour_range(
            train_obj.train_start_time, train_obj.train_end_time)
        total_length = raw_seq_arr.shape[1]  # 9336
        test_len = total_length - train_hours  # 1296

        # 32112
        start_train_hour = datetime_utils.get_total_hour_range(
            '2014-02-01', '2017-09-30')
        # 40152
        end_train_hour = datetime_utils.get_total_hour_range(
            '2014-02-01', '2018-08-31')

        # --------------------------------------------------------------
        print('loading latent representation')
        latent_rep_path = '/home/ubuntu/CTensor/' + encoding_dir + 'latent_rep/final_lat_rep.npy'
        latent_rep = np.load(latent_rep_path)
        print('latent_rep.shape: ',
              latent_rep.shape)  # should be [42240, 32, 20, 3]
        latent_rep = latent_rep.reshape((45960, 32, 20, 5))
        latent_series = latent_rep[start_train_hour:end_train_hour + test_len +
                                   TIMESTEPS, :, :, :]
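        # the slice covers the hours used downstream: the training span
        # (start_train_hour .. end_train_hour), the test hours (test_len), plus
        # TIMESTEPS extra hours for the fixed-length look-back window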

        #################  add groupwise latent representations ##############
        groupwise_latent_rep_path = '/home/ubuntu/CTensor/autoencoder_alltoall_groupwise_denoise/groupwise_tensors/'
        weather_latent_rep = np.load(groupwise_latent_rep_path +
                                     'weather_grp.npy')
        economics_latent_rep = np.load(groupwise_latent_rep_path +
                                       'economics_grp.npy')
        transportation_latent_rep = np.load(groupwise_latent_rep_path +
                                            'transportation_grp.npy')
        public_service_latent_rep = np.load(groupwise_latent_rep_path +
                                            'public_service_grp.npy')
        print('weather_latent_rep.shape: ',
              weather_latent_rep.shape)  # should be [42240, 32, 20, 3]
        group_latent_rep = np.concatenate([
            weather_latent_rep, economics_latent_rep,
            transportation_latent_rep, public_service_latent_rep
        ],
                                          axis=-1)

        group_latent_series = group_latent_rep[
            start_train_hour:end_train_hour + test_len + TIMESTEPS, :, :, :]

        #######  option A: concatenate groupwise with ALL2ALL latent representation ###############
        latent_series = np.concatenate([group_latent_series, latent_series],
                                       axis=-1)
        # option B: use only groupwise features (this assignment overrides option A above)
        latent_series = group_latent_series

        dim = latent_series.shape[-1]
        print('latent_series.shape: ', latent_series.shape)

        latent_seq_arr = train_obj.generate_fixlen_timeseries(latent_series)
        print('input latent_seq_arr shape: ', latent_seq_arr.shape)
        train_latent_arr, test_latent_arr = train_obj.train_test_split(
            latent_seq_arr)
        print('input train_latent_arr shape: ', train_latent_arr.shape)
        print('input test_latent_arr shape: ', test_latent_arr.shape)

        # ---------------------------------------------------------------

    elif place == "Austin":
        print('load data for Austin...')
        globals()['HEIGHT'] = 28
        globals()['WIDTH'] = 28
        globals()['TIMESTEPS'] = 168
        globals()['BIKE_CHANNEL'] = 1
        globals()['NUM_2D_FEA'] = 3  # street count / street len / poi count
        globals()['NUM_1D_FEA'] = 3
        globals()['BATCH_SIZE'] = 32
        globals()['TRAINING_STEPS'] = epoch
        # globals()['LEARNING_RATE']  = 0.003
        globals()['LEARNING_RATE'] = learning_rate
        print('global HEIGHT: ', HEIGHT)

        train_start_time = '2016-08-01'
        train_end_time = '2017-02-28'
        test_start_time = '2017-03-01 00:00:00'
        test_end_time = '2017-04-13 23:00:00'
        print('train_start_time for Austin: ', train_start_time)

        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col = 0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv(
            '../rideaustin/rideaustin_grided_hourly_2000_20160801-20170413.csv',
            index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)

        # a set of region codes (e.g.: 10_10) that intersect with the city
        intersect_pos = pd.read_csv(
            '../rideaustin/austin_intersect_pos_28_28.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data
        # should use 2017 data for Austin
        demo_raw = pd.read_csv(
            '../rideaustin/austin_demo_data/austin_28_28_demo_2000_intersect_geodf_2017.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw, train_start_time, train_end_time,
                          test_start_time, test_end_time)
        #ignore non-intersection cells in test_df
        # this is for evaluation
        test_df_cut = train_obj.test_df.loc[:,
                                            train_obj.test_df.columns.
                                            isin(list(intersect_pos_set))]

        # generate binary demo feature according to 2017 Austin city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set, 70.2222, 8.7057,
                                            32.6351, 42.0087, 6.453)

        # load 2d and 1d features
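        # note: use_1d_fea / use_2d_fea are commented out near the top of this script,
        # so they must be defined before this Austin branch can run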
        if use_2d_fea:
            print("use 2d feature")
            # landuse arr 28 28 1
            landuse_arr = np.load(
                '../feature_transform/austin_landuse_arr.npy')
            street_arr = np.load('../feature_transform/austin_street_arr.npy')
            # concatenate 2d data
            data_2d = np.concatenate([landuse_arr, street_arr], axis=2)
        else:
            print('ignore 2d data')
            data_2d = None

        if use_1d_fea:
            # weather: (1,1,6144,3)
            weather_arr = np.load(
                '../feature_transform/austin_weather_arr_1by1bytime.npy')
            weather_arr = weather_arr[0, 0, :, :]  # [6144, 3]
            # construct training / testing data for 1d data
            print(
                'generating fixed window length training and testing sequences for 1d data'
            )
            raw_seq_arr_1d = train_obj.generate_fixlen_timeseries(weather_arr)
            # test_series_1d.shape -> (169, 1296, 3)
            train_arr_1d, test_arr_1d = train_obj.train_test_split(
                raw_seq_arr_1d)
            #
        else:
            print('ignore 1d data')
            train_arr_1d = None
            test_arr_1d = None

        if os.path.isfile('../rideaustin/austin_28_20160801-20170413.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load(
                '../rideaustin/austin_28_20160801-20170413.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('../rideaustin/austin_28_20160801-20170413.npy',
                    rawdata_arr)
    else:
        print("Please input correct city name")


####################### city-agnostic treatment ################
# lamda = 0
# if a training dir is specified to resume training,
# the save_path is the same dir as train_dir;
# otherwise, create a new dir for training
    if suffix == '':
        save_path = './bike_groupwise_model_' + str(dim) + '/'
    else:
        save_path = './bike_groupwise_model_' + str(dim) + '_' + suffix + '/'

    if train_dir:
        save_path = train_dir

    print("training dir: ", train_dir)
    print("save_path: ", save_path)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # save demographic array
    #if os.path.isfile(save_path + 'demo_arr_32_20.npy'):
    #   print('loading demographic data array...')
    #  demo_arr = np.load(save_path + 'demo_arr_32_20.npy')
    #else:

    # generate mask arr for city boundary
    demo_mask_arr = train_obj.demo_mask()

    # generate demographic in array format
    print('generating demo_arr array')
    demo_arr = train_obj.selected_demo_to_tensor()
    if not os.path.isfile(save_path + str(place) + '_demo_arr_' + str(HEIGHT) +
                          '.npy'):
        np.save(save_path + str(place) + '_demo_arr_' + str(HEIGHT) + '.npy',
                demo_arr)

    # calculate statistics for demo
    # pop_df, pop_ratio_df = train_obj.generate_pop_df()
    # pop_df.to_csv(save_path + 'pop_df.csv')
    # pop_ratio_df.to_csv(save_path + 'pop_ratio_df.csv')
    #
    # # demo_pop: if IFG, RFG, equal mean, use normalized pop.
    # # if pairwise, use non-normalized pop
    # if fairloss == "pairwise":
    # #demo_pop = demo_arr[:,:,1]  # normalized pop
    #     demo_pop = demo_arr[:,:,0]  #  pop # use pop for pairwise loss
    # else:
    #     demo_pop = demo_arr[:,:,1]  # normalized pop
    # demo_pop = np.expand_dims(demo_pop, axis=2)
    # print('demo_pop.shape: ',  demo_pop.shape)

    timer = str(time.time())
    if not resume_training:
        # Model fusion without fairness
        print('Train Model fusion without fairness')
        conv3d_predicted = fused_model_with_latent_features.Conv3D(
            train_obj,
            train_arr,
            test_arr,
            intersect_pos_set,
            # multi_demo_sensitive, demo_pop, multi_pop_g1, multi_pop_g2,
            # multi_grid_g1, multi_grid_g2, fairloss,
            # train_arr_1d, test_arr_1d, data_2d,
            train_latent_arr,
            test_latent_arr,
            demo_mask_arr,
            save_path,
            HEIGHT,
            WIDTH,
            TIMESTEPS,
            BIKE_CHANNEL,
            NUM_2D_FEA,
            NUM_1D_FEA,
            BATCH_SIZE,
            TRAINING_STEPS,
            LEARNING_RATE).conv3d_predicted
    else:
        # resume training
        print('resume training from: ', train_dir)
        conv3d_predicted = fused_model_with_latent_features.Conv3D(
            train_obj,
            train_arr,
            test_arr,
            intersect_pos_set,

            # train_arr_1d, test_arr_1d, data_2d,
            train_latent_arr,
            test_latent_arr,
            demo_mask_arr,
            train_dir,
            HEIGHT,
            WIDTH,
            TIMESTEPS,
            BIKE_CHANNEL,
            NUM_2D_FEA,
            NUM_1D_FEA,
            BATCH_SIZE,
            TRAINING_STEPS,
            LEARNING_RATE,
            False,
            checkpoint,
            True,
            train_dir).conv3d_predicted

    conv3d_predicted.index = pd.to_datetime(conv3d_predicted.index)
    conv3d_predicted.to_csv(save_path + 'fused_model_pred_' + timer + '.csv')
    #convlstm_predicted = pd.read_csv(save_path + 'convlstm_predicted.csv', index_col=0)
    #convlstm_predicted.index = pd.to_datetime(convlstm_predicted.index)
    eval_obj4 = evaluation.evaluation(test_df_cut, conv3d_predicted,
                                      train_obj.demo_raw)
    diff_df = eval_obj4.group_difference()
    diff_df.to_csv(save_path + str(place) + '_evaluation.csv')

    finegrain_diff_df = eval_obj4.individual_difference()
    finegrain_diff_df.to_csv(save_path + 'IFG_eval.csv')

    print('rmse for conv3d: ', eval_obj4.rmse_val)
    print('mae for conv3d: ', eval_obj4.mae_val)
    print('mape for conv3d: ', eval_obj4.mape_val)

    # plot train test accuracy
    train_test = pd.read_csv(save_path + 'ecoch_res_df_' + '.csv')
    train_test = train_test.loc[:,
                                ~train_test.columns.str.contains('^Unnamed')]
    total_loss = train_test[['train_loss', 'test_loss']].plot()
    plt.savefig(save_path + 'total_loss_finish.png')
    acc_loss = train_test[['train_acc', 'test_acc']].plot()
    plt.savefig(save_path + 'acc_loss_finish.png')
    # fair_loss = train_test[['train_fair', 'test_fair']].plot()
    # plt.savefig(save_path + 'fair_loss_finish.png')
    plt.close()

    txt_name = save_path + 'latent_fea_df_' + timer + '.txt'
    with open(txt_name, 'w') as the_file:
        the_file.write(
            'Only account for grids that intersect with city boundary \n')
        # the_file.write('lamda\n')
        # the_file.write(str(lamda) + '\n')
        # the_file.write('beta\n')
        # the_file.write(str(beta) + '\n')
        the_file.write('dim\n')
        the_file.write(str(dim) + '\n')
        the_file.write('latent_rep_path\n')
        the_file.write(str(latent_rep_path) + '\n')
        # the_file.write('use_1d_fea\n')
        # the_file.write(str(use_1d_fea) + '\n')
        # the_file.write('use_2d_fea\n')
        # the_file.write(str(use_2d_fea) + '\n')
        # the_file.write('fairloss\n')
        # the_file.write(str(fairloss) + '\n')
        # the_file.write('multivar\n')
        # the_file.write(str(multivar) + '\n')
        the_file.write('learning rate\n')
        the_file.write(str(LEARNING_RATE) + '\n')
        the_file.write('rmse for conv3d\n')
        the_file.write(str(eval_obj4.rmse_val) + '\n')
        the_file.write('mae for conv3d\n')
        the_file.write(str(eval_obj4.mae_val) + '\n')
        the_file.write('mape for conv3d\n')
        the_file.write(str(eval_obj4.mape_val) + '\n')

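
# A hypothetical invocation (an illustration only: the flag names are assumed to mirror
# the args.* attributes read above; check parse_args for the actual definitions):
#
#   python <this_script>.py --place Seattle --epoch 200 --learning_rate 0.005 \
#       --encoding_dir autoencoder_output/ --suffix trial1
#
# and, to resume from a saved checkpoint:
#
#   python <this_script>.py --place Seattle --resume_training True \
#       --train_dir ./bike_groupwise_model_5/ --checkpoint model.ckpt-XXXX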
Example #4
def main():
    args = parse_args()
    lamda = args.lamda
    beta = args.beta
    use_1d_fea = bool(args.use_1d_fea)
    use_2d_fea = bool(args.use_2d_fea)
    fairloss = args.fairloss
    multivar = bool(args.multivar)
    suffix = args.suffix

    # the following arguments are for resuming training
    resume_training = args.resume_training
    train_dir = args.train_dir
    checkpoint = args.checkpoint
    place = args.place
    epoch = args.epoch
    learning_rate = args.learning_rate

    print("received arguments: lamda: ", lamda)
    print("received arguments: beta: ", beta)

    print("use_1d_fea: ", use_1d_fea)
    print("use_2d_fea: ", use_2d_fea)
    print("fairloss: ", fairloss)
    print("multivar: ", multivar)
    print("resume_training: ", resume_training)
    print("training dir path: ", train_dir)
    print("checkpoint: ", checkpoint)
    print("place: ", place)
    print("epochs to train: ", epoch)
    print("start learning rate: ", learning_rate)

    if checkpoint is not None:
        checkpoint = train_dir + checkpoint
        print('pick up checkpoint: ', checkpoint)

    if place == "Seattle":
        print('load data for Seattle...')
        globals()['TRAINING_STEPS'] = epoch
        globals()['LEARNING_RATE'] = learning_rate
        print('TRAINING_STEPS: ', TRAINING_STEPS)

        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col = 0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv(
            'lime_whole_grid_32_20_hourly_1000_171001-181031.csv', index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)
        # a set of region codes (e.g.: 10_10) that intersect with the city
        intersect_pos = pd.read_csv(
            '../auxillary_data/intersect_pos_32_20.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data
        # should use 2018 data
        demo_raw = pd.read_csv(
            '../auxillary_data/whole_grid_32_20_demo_1000_intersect_geodf_2018_corrected.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw)
        #ignore non-intersection cells in test_df
        # this is for evaluation
        test_df_cut = train_obj.test_df.loc[:,
                                            train_obj.test_df.columns.
                                            isin(list(intersect_pos_set))]
        # generate binary demo feature according to 2018 city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set)

        if os.path.isfile('bikedata_32_20_171001-181031.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load('bikedata_32_20_171001-181031.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('bikedata_32_20_171001-181031.npy', rawdata_arr)
        ################# LOAD DATA ######################
        # ---- reading data ---------------------#
        print('Reading 1d, 2d, and 3d data')
        path_1d = '../data_processing/1d_source_data/'
        path_2d = '../data_processing/2d_source_data/'
        path_3d = '../data_processing/3d_source_data/'
        # 1d
        weather_arr = np.load(path_1d + 'weather_arr_20140201_20190501.npy')
        airquality_arr = np.load(path_1d +
                                 'air_quality_arr_20140201_20190501.npy')
        print('weather_arr.shape: ', weather_arr.shape)
        print('airquality_arr.shape: ', airquality_arr.shape)
        weather_arr = weather_arr[0, 0, :, :]
        airquality_arr = airquality_arr[0, 0, :, :]
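        # drop the two leading singleton dimensions so each 1d source is [total_hours, n_channels]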

        # 2d
        house_price_arr = np.load(path_2d + 'house_price.npy')
        POI_business_arr = np.load(path_2d + 'POI_business.npy')
        POI_food_arr = np.load(path_2d + 'POI_food.npy')
        POI_government_arr = np.load(path_2d + 'POI_government.npy')
        POI_hospitals_arr = np.load(path_2d + 'POI_hospitals.npy')
        POI_publicservices_arr = np.load(path_2d + 'POI_publicservices.npy')

        POI_recreation_arr = np.load(path_2d + 'POI_recreation.npy')
        POI_school_arr = np.load(path_2d + 'POI_school.npy')
        POI_transportation_arr = np.load(path_2d + 'POI_transportation.npy')
        seattle_street_arr = np.load(path_2d + 'seattle_street.npy')
        total_flow_count_arr = np.load(path_2d + 'total_flow_count.npy')
        transit_routes_arr = np.load(path_2d + 'transit_routes.npy')
        transit_signals_arr = np.load(path_2d + 'transit_signals.npy')
        transit_stop_arr = np.load(path_2d + 'transit_stop.npy')

        slope_arr = np.load(path_2d + 'slope_arr.npy')
        bikelane_arr = np.load(path_2d + 'bikelane_arr.npy')

        print('transit_routes_arr.shape: ', transit_routes_arr.shape)
        print('POI_recreation_arr.shape: ', POI_recreation_arr.shape)

        # 3d
        building_permit_arr = np.load(
            path_3d + 'building_permit_arr_20140201_20190501_python3.npy')
        collisions_arr = np.load(
            path_3d + 'collisions_arr_20140201_20190501_python3.npy')
        crime_arr = np.load(path_3d +
                            'crime_arr_20140201_20190501_python3.npy')
        seattle911calls_arr = np.load(
            path_3d + 'seattle911calls_arr_20140201_20190501.npy')
        print('building_permit_arr.shape:', building_permit_arr.shape)
        print('collisions_arr.shape: ', collisions_arr.shape)
        print('crime_arr.shape: ', crime_arr.shape)
        print('seattle911calls_arr.shape: ', seattle911calls_arr.shape)

        building_permit_arr_seq_extend = np.repeat(building_permit_arr,
                                                   24,
                                                   axis=0)
        collisions_arr_seq_extend = np.repeat(collisions_arr, 24, axis=0)
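        # these sources are coarser than hourly (presumably one entry per day in the raw files),
        # so each step is repeated 24 times along the time axis to line up with the hourly grids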

        print('building_permit_arr_seq_extend.shape: ',
              building_permit_arr_seq_extend.shape)

        # train_hours: 8084

        print(
            'generating fixed window length training and testing sequences...')
        raw_seq_arr = train_obj.generate_fixlen_timeseries(rawdata_arr)
        train_arr, test_arr = train_obj.train_test_split(raw_seq_arr)
        print('input train_arr shape: ', train_arr.shape)

        train_hours = datetime_utils.get_total_hour_range(
            train_obj.train_start_time, train_obj.train_end_time)
        total_length = raw_seq_arr.shape[1]  # 9336
        test_len = total_length - train_hours  # 1296
        # 32112
        start_train_hour = datetime_utils.get_total_hour_range(
            '2014-02-01', '2017-09-30')
        # 40152
        end_train_hour = datetime_utils.get_total_hour_range(
            '2014-02-01', '2018-08-31')
        start_idx = start_train_hour
        end_idx = end_train_hour + test_len + TIMESTEPS
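        # the 1d/3d source arrays start at 2014-02-01, so [start_idx:end_idx] selects the
        # slice aligned with the bike-share period: training hours + test_len + a TIMESTEPS look-back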

        # construct dictionary
        print('use dictionary to organize data')
        rawdata_1d_dict = {
            'precipitation':
            np.expand_dims(weather_arr[start_idx:end_idx, 0], axis=1),
            'temperature':
            np.expand_dims(weather_arr[start_idx:end_idx, 1], axis=1),
            'pressure':
            np.expand_dims(weather_arr[start_idx:end_idx, 2], axis=1),
            'airquality':
            airquality_arr[start_idx:end_idx, :],
        }

        rawdata_2d_dict = {
            'house_price': house_price_arr,
            'POI_business': POI_business_arr,
            'POI_food': POI_food_arr,
            'POI_government': POI_government_arr,
            'POI_hospitals': POI_hospitals_arr,
            'POI_publicservices': POI_publicservices_arr,
            'POI_recreation': POI_recreation_arr,
            'POI_school': POI_school_arr,
            'POI_transportation': POI_transportation_arr,
            'seattle_street': seattle_street_arr,
            'total_flow_count': total_flow_count_arr,
            'transit_routes': transit_routes_arr,
            'transit_signals': transit_signals_arr,
            'transit_stop': transit_stop_arr,
            'slope': slope_arr,
            'bikelane': bikelane_arr,
        }

        rawdata_3d_dict = {
            'building_permit':
            np.expand_dims(
                building_permit_arr_seq_extend[start_idx:end_idx, :, :],
                axis=3),
            'collisions':
            np.expand_dims(collisions_arr_seq_extend[start_idx:end_idx, :, :],
                           axis=3),  # (7, 45840, 32, 20)
            'seattle911calls':
            np.expand_dims(seattle911calls_arr[start_idx:end_idx, :, :],
                           axis=3)  # (45984, 32, 20)
        }
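        # each 3d source is cut to [start_idx:end_idx] and given a trailing channel axis,
        # so every value in this dict is shaped [hours, 32, 20, 1]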

        ################ original code ###########################
        # load 2d and 1d features
        if use_2d_fea:
            print("use 2d feature")
            # bikelane_arr = np.load('../feature_transform/bikelane_arr.npy')
            # slope_arr = np.load('../feature_transform/slope_arr.npy')

            # concatenate 2d data
            # data_2d = np.concatenate([slope_arr,bikelane_arr], axis=2)
            data_2d = np.concatenate(list(rawdata_2d_dict.values()), axis=2)
        else:
            print('ignore 2d data')
            data_2d = None

        if use_1d_fea:
            # weather: (1,1,9504,3) or (9504, 3)
            # weather_arr = np.load('../feature_transform/weather_arr_1by1by9504.npy')
            # weather_arr = weather_arr[0,0,:,:]  # [9504, 3]
            # construct training / testing data for 1d data
            print(
                'generating fixed window length training and testing sequences for 1d data'
            )
            # raw_seq_arr_1d = train_obj.generate_fixlen_timeseries(weather_arr)
            # train_arr_1d, test_arr_1d = train_obj.train_test_split(raw_seq_arr_1d)
            data_1d = np.concatenate(list(rawdata_1d_dict.values()), axis=1)
            raw_seq_arr_1d = train_obj.generate_fixlen_timeseries(data_1d)
            train_arr_1d, test_arr_1d = train_obj.train_test_split(
                raw_seq_arr_1d)
        else:
            print('ignore 1d data')
            train_arr_1d = None
            test_arr_1d = None

        #### add 3D data  #############################################
        data_3d = np.concatenate(list(rawdata_3d_dict.values()), axis=3)
        print('data_3d.shape: ', data_3d.shape)
        # fea_seq_arr_3d = train_obj.generate_fixlen_timeseries(data_3d)
        # fea_train_arr_3d, fea_test_arr_3d = train_obj.train_test_split(fea_seq_arr_3d)
        # print('fea_train_arr_3d.shape: ', fea_train_arr_3d.shape) # (169, 8040, 32, 20, 3)
        # print('train_arr.shape: ', train_arr.shape) # (169, 8040, 32, 20)

        # concatenate with bikeshare data
        # train_arr = np.expand_dims(train_arr, axis=4)
        # test_arr = np.expand_dims(test_arr, axis=4)
        #
        # train_arr = np.concatenate([train_arr,fea_train_arr_3d], axis=4)
        # test_arr = np.concatenate([test_arr,fea_test_arr_3d], axis=4)
        # print('train_arr.shape: ', train_arr.shape)

        globals()['NUM_2D_FEA'] = data_2d.shape[-1]
        globals()['NUM_1D_FEA'] = train_arr_1d.shape[-1]
        globals()['NUM_3D_FEA'] = data_3d.shape[-1]
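        # note: the three assignments above assume use_1d_fea and use_2d_fea are both enabled;
        # when either is disabled, data_2d / train_arr_1d is None and has no .shape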

    elif place == "Austin":
        print('load data for Austin...')
        globals()['HEIGHT'] = 28
        globals()['WIDTH'] = 28
        globals()['TIMESTEPS'] = 168
        globals()['BIKE_CHANNEL'] = 1
        globals()['NUM_2D_FEA'] = 3  # street count / street len / poi count
        globals()['NUM_1D_FEA'] = 3
        globals()['BATCH_SIZE'] = 32
        globals()['TRAINING_STEPS'] = epoch
        # globals()['LEARNING_RATE']  = 0.003
        globals()['LEARNING_RATE'] = learning_rate
        print('global HEIGHT: ', HEIGHT)

        train_start_time = '2016-08-01'
        train_end_time = '2017-02-28'
        test_start_time = '2017-03-01 00:00:00'
        test_end_time = '2017-04-13 23:00:00'
        print('train_start_time for Austin: ', train_start_time)

        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col = 0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv(
            '../rideaustin/rideaustin_grided_hourly_2000_20160801-20170413.csv',
            index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)

        # a set of region codes (e.g.: 10_10) that intersect with the city
        intersect_pos = pd.read_csv(
            '../rideaustin/austin_intersect_pos_28_28.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data
        # should use 2017 data for Austin
        demo_raw = pd.read_csv(
            '../rideaustin/austin_demo_data/austin_28_28_demo_2000_intersect_geodf_2017.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw, train_start_time, train_end_time,
                          test_start_time, test_end_time)
        #ignore non-intersection cells in test_df
        # this is for evaluation
        test_df_cut = train_obj.test_df.loc[:,
                                            train_obj.test_df.columns.
                                            isin(list(intersect_pos_set))]

        # generate binary demo feature according to 2017 Austin city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set, 70.2222, 8.7057,
                                            32.6351, 42.0087, 6.453)

        # load 2d and 1d features
        if use_2d_fea:
            print("use 2d feature")
            # landuse arr 28 28 1
            landuse_arr = np.load(
                '../feature_transform/austin_landuse_arr.npy')
            street_arr = np.load('../feature_transform/austin_street_arr.npy')
            # concatenate 2d data
            data_2d = np.concatenate([landuse_arr, street_arr], axis=2)
        else:
            print('ignore 2d data')
            data_2d = None

        if use_1d_fea:
            # weather: (1,1,6144,3)
            weather_arr = np.load(
                '../feature_transform/austin_weather_arr_1by1bytime.npy')
            weather_arr = weather_arr[0, 0, :, :]  # [6144, 3]
            # construct training / testing data for 1d data
            print(
                'generating fixed window length training and testing sequences for 1d data'
            )
            raw_seq_arr_1d = train_obj.generate_fixlen_timeseries(weather_arr)
            # test_series_1d.shape -> (169, 1296, 3)
            train_arr_1d, test_arr_1d = train_obj.train_test_split(
                raw_seq_arr_1d)
            #
        else:
            print('ignore 1d data')
            train_arr_1d = None
            test_arr_1d = None

        if os.path.isfile('../rideaustin/austin_28_20160801-20170413.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load(
                '../rideaustin/austin_28_20160801-20170413.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('../rideaustin/austin_28_20160801-20170413.npy',
                    rawdata_arr)
    else:
        print("Please input correct city name")


####################### city-agnostic treatment ################
# lamda = 0
# if a training dir is specified to resume training,
# the save_path is the same dir as train_dir;
# otherwise, create a new dir for training
    if suffix == '':
        save_path = './fusion_model_originalfeatures_' + str(
            place) + '_' + str(fairloss) + '_' + str(use_1d_fea) + '_' + str(
                use_2d_fea) + '_' + str(multivar) + '/'
    else:
        save_path = './fusion_model_originalfeatures_' + str(
            place) + '_' + str(fairloss) + '_' + str(use_1d_fea) + '_' + str(
                use_2d_fea) + '_' + str(multivar) + '_' + suffix + '/'

    if train_dir:
        save_path = train_dir

    print("training dir: ", train_dir)
    print("save_path: ", save_path)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # generate mask arr for city boundary
    demo_mask_arr = train_obj.demo_mask()

    # generate demographic in array format
    print('generating demo_arr array')
    demo_arr = train_obj.selected_demo_to_tensor()
    if not os.path.isfile(save_path + str(place) + '_demo_arr_' + str(HEIGHT) +
                          '.npy'):
        np.save(save_path + str(place) + '_demo_arr_' + str(HEIGHT) + '.npy',
                demo_arr)

    timer = str(time.time())
    if not resume_training:
        # Model fusion without fairness
        print('Train Model fusion without fairness')
        conv3d_predicted = fused_model_augment.Conv3D(
            train_obj,
            train_arr,
            test_arr,
            intersect_pos_set,
            # multi_demo_sensitive, demo_pop, multi_pop_g1, multi_pop_g2,
            # multi_grid_g1, multi_grid_g2, fairloss,
            train_arr_1d,
            test_arr_1d,
            data_2d,
            data_3d,
            demo_mask_arr,
            save_path,
            HEIGHT,
            WIDTH,
            TIMESTEPS,
            BIKE_CHANNEL,
            NUM_3D_FEA,
            NUM_2D_FEA,
            NUM_1D_FEA,
            BATCH_SIZE,
            TRAINING_STEPS,
            LEARNING_RATE).conv3d_predicted
    else:
        # resume training
        print('resume training from: ', train_dir)
        conv3d_predicted = fused_model_augment.Conv3D(
            train_obj,
            train_arr,
            test_arr,
            intersect_pos_set,
            # multi_demo_sensitive, demo_pop, multi_pop_g1, multi_pop_g2,
            # multi_grid_g1, multi_grid_g2,fairloss,
            train_arr_1d,
            test_arr_1d,
            data_2d,
            data_3d,
            demo_mask_arr,
            train_dir,
            HEIGHT,
            WIDTH,
            TIMESTEPS,
            BIKE_CHANNEL,
            NUM_3D_FEA,
            NUM_2D_FEA,
            NUM_1D_FEA,
            BATCH_SIZE,
            TRAINING_STEPS,
            LEARNING_RATE,
            False,
            checkpoint,
            True,
            train_dir).conv3d_predicted

    conv3d_predicted.index = pd.to_datetime(conv3d_predicted.index)
    conv3d_predicted.to_csv(save_path + 'fused_model_pred_' + timer + '.csv')
    #convlstm_predicted = pd.read_csv(save_path + 'convlstm_predicted.csv', index_col=0)
    #convlstm_predicted.index = pd.to_datetime(convlstm_predicted.index)
    eval_obj4 = evaluation.evaluation(test_df_cut, conv3d_predicted,
                                      train_obj.demo_raw)
    diff_df = eval_obj4.group_difference()
    diff_df.to_csv(save_path + str(place) + '_evaluation.csv')

    finegrain_diff_df = eval_obj4.individual_difference()
    finegrain_diff_df.to_csv(save_path + 'IFG_eval.csv')

    print('rmse for conv3d: ', eval_obj4.rmse_val)
    print('mae for conv3d: ', eval_obj4.mae_val)

    # plot train test accuracy
    train_test = pd.read_csv(save_path + 'ecoch_res_df_' + '.csv')
    train_test = train_test.loc[:,
                                ~train_test.columns.str.contains('^Unnamed')]
    total_loss = train_test[['train_loss', 'test_loss']].plot()
    plt.savefig(save_path + 'total_loss_finish.png')
    acc_loss = train_test[['train_acc', 'test_acc']].plot()
    plt.savefig(save_path + 'acc_loss_finish.png')
    # fair_loss = train_test[['train_fair', 'test_fair']].plot()
    # plt.savefig(save_path + 'fair_loss_finish.png')
    plt.close()

    txt_name = save_path + 'fused_model_df_' + str(beta) + '_' + timer + '.txt'
    with open(txt_name, 'w') as the_file:
        the_file.write(
            'Only account for grids that intersect with city boundary \n')

        the_file.write('place\n')
        the_file.write(str(place) + '\n')
        the_file.write('use_1d_fea\n')
        the_file.write(str(use_1d_fea) + '\n')
        the_file.write('use_2d_fea\n')
        the_file.write(str(use_2d_fea) + '\n')

        the_file.write('learning rate\n')
        the_file.write(str(LEARNING_RATE) + '\n')

        the_file.write('rmse for conv3d\n')
        the_file.write(str(eval_obj4.rmse_val) + '\n')
        the_file.write('mae for conv3d\n')
        the_file.write(str(eval_obj4.mae_val) + '\n')

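
# A hypothetical invocation for this variant (flag names again assumed to mirror the
# args.* attributes above; lamda, beta, fairloss and multivar are parsed even though the
# fairness-specific branches are commented out in this script):
#
#   python <this_script>.py --place Seattle --epoch 200 --learning_rate 0.005 \
#       --use_1d_fea 1 --use_2d_fea 1 --lamda 0 --beta 0 --fairloss RFG --multivar 0 \
#       --suffix trial1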
Example #5
def main():
    args = parse_args()
    # lamda = args.lamda
    # beta = args.beta
    # # use_1d_fea = bool(args.use_1d_fea)
    # # use_2d_fea = bool(args.use_2d_fea)
    # fairloss = args.fairloss
    # multivar=  bool(args.multivar)
    suffix = args.suffix

    # the following arguments are for resuming training
    resume_training = args.resume_training
    train_dir = args.train_dir
    checkpoint = args.checkpoint
    place = args.place
    epoch = args.epoch
    learning_rate = args.learning_rate

    # print("received arguments: lamda: ",lamda)
    # print("received arguments: beta: ",beta)

    # print("use_1d_fea: ", use_1d_fea)
    # print("use_2d_fea: ", use_2d_fea)
    # print("fairloss: ", fairloss)
    # print("multivar: ", multivar)
    print("resume_training: ", resume_training)
    print("training dir path: ", train_dir)
    print("checkpoint: ", checkpoint)
    print("place: ", place)
    print("epochs to train: ", epoch)
    print("start learning rate: ", learning_rate)

    if checkpoint is not None:
        checkpoint = train_dir + checkpoint
        print('pick up checkpoint: ', checkpoint)

    if place == "Seattle":
        print('load data for Seattle...')
        globals()['TRAINING_STEPS'] = epoch
        globals()['LEARNING_RATE'] = learning_rate
        print('TRAINING_STEPS: ', TRAINING_STEPS)

        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col = 0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv(
            'lime_whole_grid_32_20_hourly_1000_171001-181031.csv', index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)
        # a set of region codes (e.g.: 10_10) that intersect with the city
        intersect_pos = pd.read_csv(
            '../auxillary_data/intersect_pos_32_20.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data
        # should use 2018 data
        demo_raw = pd.read_csv(
            '../auxillary_data/whole_grid_32_20_demo_1000_intersect_geodf_2018_corrected.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw)
        #ignore non-intersection cells in test_df
        # this is for evaluation
        test_df_cut = train_obj.test_df.loc[:,
                                            train_obj.test_df.columns.
                                            isin(list(intersect_pos_set))]
        # generate binary demo feature according to 2018 city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set)

        if os.path.isfile('bikedata_32_20_171001-181031.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load('bikedata_32_20_171001-181031.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('bikedata_32_20_171001-181031.npy', rawdata_arr)

        print(
            'generating fixed window length training and testing sequences...')
        # raw_seq_arr.shape (169, 9336, 32, 20)
        raw_seq_arr = train_obj.generate_fixlen_timeseries(rawdata_arr)
        train_arr, test_arr = train_obj.train_test_split(raw_seq_arr)
        print('input train_arr shape: ', train_arr.shape)

        # train_hours: 8084
        train_hours = datetime_utils.get_total_hour_range(
            train_obj.train_start_time, train_obj.train_end_time)
        total_length = raw_seq_arr.shape[1]  # 9336
        test_len = total_length - train_hours  # 1296

        # 32112
        start_train_hour = datetime_utils.get_total_hour_range(
            '2014-02-01', '2017-09-30')
        # 40152
        end_train_hour = datetime_utils.get_total_hour_range(
            '2014-02-01', '2018-08-31')

        # --------------------------------------------------------------
        print('loading latent representation')
        # latent_rep_path = '/home/ubuntu/CTensor/predictions/autoencoder_v1_Seattle/inference/infer_latent_representation.npy'
        # latent_rep_path = '/home/ubuntu/CTensor/autoencoder_alltoall/autoencoder_v2_dim1_epoch15/train_lat_rep.npy'
        # latent_rep_path = '/home/ubuntu/CTensor/toy_examples/toy_autoencoder_v6_cos_dim3_trail/inference/encoded_list'
        latent_rep_path = '/home/ubuntu/CTensor/autoencoder_featuremap_grouping/autoencoder_v6_cos_dim5_firstlevel_from50_ag_1d2dcalc_n7/inference/encoded_list'
        file = open(latent_rep_path, 'rb')
        encoded_list = pickle.load(file)
        print(len(encoded_list[0]))

        # close the file
        file.close()
        # rearrange encoded_list
        # original dimension: # of batches, # of datasets, [batch_size, ......],
        # arrange into :  # of datasets, # of batches,  [batch_size, ......],

        encoded_list_rearrange = [[None for j in range(len(encoded_list))]
                                  for i in range(len(encoded_list[0]))]
        for i, batch in enumerate(encoded_list):
            for j, ds in enumerate(batch):
                encoded_list_rearrange[j][i] = encoded_list[i][j]
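        # e.g. with 2 batches and 3 datasets:
        #   encoded_list           = [[a0, b0, c0], [a1, b1, c1]]   # [batch][dataset]
        #   encoded_list_rearrange = [[a0, a1], [b0, b1], [c0, c1]] # [dataset][batch]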

        encoded_list_rearrange_concat = [
            np.concatenate(batch, axis=0) for batch in encoded_list_rearrange
        ]
        print('encoded_list_rearrange_concat[0].shape',
              encoded_list_rearrange_concat[0].shape)

        keys_list = []
        n_groups = 7
        for i in range(1, n_groups + 1):
            keys_list.append('group_' + str(i))
        feature_map_dict = dict(zip(keys_list, encoded_list_rearrange_concat))

        # get the group that we need: group 1, group 4, and group 5
        # and concat
        '''
        grouping_dict = {
            'group_1': ['weather', 'house_price', 'POI_business', 'POI_food',
                        'POI_government', 'POI_publicservices', 'POI_recreation',
                        'POI_transportation', 'transit_routes'],
            'group_2': ['airquality'],
            'group_3': ['POI_hospitals', 'building_permit', 'collisions', 'seattle911calls'],
            'group_4': ['POI_school', 'slope'],
            'group_5': ['seattle_street', 'bikelane'],
            'group_6': ['total_flow_count'],
            'group_7': ['transit_signals', 'transit_stop'],
        }
        '''
        temp_arr1 = feature_map_dict['group_1']  # train_hours, 32, 20, dim
        temp_arr2 = feature_map_dict['group_4']
        temp_arr3 = feature_map_dict['group_5']
        latent_rep = np.concatenate([temp_arr1, temp_arr2, temp_arr3], axis=-1)
        #latent_rep= temp_arr1

        # latent_rep = np.load(latent_rep_path)
        #  (41616, 1, 32, 20, 1) for v1,  (41616, 32, 20, 1) for v2
        print('latent_rep.shape: ', latent_rep.shape)
        # latent_rep =latent_rep.reshape((41616, 32, 20, 5))

        latent_train_series = latent_rep[
            start_train_hour:end_train_hour, :, :, :]
        latent_test_series = latent_rep[end_train_hour:end_train_hour +
                                        test_len, :, :, :]
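        # the latent training features cover hours [start_train_hour, end_train_hour);
        # the latent test features cover the following test_len hours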
        # latent_train_series = np.squeeze(latent_train_series, axis=1)
        # latent_test_series = np.squeeze(latent_test_series, axis=1)
        print('latent_test_series.shape: ', latent_test_series.shape)
        dim = latent_test_series.shape[-1]
        # ---------------------------------------------------------------

    elif place == "Austin":
        print('load data for Austin...')
        globals()['HEIGHT'] = 28
        globals()['WIDTH'] = 28
        globals()['TIMESTEPS'] = 168
        globals()['BIKE_CHANNEL'] = 1
        globals()['NUM_2D_FEA'] = 3  # street count / street len / poi count
        globals()['NUM_1D_FEA'] = 3
        globals()['BATCH_SIZE'] = 32
        globals()['TRAINING_STEPS'] = epoch
        # globals()['LEARNING_RATE']  = 0.003
        globals()['LEARNING_RATE'] = learning_rate
        print('global HEIGHT: ', HEIGHT)

        train_start_time = '2016-08-01'
        train_end_time = '2017-02-28'
        test_start_time = '2017-03-01 00:00:00'
        test_end_time = '2017-04-13 23:00:00'
        print('train_start_time for Austin: ', train_start_time)

        # hourly_grid_timeseries = pd.read_csv('./hourly_grid_1000_timeseries_trail.csv', index_col = 0)
        # hourly_grid_timeseries.index = pd.to_datetime(hourly_grid_timeseries.index)
        rawdata = pd.read_csv(
            '../rideaustin/rideaustin_grided_hourly_2000_20160801-20170413.csv',
            index_col=0)
        rawdata.index = pd.to_datetime(rawdata.index)

        # a set of region codes (e.g.: 10_10) that intersect with the city
        intersect_pos = pd.read_csv(
            '../rideaustin/austin_intersect_pos_28_28.csv')
        intersect_pos_set = set(intersect_pos['0'].tolist())
        # demographic data
        # should use 2017 data for Austin
        demo_raw = pd.read_csv(
            '../rideaustin/austin_demo_data/austin_28_28_demo_2000_intersect_geodf_2017.csv',
            index_col=0)
        train_obj = train(rawdata, demo_raw, train_start_time, train_end_time,
                          test_start_time, test_end_time)
        #ignore non-intersection cells in test_df
        # this is for evaluation
        test_df_cut = train_obj.test_df.loc[:,
                                            train_obj.test_df.columns.
                                            isin(list(intersect_pos_set))]

        # generate binary demo feature according to 2017 Austin city mean
        train_obj.generate_binary_demo_attr(intersect_pos_set, 70.2222, 8.7057,
                                            32.6351, 42.0087, 6.453)

        # load 2d and 1d features
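        # note: use_1d_fea / use_2d_fea are commented out near the top of this script,
        # so they must be defined before this Austin branch can run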
        if use_2d_fea:
            print("use 2d feature")
            # landuse arr 28 28 1
            landuse_arr = np.load(
                '../feature_transform/austin_landuse_arr.npy')
            street_arr = np.load('../feature_transform/austin_street_arr.npy')
            # concatenate 2d data
            data_2d = np.concatenate([landuse_arr, street_arr], axis=2)
        else:
            print('ignore 2d data')
            data_2d = None

        if use_1d_fea:
            # weather: (1,1,6144,3)
            weather_arr = np.load(
                '../feature_transform/austin_weather_arr_1by1bytime.npy')
            weather_arr = weather_arr[0, 0, :, :]  # [6144, 3]
            # construct training / testing data for 1d data
            print(
                'generating fixed window length training and testing sequences for 1d data'
            )
            raw_seq_arr_1d = train_obj.generate_fixlen_timeseries(weather_arr)
            # test_series_1d.shape -> (169, 1296, 3)
            train_arr_1d, test_arr_1d = train_obj.train_test_split(
                raw_seq_arr_1d)
            #
        else:
            print('ignore 1d data')
            train_arr_1d = None
            test_arr_1d = None

        if os.path.isfile('../rideaustin/austin_28_20160801-20170413.npy'):
            print('loading raw data array...')
            rawdata_arr = np.load(
                '../rideaustin/austin_28_20160801-20170413.npy')
        else:
            print('generating raw data array')
            rawdata_arr = train_obj.df_to_tensor()
            np.save('../rideaustin/austin_28_20160801-20170413.npy',
                    rawdata_arr)
    else:
        print("Please input correct city name")


####################### city-agnostic treatment ################
# lamda = 0
# if a training dir is specified to resume training,
# the save_path is the same dir as train_dir;
# otherwise, create a new dir for training
    if suffix == '':
        save_path = './bike_intermediatefea_model_' + str(dim) + '/'
    else:
        save_path = './bike_intermediatefea_model_' + str(
            dim) + '_' + suffix + '/'

    if train_dir:
        save_path = train_dir

    print("training dir: ", train_dir)
    print("save_path: ", save_path)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # save demographic array
    #if os.path.isfile(save_path + 'demo_arr_32_20.npy'):
    #   print('loading demographic data array...')
    #  demo_arr = np.load(save_path + 'demo_arr_32_20.npy')
    #else:

    # generate mask arr for city boundary
    demo_mask_arr = train_obj.demo_mask()

    # generate demographic in array format
    print('generating demo_arr array')
    demo_arr = train_obj.selected_demo_to_tensor()
    if not os.path.isfile(save_path + str(place) + '_demo_arr_' + str(HEIGHT) +
                          '.npy'):
        np.save(save_path + str(place) + '_demo_arr_' + str(HEIGHT) + '.npy',
                demo_arr)

    # calculate statistics for demo
    # pop_df, pop_ratio_df = train_obj.generate_pop_df()
    # pop_df.to_csv(save_path + 'pop_df.csv')
    # pop_ratio_df.to_csv(save_path + 'pop_ratio_df.csv')
    #
    # # demo_pop: if IFG, RFG, equal mean, use normalized pop.
    # # if pairwise, use non-normalized pop
    # if fairloss == "pairwise":
    # #demo_pop = demo_arr[:,:,1]  # normalized pop
    #     demo_pop = demo_arr[:,:,0]  #  pop # use pop for pairwise loss
    # else:
    #     demo_pop = demo_arr[:,:,1]  # normalized pop
    # demo_pop = np.expand_dims(demo_pop, axis=2)
    # print('demo_pop.shape: ',  demo_pop.shape)

    # demo sensitive
    '''
    ['pop','normalized_pop','bi_caucasian','bi_age','bi_high_incm',
    'bi_edu_univ','bi_nocar_hh','white_pop','age65_under','edu_uni']
    '''
    # demo_sensitive = demo_arr[:,:,2]  # caucasian
    # demo_sensitive = np.expand_dims(demo_sensitive, axis=2)

    # normalized population of each group
    '''
    caucasian	non_caucasian	senior	young	high_incm	low_incm
    high_edu	low_edu	  fewer_car	more_car
    '''
    # pop_g1 = pop_df['caucasian'].values[1]
    # pop_g2 = pop_df['non_caucasian'].values[1]
    #
    # if fairloss == 'RFG':  # metric1: region-based
    #     if multivar:
    #         print('MULTIVAR')
    #         fea_dim = [2,3,5]  # caucasian, age, edu_univ
    #         multi_pop_g1 = [pop_df['caucasian'].values[1], pop_df['young'].values[1], pop_df['high_edu'].values[1]]
    #         multi_pop_g2 = [pop_df['non_caucasian'].values[1], pop_df['senior'].values[1], pop_df['low_edu'].values[1]]
    #     else:  # single var
    #         fea_dim = [2]  # binary caucasian
    #         # multi_demo_sensitive = demo_arr[:,:,fea_dim]  # caucasian
    #         multi_pop_g1 = [pop_df['caucasian'].values[1]]
    #         multi_pop_g2 = [pop_df['non_caucasian'].values[1]]
    # elif fairloss == "IFG":
    #     if multivar:
    #         print('MULTIVAR')
    #         fea_dim = [7, 8, 9]  # multivar
    #     else:
    #         fea_dim = [7]  # white percent
    #     # multi_demo_sensitive = demo_arr[:,:,fea_dim]  # caucasian
    #     multi_pop_g1 = [pop_df['caucasian'].values[1], pop_df['young'].values[1], pop_df['high_edu'].values[1]]
    #     multi_pop_g2 = [pop_df['non_caucasian'].values[1], pop_df['senior'].values[1], pop_df['low_edu'].values[1]]
    # elif fairloss == "equalmean":
    #     fea_dim = [2]  # binar caucasian
    #     # multi_demo_sensitive = demo_arr[:,:,fea_dim]  # caucasian
    #     # multi_pop_g1 = [pop_df['caucasian'].values[1], pop_df['young'].values[1], pop_df['high_edu'].values[1]]
    #     # multi_pop_g2 = [pop_df['non_caucasian'].values[1], pop_df['senior'].values[1], pop_df['low_edu'].values[1]]
    #     multi_pop_g1 = [pop_df['caucasian'].values[1]]
    #     multi_pop_g2 = [pop_df['non_caucasian'].values[1]]
    #
    #     # multi_grid_g1 = [pop_df['caucasian'].values[0]]
    #     # multi_grid_g2 = [pop_df['non_caucasian'].values[0]]
    # elif fairloss == "pairwise":
    #     multi_pop_g1 = [pop_df['caucasian'].values[1]]
    #     multi_pop_g2 = [pop_df['non_caucasian'].values[1]]
    #     fea_dim = [2]  # binar caucasian
    #
    # multi_demo_sensitive = demo_arr[:,:,fea_dim]  # caucasian
    # multi_grid_g1 = [pop_df['caucasian'].values[0]]  # only for equal mean
    # multi_grid_g2 = [pop_df['non_caucasian'].values[0]]
    #

    # multi-var fairness input
    #fea_dim = [2,3,5]  # caucasian, age, edu_univ
    # fea_dim = [7]  # white percent
    # multi_demo_sensitive = demo_arr[:,:,fea_dim]  # caucasian

    # multi_pop_g1 = [pop_df['caucasian'].values[1], pop_df['young'].values[1], pop_df['high_edu'].values[1]]
    # multi_pop_g2 = [pop_df['non_caucasian'].values[1], pop_df['senior'].values[1], pop_df['low_edu'].values[1]]

    timer = str(time.time())
    if not resume_training:
        # Model fusion without fairness
        print('Train Model fusion without fairness')
        conv3d_predicted = fused_model_with_latent_features.Conv3D(
            train_obj,
            train_arr,
            test_arr,
            intersect_pos_set,
            # multi_demo_sensitive, demo_pop, multi_pop_g1, multi_pop_g2,
            # multi_grid_g1, multi_grid_g2, fairloss,
            # train_arr_1d, test_arr_1d, data_2d,
            latent_train_series,
            latent_test_series,
            demo_mask_arr,
            save_path,
            HEIGHT,
            WIDTH,
            TIMESTEPS,
            BIKE_CHANNEL,
            NUM_2D_FEA,
            NUM_1D_FEA,
            BATCH_SIZE,
            TRAINING_STEPS,
            LEARNING_RATE).conv3d_predicted
    else:
        # resume training
        print('resume training from: ', train_dir)
        conv3d_predicted = fused_model_with_latent_features.Conv3D(
            train_obj,
            train_arr,
            test_arr,
            intersect_pos_set,

            # train_arr_1d, test_arr_1d, data_2d,
            latent_train_series,
            latent_test_series,
            demo_mask_arr,
            train_dir,
            HEIGHT,
            WIDTH,
            TIMESTEPS,
            BIKE_CHANNEL,
            NUM_2D_FEA,
            NUM_1D_FEA,
            BATCH_SIZE,
            TRAINING_STEPS,
            LEARNING_RATE,
            False,
            checkpoint,
            True,
            train_dir).conv3d_predicted

    conv3d_predicted.index = pd.to_datetime(conv3d_predicted.index)
    conv3d_predicted.to_csv(save_path + 'fused_model_pred_' + timer + '.csv')
    #convlstm_predicted = pd.read_csv(save_path + 'convlstm_predicted.csv', index_col=0)
    #convlstm_predicted.index = pd.to_datetime(convlstm_predicted.index)
    eval_obj4 = evaluation.evaluation(test_df_cut, conv3d_predicted,
                                      train_obj.demo_raw)
    diff_df = eval_obj4.group_difference()
    diff_df.to_csv(save_path + str(place) + '_evaluation.csv')

    finegrain_diff_df = eval_obj4.individual_difference()
    finegrain_diff_df.to_csv(save_path + 'IFG_eval.csv')

    print('rmse for conv3d: ', eval_obj4.rmse_val)
    print('mae for conv3d: ', eval_obj4.mae_val)
    print('mape for conv3d: ', eval_obj4.mape_val)

    # plot train test accuracy
    train_test = pd.read_csv(save_path + 'ecoch_res_df_' + '.csv')
    train_test = train_test.loc[:,
                                ~train_test.columns.str.contains('^Unnamed')]
    total_loss = train_test[['train_loss', 'test_loss']].plot()
    plt.savefig(save_path + 'total_loss_finish.png')
    acc_loss = train_test[['train_acc', 'test_acc']].plot()
    plt.savefig(save_path + 'acc_loss_finish.png')
    # fair_loss = train_test[['train_fair', 'test_fair']].plot()
    # plt.savefig(save_path + 'fair_loss_finish.png')
    plt.close()

    txt_name = save_path + 'latent_fea_df_' + timer + '.txt'
    with open(txt_name, 'w') as the_file:
        the_file.write(
            'Only account for grids that intersect with city boundary \n')
        # the_file.write('lamda\n')
        # the_file.write(str(lamda) + '\n')
        # the_file.write('beta\n')
        # the_file.write(str(beta) + '\n')
        the_file.write('dim\n')
        the_file.write(str(dim) + '\n')
        # the_file.write('use_1d_fea\n')
        # the_file.write(str(use_1d_fea) + '\n')
        # the_file.write('use_2d_fea\n')
        # the_file.write(str(use_2d_fea) + '\n')
        # the_file.write('fairloss\n')
        # the_file.write(str(fairloss) + '\n')
        # the_file.write('multivar\n')
        # the_file.write(str(multivar) + '\n')
        the_file.write('learning rate\n')
        the_file.write(str(LEARNING_RATE) + '\n')
        the_file.write('rmse for conv3d\n')
        the_file.write(str(eval_obj4.rmse_val) + '\n')
        the_file.write('mae for conv3d\n')
        the_file.write(str(eval_obj4.mae_val) + '\n')
        the_file.write('mape for conv3d\n')
        the_file.write(str(eval_obj4.mape_val) + '\n')
