def quick_load_bj_taxi_external(timeslots, description, length):
    """
    load external data quickly if the time slots are the same
    :param timeslots: the demanded time slots
    :param description: 'global' or 'local'
    :param length: the time length of the slots, used in the cache filename
    :return: vacation, hour, dayOfWeek, weather, continuous_external
    """
    start_time = str(timeslots[0][0])
    end_time = str(timeslots[-1][0])

    path = r'/home/ryj/renyajie/exp/GLST_Net/inter_data/data'
    filename = 'bj_taxi_external_{}_{}_{}_{}.h5'.format(
        description, length, start_time, end_time)
    f = h5py.File(os.path.join(path, filename), 'a')

    # encode the time
    encode_time = np.asarray([utils.encode_time(batch) for batch in timeslots])

    # if the time slots match the cache, load directly
    if 'timeslots' in f and (encode_time == f['timeslots'][()]).all():
        print('cache load bj taxi {} external data from {} to {}'.format(
            description, start_time, end_time))
        print('-' * 30)

        vacation = f['vacation'][()]
        dayOfWeek = f['dayOfWeek'][()]
        weather = f['weather'][()]
        continuous_external = f['continuous_external'][()]
        hour = f['hour'][()]
        f.close()
        return vacation, hour, dayOfWeek, weather, continuous_external
    f.close()

    # else calculate, then cache
    f = h5py.File(os.path.join(path, filename), 'w')
    print('calculate bj taxi {} external data from {} to {}'.format(
        description, start_time, end_time))

    if description == 'global':
        vacation, hour, dayOfWeek, weather, continuous_external = \
            load_bj_taxi_external(timeslots, 'bj taxi global external data')
    else:
        vacation, hour, dayOfWeek, weather, continuous_external = \
            load_bj_taxi_external(timeslots, 'bj taxi local external data')

    print('cache bj taxi {} external data from {} to {}'.format(
        description, start_time, end_time))

    f['timeslots'] = encode_time
    f['vacation'] = vacation
    f['hour'] = hour
    f['dayOfWeek'] = dayOfWeek
    f['weather'] = weather
    f['continuous_external'] = continuous_external
    f.close()

    return vacation, hour, dayOfWeek, weather, continuous_external
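# The quick_load_*_external helpers above and below share one idiom: open the
# HDF5 file in append mode, trust the cache only when the stored time key
# matches the request, otherwise recompute and overwrite. A minimal,
# self-contained sketch of that idiom follows; the names (_cached_or_compute,
# demo_cache.h5) are illustrative and not part of this pipeline.
def _cached_or_compute(key, compute, cache_file='demo_cache.h5'):
    """Return compute(), caching the result in cache_file keyed by `key`.

    key: a numeric or byte-string numpy array identifying the request.
    """
    import h5py
    import numpy as np

    key = np.asarray(key)
    with h5py.File(cache_file, 'a') as f:
        # compare shapes first so the element-wise == cannot broadcast-fail
        if 'key' in f and f['key'][()].shape == key.shape \
                and (f['key'][()] == key).all():
            return f['value'][()]          # cache hit: load directly
    value = compute()                      # cache miss: recompute ...
    with h5py.File(cache_file, 'w') as f:  # ... and overwrite the stale file
        f['key'] = key
        f['value'] = value
    return value

# e.g. the second of two identical calls is served from the cache:
#     v = _cached_or_compute(np.arange(3), lambda: np.ones((3, 3)))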
def quick_load_ny_bike_external(timeslots, description, length):
    """
    load external data quickly if the time slots are the same
    :param timeslots: the demanded time slots
    :param description: the slot type, e.g. 'global', 'local', 'recent',
        'daily', 'week' or 'current'
    :param length: the time length of the slots, used in the cache filename
    :return: hour, dayOfWeek
    """
    start_time = str(timeslots[0][0])
    end_time = str(timeslots[-1][0])

    path = r'/home/ryj/renyajie/exp/GLST_Net/inter_data/data'
    filename = 'ny_bike_external_{}_{}_{}_{}.h5'.format(
        description, length, start_time, end_time)
    f = h5py.File(os.path.join(path, filename), 'a')

    # encode the time
    encode_time = np.asarray([utils.encode_time(batch) for batch in timeslots])

    # if the time slots match the cache, load directly
    if 'timeslots' in f and (encode_time == f['timeslots'][()]).all():
        print('cache load ny bike {} external data from {} to {}'.format(
            description, start_time, end_time))
        print('-' * 30)

        dayOfWeek = f['dayOfWeek'][()]
        hour = f['hour'][()]
        f.close()
        return hour, dayOfWeek
    f.close()

    # else calculate, then cache
    f = h5py.File(os.path.join(path, filename), 'w')
    print('calculate ny bike {} external data from {} to {}'.format(
        description, start_time, end_time))

    if description == 'global':
        hour, dayOfWeek = load_ny_bike_external(
            timeslots, 'ny bike global external data')
    else:
        hour, dayOfWeek = load_ny_bike_external(
            timeslots, 'ny bike local external data')

    print('cache ny bike {} external data from {} to {}'.format(
        description, start_time, end_time))

    f['timeslots'] = encode_time
    f['hour'] = hour
    f['dayOfWeek'] = dayOfWeek
    f.close()

    return hour, dayOfWeek
def quick_get_data(dataset, predict_time, index_cut, neighbor_size,
                   lstm_seq_len, flow_gate_len, att_lstm_num,
                   att_lstm_seq_len):
    """
    load stdn data quickly if there is the file
    :param dataset:
    :param predict_time:
    :param index_cut:
    :param neighbor_size:
    :param lstm_seq_len:
    :param flow_gate_len:
    :param att_lstm_num:
    :param att_lstm_seq_len:
    :return:
    """
    start_time = str(predict_time[0])
    end_time = str(predict_time[-1])

    path = r'/home/ryj/renyajie/exp/GLST_Net/inter_data/data'
    # include every hyper-parameter in the cache name, so that configurations
    # differing only in att_lstm_seq_len do not collide
    filename = '{}_baseline_stdn_neighbor_{}_{}_{}_{}_{}.h5'.format(
        dataset, neighbor_size, lstm_seq_len, flow_gate_len,
        att_lstm_num, att_lstm_seq_len)
    f = h5py.File(os.path.join(path, filename), 'a')

    # encode the time
    encode_time = np.asarray(
        [utils.encode_time(batch) for batch in predict_time])

    # if the same, load directly
    if 'predict_time' in f and (encode_time == f['predict_time'][()]).all():
        print('cache load stdn {} data from {} to {}, neighbor is {}'.format(
            dataset, start_time, end_time, neighbor_size))
        print('-' * 30)

        train_dict = {key + '_train': f[key + '_train'][()]
                      for key in data_name}
        test_dict = {key + '_test': f[key + '_test'][()] for key in data_name}
        external_dim = f['external_dim'][()]

        # trust the cache only when the array shapes match the current
        # hyper-parameters; otherwise fall through and recompute
        if (train_dict['lstm_input_train'].shape[1] == lstm_seq_len
                and train_dict['flow_input_train'].shape[3]
                    == neighbor_size * 2 + 1
                and train_dict['att_nbhd_input_train'].shape[1]
                    == att_lstm_num * att_lstm_seq_len
                and train_dict['att_lstm_input_train'].shape[1]
                    == att_lstm_num):
            f.close()
            return train_dict, test_dict, external_dim
    f.close()

    # else calculate, then cache
    f = h5py.File(os.path.join(path, filename), 'w')
    print('calculate stdn {} data from {} to {}, neighbor is {}'.format(
        dataset, start_time, end_time, neighbor_size))

    train_dict, test_dict, external_dim = \
        get_data(dataset, predict_time, index_cut, neighbor_size,
                 lstm_seq_len, flow_gate_len, att_lstm_num, att_lstm_seq_len)

    print('cache stdn {} data from {} to {}, neighbor is {}'.format(
        dataset, start_time, end_time, neighbor_size))

    f['predict_time'] = encode_time
    for data in [test_dict, train_dict]:
        for key, value in data.items():
            f[key] = value
    f['external_dim'] = external_dim
    f.close()

    return train_dict, test_dict, external_dim
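# Hypothetical helper distilling the shape guard above: even on a filename
# match, the cached arrays are trusted only if their shapes agree with the
# sizes implied by the current hyper-parameters; any mismatch falls through
# to a recompute. The helper name and call shape are illustrative only.
def _cache_shapes_ok(checks):
    """checks: iterable of (array, axis, expected_size) triples."""
    return all(arr.shape[axis] == size for arr, axis, size in checks)

# e.g., with the names from quick_get_data above:
#     _cache_shapes_ok([
#         (train_dict['lstm_input_train'], 1, lstm_seq_len),
#         (train_dict['flow_input_train'], 3, neighbor_size * 2 + 1),
#         (train_dict['att_nbhd_input_train'], 1,
#          att_lstm_num * att_lstm_seq_len),
#         (train_dict['att_lstm_input_train'], 1, att_lstm_num),
#     ])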
def quick_get_data(dataset, predict_time, index_cut, neighbor_size,
                   len_close, len_period, len_trend):
    """
    load st-res data quickly if there is the file
    :param dataset:
    :param predict_time:
    :param index_cut:
    :param neighbor_size:
    :param len_close:
    :param len_period:
    :param len_trend:
    :return:
    """
    start_time = str(predict_time[0])
    end_time = str(predict_time[-1])

    path = r'/home/ryj/renyajie/exp/GLST_Net/inter_data/data'
    filename = '{}_baseline_stres_neighbor_{}_{}_{}_{}.h5'.format(
        dataset, neighbor_size, len_close, len_period, len_trend)
    f = h5py.File(os.path.join(path, filename), 'a')

    # encode the time
    encode_time = np.asarray(
        [utils.encode_time(batch) for batch in predict_time])

    # if the same, load directly
    if 'predict_time' in f and (encode_time == f['predict_time'][()]).all():
        print('cache load st-res {} data from {} to {}, neighbor is {}'.format(
            dataset, start_time, end_time, neighbor_size))
        print('-' * 30)

        X_train = f['X_train'][()]
        X_test = f['X_test'][()]
        Y_train = f['Y_train'][()]
        Y_test = f['Y_test'][()]
        T_train = f['T_train'][()]
        T_test = f['T_test'][()]
        External_train = f['External_train'][()]
        External_test = f['External_test'][()]
        external_dim = f['external_dim'][()]
        f.close()
        return X_train, X_test, Y_train, Y_test, T_train, T_test, \
            External_train, External_test, external_dim
    f.close()

    # else calculate, then cache
    f = h5py.File(os.path.join(path, filename), 'w')
    print('calculate st-res {} data from {} to {}, neighbor is {}'.format(
        dataset, start_time, end_time, neighbor_size))

    X_train, X_test, Y_train, Y_test, T_train, T_test, \
        External_train, External_test, external_dim = \
        get_data(dataset, predict_time, index_cut, neighbor_size,
                 len_close, len_period, len_trend)

    encode_T_train = np.asarray(
        [utils.encode_time(batch) for batch in T_train])
    encode_T_test = np.asarray([utils.encode_time(batch) for batch in T_test])

    print('cache st-res {} data from {} to {}, neighbor is {}'.format(
        dataset, start_time, end_time, neighbor_size))

    f['predict_time'] = encode_time
    f['X_train'] = X_train
    f['X_test'] = X_test
    f['Y_train'] = Y_train
    f['Y_test'] = Y_test
    f['T_train'] = encode_T_train
    f['T_test'] = encode_T_test
    f['External_train'] = External_train
    f['External_test'] = External_test
    f['external_dim'] = external_dim
    f.close()

    return X_train, X_test, Y_train, Y_test, T_train, T_test, \
        External_train, External_test, external_dim
def quick_get_data(dataset, predict_time, hours, days, weeks):
    """
    load astgcn data quickly if there is the file
    :param dataset:
    :param predict_time:
    :param hours:
    :param days:
    :param weeks:
    :return:
    """
    start_time = str(predict_time[0])
    end_time = str(predict_time[-1])

    path = r'/home/ryj/renyajie/exp/GLST_Net/inter_data/data'
    filename = '{}_baseline_astgcn_{}_{}_{}.h5'.format(
        dataset, hours, days, weeks)
    f = h5py.File(os.path.join(path, filename), 'a')

    # encode the time
    encode_time = np.asarray(
        [utils.encode_time(batch) for batch in predict_time])

    # if the same, load directly
    if 'predict_time' in f and (encode_time == f['predict_time'][()]).all():
        print('cache load astgcn {} data from {} to {}'.format(
            dataset, start_time, end_time))
        print('-' * 30)

        X_train = f['X_train'][()]
        X_test = f['X_test'][()]
        Y_train = f['Y_train'][()]
        Y_test = f['Y_test'][()]
        T_train = f['T_train'][()]
        T_test = f['T_test'][()]
        f.close()
        return X_train, X_test, Y_train, Y_test, T_train, T_test
    f.close()

    # else calculate, then cache
    f = h5py.File(os.path.join(path, filename), 'w')
    print('calculate astgcn {} data from {} to {}'.format(
        dataset, start_time, end_time))

    X_train, X_test, Y_train, Y_test, T_train, T_test = \
        get_data(dataset, predict_time, hours, days, weeks)

    encode_T_train = np.asarray(
        [utils.encode_time(batch) for batch in T_train])
    encode_T_test = np.asarray([utils.encode_time(batch) for batch in T_test])

    print('cache astgcn {} data from {} to {}'.format(
        dataset, start_time, end_time))

    f['predict_time'] = encode_time
    f['X_train'] = X_train
    f['X_test'] = X_test
    f['Y_train'] = Y_train
    f['Y_test'] = Y_test
    f['T_train'] = encode_T_train
    f['T_test'] = encode_T_test
    f.close()

    return X_train, X_test, Y_train, Y_test, T_train, T_test
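# utils.encode_time is defined elsewhere; judging from the .decode('utf-8')
# calls in the loaders below, it turns timeslot strings into bytes so the
# arrays land in HDF5 as fixed-width byte strings (h5py cannot store numpy
# unicode arrays directly). A plausible standalone sketch of that round
# trip, offered as an assumption rather than the real helper:
def _encode_slots(slots):
    """Encode an iterable of timeslot strings to a byte-string array."""
    import numpy as np
    return np.asarray([s.encode('utf-8') for s in slots])

def _decode_slots(encoded):
    """Decode a byte-string array back to Python strings."""
    return [b.decode('utf-8') for b in encoded]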
def load_ny_bike(proportion_test, len_global=7, len_local=4, neighbor_size=3,
                 region_num=5):
    """
    load all the data
    args:
        proportion_test: the proportion of the test set
        len_global: the time length of global data
        len_local: the time length of local data
        neighbor_size: the local size, size = (val * 2 + 1) * (val * 2 + 1)
        region_num: how many regions that a map contains
    ret:
        train set and test set, including:
        1. global external and flow data
        2. local external and flow data
        3. ground truth
    """
    date, data, mmn, index = load_ny_bike_flow()

    # get global and local flow data, ground truth and the corresponding date
    global_flow, stack_local_flow, ground_truth, current_local_flow, \
        index_cut, predict_time, global_timeslots, local_timeslots = \
        utils.get_flow_data(date, data, len_global, len_local, neighbor_size,
                            region_num, unit_len=24, width=16, height=8)

    # get global and local external data
    g_hour, g_dayOfWeek = quick_load_ny_bike_external(
        global_timeslots, 'global', len_global)
    t_hour, t_dayOfWeek = quick_load_ny_bike_external(
        local_timeslots, 'local', len_local)

    # change the time encoding to ascii
    predict_time = np.asarray(utils.encode_time(predict_time))
    global_timeslots = np.asarray(
        [utils.encode_time(batch) for batch in global_timeslots])
    local_timeslots = np.asarray(
        [utils.encode_time(batch) for batch in local_timeslots])

    # build the train set and the test set according to proportion_test
    data_dict = {
        'global_flow': global_flow,
        'stack_local_flow': stack_local_flow,
        'ground_truth': ground_truth,
        'current_local_flow': current_local_flow,
        'index_cut': index_cut,
        'predict_time': predict_time,
        'global_timeslots': global_timeslots,
        'local_timeslots': local_timeslots,
        'g_hour': g_hour,
        'g_dayOfWeek': g_dayOfWeek,
        't_hour': t_hour,
        't_dayOfWeek': t_dayOfWeek
    }
    data_dict = utils.duplicate_data(data_dict, region_num)

    total_length = g_dayOfWeek.shape[0]
    len_test = math.ceil(total_length * proportion_test)
    len_train = total_length - len_test
    print('train set length {:d}\ntest set length {:d}\n{}'.format(
        len_train, len_test, '-' * 30))

    train_set, test_set, data_name = utils.divide_train_and_test(
        len_test, data_dict)

    print('predict start: {}\npredict end: {}'.format(
        data_dict["predict_time"][0].decode('utf-8'),
        data_dict["predict_time"][-1].decode('utf-8')))
    print('-' * 30)

    return train_set, test_set, mmn, data_name
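# Worked example of the proportion split above (illustrative numbers only):
# with total_length = 1000 and proportion_test = 0.2,
#     len_test  = math.ceil(1000 * 0.2)  # -> 200
#     len_train = 1000 - 200             # -> 800
# math.ceil rounds the test length up, so any non-zero proportion yields at
# least one test sample.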
def load_rdw_ny_bike(proportion_test, len_recent=4, len_daily=4, len_week=4,
                     neighbor_size=2, region_num=5):
    """
    load all the data
    args:
        proportion_test: the proportion of the test set
        len_recent: the time length of recent data
        len_daily: the time length of daily data
        len_week: the time length of week data
        neighbor_size: the local size, size = (val * 2 + 1) * (val * 2 + 1)
        region_num: how many regions that a map contains
    ret:
        train set and test set, including:
        1. recent, daily and week flow data
        2. the corresponding external data
        3. ground truth
    """
    date, data, mmn, index = load_ny_bike_flow()

    # get recent, daily and week flow data, ground truth and the
    # corresponding date
    recent_local_flow, daily_local_flow, week_local_flow, ground_truth, \
        current_local_flow, index_cut, predict_time, recent_time, \
        daily_time, week_time, current_time = \
        utils.get_flow_rdw_data(date, data, len_recent, len_daily, len_week,
                                neighbor_size, region_num, unit_len=24,
                                width=16, height=8)

    # get recent, daily, week and current external data
    recent_hour, recent_dayOfWeek = quick_load_ny_bike_external(
        recent_time, 'recent', len_recent)
    daily_hour, daily_dayOfWeek = quick_load_ny_bike_external(
        daily_time, 'daily', len_daily)
    week_hour, week_dayOfWeek = quick_load_ny_bike_external(
        week_time, 'week', len_week)
    current_hour, current_dayOfWeek = quick_load_ny_bike_external(
        current_time, 'current', 1)

    # change the time encoding to ascii
    predict_time = np.asarray(utils.encode_time(predict_time))
    recent_time = np.asarray(
        [utils.encode_time(batch) for batch in recent_time])
    daily_time = np.asarray([utils.encode_time(batch) for batch in daily_time])
    week_time = np.asarray([utils.encode_time(batch) for batch in week_time])
    current_time = np.asarray(
        [utils.encode_time(batch) for batch in current_time])

    # build the train set and the test set according to proportion_test
    data_dict = {
        'recent_local_flow': recent_local_flow,
        'daily_local_flow': daily_local_flow,
        'week_local_flow': week_local_flow,
        'current_local_flow': current_local_flow,
        'ground_truth': ground_truth,
        'index_cut': index_cut,
        'predict_time': predict_time,
        'recent_time': recent_time,
        'daily_time': daily_time,
        'week_time': week_time,
        'current_time': current_time,
        'recent_hour': recent_hour,
        'recent_dayOfWeek': recent_dayOfWeek,
        'daily_hour': daily_hour,
        'daily_dayOfWeek': daily_dayOfWeek,
        'week_hour': week_hour,
        'week_dayOfWeek': week_dayOfWeek,
        'current_hour': current_hour,
        'current_dayOfWeek': current_dayOfWeek
    }
    data_dict = utils.duplicate_rdw_data(data_dict, region_num)

    total_length = current_time.shape[0]
    len_test = math.ceil(total_length * proportion_test)
    len_train = total_length - len_test
    print('train set length {:d}\ntest set length {:d}\n{}'.format(
        len_train, len_test, '-' * 30))

    train_set, test_set, data_name = utils.divide_train_and_test(
        len_test, data_dict)

    print('predict start: {}\npredict end: {}'.format(
        data_dict["predict_time"][0].decode('utf-8'),
        data_dict["predict_time"][-1].decode('utf-8')))
    print('-' * 30)

    return train_set, test_set, mmn, data_name
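# Hypothetical top-level usage (the proportion value is a placeholder): both
# loaders return (train_set, test_set, mmn, data_name), where mmn is
# presumably the normaliser fitted inside load_ny_bike_flow and data_name
# lists the keys of the arrays in each set.
#
#     train_set, test_set, mmn, data_name = load_ny_bike(proportion_test=0.2)
#     rdw_train, rdw_test, mmn, rdw_names = load_rdw_ny_bike(
#         proportion_test=0.2)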