import numpy as np
import pandas as pd
from statsmodels.tsa.api import VAR

# Note: StandardScaler here is the project's custom mean/std scaler, not sklearn's.


def var_predict(df, n_forwards=(1, 3), n_lags=4, test_ratio=0.2):
    n_sample, n_output = df.shape
    n_test = int(round(n_sample * test_ratio))
    n_train = n_sample - n_test
    df_train, df_test = df[:n_train], df[n_train:]

    # Fit a VAR model on the normalized training split.
    scaler = StandardScaler(mean=df_train.values.mean(), std=df_train.values.std())
    data = scaler.transform(df_train.values)
    var_model = VAR(data)
    var_result = var_model.fit(n_lags)
    max_n_forwards = np.max(n_forwards)

    # Do forecasting.
    result = np.zeros(shape=(len(n_forwards), n_test, n_output))
    start = n_train - n_lags - max_n_forwards + 1
    for input_ind in range(start, n_sample - n_lags):
        prediction = var_result.forecast(
            scaler.transform(df.values[input_ind:input_ind + n_lags]), max_n_forwards)
        for i, n_forward in enumerate(n_forwards):
            result_ind = input_ind - n_train + n_lags + n_forward - 1
            if 0 <= result_ind < n_test:
                result[i, result_ind, :] = prediction[n_forward - 1, :]

    df_predicts = []
    for i, n_forward in enumerate(n_forwards):
        df_predict = pd.DataFrame(scaler.inverse_transform(result[i]),
                                  index=df_test.index, columns=df_test.columns)
        df_predicts.append(df_predict)
        # The same file is rewritten on every iteration, so only the last
        # horizon's predictions end up on disk.
        df_predict.to_csv("./df_predict.csv", sep=',', index=False)
    df_test.to_csv("./df_test.csv", sep=',', index=False)
    return df_predicts, df_test
def test_transform_df(self):
    df = pd.DataFrame([[35., 0.], [0., 17.5], [70., 35.]])
    expected_result = np.array([[0., -1.], [-1., -0.5], [1., 0.]])
    scaler = StandardScaler(mean=35., std=35.)
    result = scaler.transform(df)
    self.assertTrue(np.array_equal(expected_result, result.values))
def var_predict(df, n_forwards=(1, 3), n_lags=4, test_ratio=0.2):
    """
    Multivariate time series forecasting using the Vector Auto-Regressive (VAR) model.

    :param df: pandas.DataFrame, index: time, columns: sensor id, content: data.
    :param n_forwards: a tuple of forecasting horizons.
    :param n_lags: the order (lag) of the VAR model.
    :param test_ratio: fraction of samples reserved for the test split.
    :return: [list of predictions, one per horizon], df_test
    """
    n_sample, n_output = df.shape
    n_test = int(round(n_sample * test_ratio))
    n_train = n_sample - n_test
    df_train, df_test = df[:n_train], df[n_train:]

    scaler = StandardScaler(mean=df_train.values.mean(), std=df_train.values.std())
    data = scaler.transform(df_train.values)
    var_model = VAR(data)
    var_result = var_model.fit(n_lags)
    max_n_forwards = np.max(n_forwards)

    # Do forecasting.
    result = np.zeros(shape=(len(n_forwards), n_test, n_output))
    start = n_train - n_lags - max_n_forwards + 1
    for input_ind in range(start, n_sample - n_lags):
        prediction = var_result.forecast(
            scaler.transform(df.values[input_ind:input_ind + n_lags]), max_n_forwards)
        for i, n_forward in enumerate(n_forwards):
            result_ind = input_ind - n_train + n_lags + n_forward - 1
            if 0 <= result_ind < n_test:
                result[i, result_ind, :] = prediction[n_forward - 1, :]

    df_predicts = []
    for i, n_forward in enumerate(n_forwards):
        df_predict = pd.DataFrame(scaler.inverse_transform(result[i]),
                                  index=df_test.index, columns=df_test.columns)
        df_predicts.append(df_predict)
    return df_predicts, df_test
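# Usage sketch for var_predict. The date range, frequency, and sensor columns
# below are invented for illustration; it assumes the project's custom
# StandardScaler and statsmodels' VAR are in scope.
import numpy as np
import pandas as pd

index = pd.date_range('2012-03-01', periods=2016, freq='5min')
demo_df = pd.DataFrame(np.random.rand(2016, 3), index=index,
                       columns=['sensor_a', 'sensor_b', 'sensor_c'])
df_predicts, df_test = var_predict(demo_df, n_forwards=(1, 3), n_lags=4, test_ratio=0.2)
# df_predicts[i] holds the n_forwards[i]-step-ahead forecasts, aligned with df_test's index.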
def __init__(self, config, df_data, **kwargs):
    self._config = dict(config)
    self._epoch = 0

    # Logging.
    self._init_logging()
    self._logger.info(config)

    # Data preparation.
    test_ratio = self._get_config('test_ratio')
    validation_ratio = self._get_config('validation_ratio')
    self._df_train, self._df_val, self._df_test = utils.train_val_test_split_df(
        df_data, val_ratio=validation_ratio, test_ratio=test_ratio)
    self._scaler = StandardScaler(mean=self._df_train.values.mean(),
                                  std=self._df_train.values.std())
    self._x_train, self._y_train, self._x_val, self._y_val, self._x_test, self._y_test = \
        self._prepare_train_val_test_data()
    self._eval_dfs = self._prepare_eval_df()

    # Build models.
    self._train_model, self._val_model, self._test_model = self._build_train_val_test_models()

    # Log model statistics.
    total_trainable_parameter = tf_utils.get_total_trainable_parameter_size()
    self._logger.info('Total number of trainable parameters: %d' % total_trainable_parameter)
    for var in tf.global_variables():
        self._logger.debug('%s, %s' % (var.name, var.get_shape()))
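# utils.train_val_test_split_df is not included in this collection. A plausible
# sketch, assuming the same chronological split convention as var_predict above
# (train first, then validation, then test); the real helper may differ.
def train_val_test_split_df(df, val_ratio=0.1, test_ratio=0.2):
    n_sample = df.shape[0]
    n_val = int(round(n_sample * val_ratio))
    n_test = int(round(n_sample * test_ratio))
    n_train = n_sample - n_val - n_test
    df_train = df.iloc[:n_train]
    df_val = df.iloc[n_train:n_train + n_val]
    df_test = df.iloc[-n_test:]
    return df_train, df_val, df_test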
def evaluate(self, sess, **kwargs):
    y_preds_all = []
    half_length = int(len(self.clusters) / 2)  # currently unused
    sclusters = self.clusters[0:32]
    for cluster in sclusters:
        node_count, adj_mx = self.cluster_data(cluster)
        adj_mx = utils.calculate_random_walk_matrix(adj_mx).T
        adj_mx = self._build_sparse_matrix(adj_mx)
        global_step = sess.run(tf.train.get_or_create_global_step())

        # Load the scaler statistics saved during preprocessing.
        scaler_path = self._kwargs['data'].get('dataset_dir') + '/scaler.npy'
        scaler_data_ = np.load(scaler_path)
        mean, std = scaler_data_[0], scaler_data_[1]
        scaler = StandardScaler(mean=mean, std=std)

        # change val to test before run
        test_data_path = self._kwargs['data'].get('dataset_dir') + \
            '/test_' + str(cluster) + '.tfrecords'
        test_dataset = tf.data.TFRecordDataset([test_data_path])
        test_dataset = test_dataset.map(self._parse_record_fn)
        test_iterator = test_dataset.make_one_shot_iterator()
        test_next_element = test_iterator.get_next()

        test_results = self.run_epoch_generator(sess, self._test_model, test_next_element,
                                                adj_mx, return_output=True, training=False)
        test_loss, y_preds = test_results['loss'], test_results['outputs']
        utils.add_simple_summary(self._writer, ['loss/test_loss'], [test_loss],
                                 global_step=global_step)

        y_preds = np.concatenate(y_preds, axis=0)
        # Undo normalization at the requested horizon and trim padded nodes.
        y_preds = scaler.inverse_transform(y_preds[:, self.horizon - 1, :, 0])
        y_preds = y_preds[:, 0:node_count]
        y_preds_all.append(y_preds)
    y_preds_all = np.concatenate(y_preds_all, axis=1)
    return y_preds_all
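# _build_sparse_matrix is referenced above but not defined in this collection.
# A sketch consistent with its use here (converting a scipy.sparse random-walk
# matrix into a TF 1.x SparseTensor); the actual implementation may differ.
import numpy as np
import tensorflow as tf


def _build_sparse_matrix(L):
    L = L.tocoo()
    indices = np.column_stack((L.row, L.col))
    L = tf.SparseTensor(indices, L.data, L.shape)
    return tf.sparse_reorder(L)  # TF 1.x ops expect row-major ordered indices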
def test_reverse_transform(self):
    data = np.array([[0., -1.], [-1., -0.5], [1., 0.]])
    expected_result = np.array([[35., 0.], [0., 17.5], [70., 35.]])
    scaler = StandardScaler(mean=35., std=35.)
    result = scaler.inverse_transform(data)
    self.assertTrue(np.array_equal(expected_result, result))
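# The StandardScaler used throughout these snippets is not sklearn's: it is
# constructed with explicit mean/std. A minimal implementation consistent with
# the two tests above (transform: (x - mean) / std; inverse: x * std + mean).
# setup_dataloader below constructs it with an extra `scale` argument, a
# variant this sketch does not cover.
class StandardScaler:
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def transform(self, data):
        return (data - self.mean) / self.std

    def inverse_transform(self, data):
        return data * self.std + self.mean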
def setup_dataloader(arr3d, seq_len, horizon, length_dict, train_batch_size,
                     val_batch_size, test_batch_size, scale, features, logger,
                     seq_sampling):
    train_length = length_dict['train_length']
    val_length = length_dict['val_length']
    test_length = length_dict['test_length']

    test_arr3d = arr3d[-test_length:]
    val_arr3d = arr3d[train_length:train_length + val_length] if val_length > 0 else test_arr3d
    train_arr3d = arr3d[:train_length]

    train_arr2d = train_arr3d[:, :, 0]
    val_arr2d = val_arr3d[:, :, 0]
    test_arr2d = test_arr3d[:, :, 0]

    train_z_arr3d = train_arr3d.copy()
    val_z_arr3d = val_arr3d.copy()
    test_z_arr3d = test_arr3d.copy()

    # Normalize feature 0 with statistics computed on the training split only.
    scaler = StandardScaler(mean=train_arr2d.mean(), std=train_arr2d.std(), scale=scale)
    train_z_arr3d[:, :, 0] = scaler.transform(train_arr2d)
    val_z_arr3d[:, :, 0] = scaler.transform(val_arr2d)
    test_z_arr3d[:, :, 0] = scaler.transform(test_arr2d)

    dataloaders = {}
    dataloaders['test_loader'] = SpatioTemporalDataLoader(
        test_z_arr3d, test_batch_size, seq_len, horizon, shuffle=False,
        features=features, seq_sampling=seq_sampling)
    assert dataloaders['test_loader'].num_batch > 0, \
        'num_batch for test dataset should be > 0'
    dataloaders['val_loader'] = SpatioTemporalDataLoader(
        val_z_arr3d, val_batch_size, seq_len, horizon, shuffle=False,
        features=features, seq_sampling=seq_sampling)
    dataloaders['train_loader'] = SpatioTemporalDataLoader(
        train_z_arr3d, train_batch_size, seq_len, horizon, shuffle=True,
        features=features, seq_sampling=seq_sampling)
    dataloaders['scaler'] = scaler

    logger.info('[train]      | # timesteps: {:06d} | # samples: {:06d} | # batches: {:06d}'.format(
        train_length, dataloaders['train_loader'].size, dataloaders['train_loader'].num_batch))
    logger.info('[validation] | # timesteps: {:06d} | # samples: {:06d} | # batches: {:06d}'.format(
        val_length, dataloaders['val_loader'].size, dataloaders['val_loader'].num_batch))
    logger.info('[test]       | # timesteps: {:06d} | # samples: {:06d} | # batches: {:06d}'.format(
        test_length, dataloaders['test_loader'].size, dataloaders['test_loader'].num_batch))
    return dataloaders
def main(args):
    with open(args.config_filename) as f:
        supervisor_config = yaml.load(f, Loader=yaml.FullLoader)

        graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
        sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
        supervisor_config['model']['num_nodes'] = num_nodes = len(sensor_ids)

        # Data preprocessing.
        traffic_df_filename = supervisor_config['data']['hdf_filename']
        df_data = pd.read_hdf(traffic_df_filename)
        # df_data = df_data.iloc[int(df_data.shape[0] / 3):, :]
        validation_ratio = supervisor_config.get('data').get('validation_ratio')
        test_ratio = supervisor_config.get('data').get('test_ratio')
        df_train, df_val, df_test = train_val_test_split(
            df_data, val_ratio=validation_ratio, test_ratio=test_ratio)

        batch_size = supervisor_config.get('data').get('batch_size')
        val_batch_size = supervisor_config.get('data').get('val_batch_size')
        test_batch_size = supervisor_config.get('data').get('test_batch_size')
        horizon = supervisor_config.get('model').get('horizon')
        seq_len = supervisor_config.get('model').get('seq_len')
        scaler = StandardScaler(mean=df_train.values.mean(), std=df_train.values.std())

        data_train = generate_seq2seq_data(df_train, batch_size, seq_len, horizon,
                                           num_nodes, 'train', scaler)
        data_val = generate_seq2seq_data(df_val, val_batch_size, seq_len, horizon,
                                         num_nodes, 'val', scaler)
        data_train.update(data_val)
        # data_train['scaler'] = scaler

        data_test = generate_seq2seq_data(df_test, test_batch_size, seq_len, horizon,
                                          num_nodes, 'test', scaler)
        # data_test['scaler'] = scaler

        tf_config = tf.ConfigProto()
        if args.use_cpu_only:
            tf_config = tf.ConfigProto(device_count={'GPU': 0})
        tf_config.gpu_options.allow_growth = True
        with tf.Session(config=tf_config) as sess:
            supervisor = DCRNNSupervisor(adj_mx, data_train, supervisor_config)

            data_tag = supervisor_config.get('data').get('dataset_dir')
            folder = data_tag + '/model/'
            if not os.path.exists(folder):
                os.makedirs(folder)

            # Train.
            supervisor.train(sess=sess)

            # Test: pick the most recently written model config.
            yaml_files = glob.glob('%s/model/*/*.yaml' % data_tag, recursive=True)
            yaml_files.sort(key=os.path.getmtime)
            config_filename = yaml_files[-1]  # 'config_%d.yaml' % config_id
            with open(config_filename) as cf:
                config = yaml.load(cf, Loader=yaml.FullLoader)

            # Load model and evaluate.
            supervisor.load(sess, config['train']['model_filename'])
            y_preds = supervisor.evaluate(sess, data_test)

            n_test_samples = data_test['y_test'].shape[0]
            folder = data_tag + '/results/'
            if not os.path.exists(folder):
                os.makedirs(folder)
            for horizon_i in range(data_test['y_test'].shape[1]):
                y_pred = scaler.inverse_transform(y_preds[:, horizon_i, :, 0])
                eval_dfs = df_test[seq_len + horizon_i:seq_len + horizon_i + n_test_samples]
                df = pd.DataFrame(y_pred, index=eval_dfs.index, columns=eval_dfs.columns)
                # df = pd.DataFrame(y_pred, columns=df_test.columns)
                filename = os.path.join('%s/results/' % data_tag,
                                        'dcrnn_speed_prediction_%s.h5' % str(horizon_i + 1))
                df.to_hdf(filename, 'results')

            print('Predictions saved as %s/results/dcrnn_speed_prediction_[1-12].h5...'
                  % data_tag)
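# generate_seq2seq_data is not included in this collection. A sketch of the
# shape contract implied by main() above: y_test has shape
# (n_samples, horizon, num_nodes, 1), and sample t's horizon-i target comes
# from row t + seq_len + horizon_i of the source frame. The real helper
# presumably also trims samples to a multiple of batch_size; that detail is
# omitted here, so batch_size goes unused in this sketch.
def generate_seq2seq_data(df, batch_size, seq_len, horizon, num_nodes, mode, scaler):
    data = scaler.transform(df.values)  # (n_timesteps, num_nodes)
    xs, ys = [], []
    for t in range(data.shape[0] - seq_len - horizon + 1):
        xs.append(data[t:t + seq_len])
        ys.append(data[t + seq_len:t + seq_len + horizon])
    x = np.stack(xs)[..., np.newaxis]  # (n_samples, seq_len, num_nodes, 1)
    y = np.stack(ys)[..., np.newaxis]  # (n_samples, horizon, num_nodes, 1)
    return {'x_' + mode: x, 'y_' + mode: y}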