def load_and_arrange_data(conf, verbose=False):
    """Load the dataset and build train/validation arrays for one CV fold.

    Parameters
    ----------
    conf : dict
        Must contain 'fold_idx', 'input_days', 'window', 'only_working_day'
        and 'inputs' (ordered feature keys to stack). When 'random_seed' is
        present, a stratified series-id split is used instead of the plain
        split.
    verbose : bool
        Forwarded to ``prepare_data_for_train``.

    Returns
    -------
    train_x, train_y, val_x, val_y
        The x arrays are the selected input features concatenated along the
        last (feature) axis.
    """
    train, test, _, metadata = load_data()
    # Presence of 'random_seed' in the conf selects the stratified split.
    if 'random_seed' in conf:
        train_ids, val_ids = stratified_cv_series_id(
            train.series_id.unique(), fold_idx=conf['fold_idx'],
            random_seed=conf['random_seed'])
    else:
        train_ids, val_ids = split_series_id(train.series_id.unique(),
                                             fold_idx=conf['fold_idx'])
    # Chained .reset_index(drop=True) on the mask result (instead of an
    # in-place reset on a view) avoids pandas' SettingWithCopyWarning while
    # producing the same frame.
    val = train[train.series_id.isin(val_ids)].reset_index(drop=True)
    train = train[train.series_id.isin(train_ids)]
    # The test rows are appended to the training data; ignore_index gives
    # the same clean RangeIndex the original concat + reset_index produced.
    train = pd.concat([train, test], ignore_index=True)
    train_x, train_y = prepare_data_for_train(
        train, metadata, conf['input_days'], conf['window'],
        conf['only_working_day'], verbose=verbose)
    val_x, val_y = prepare_data_for_train(
        val, metadata, conf['input_days'], conf['window'],
        conf['only_working_day'], verbose=verbose)
    # Stack only the configured inputs, in conf order, along the feature axis.
    train_x = np.concatenate([train_x[key] for key in conf['inputs']], axis=2)
    val_x = np.concatenate([val_x[key] for key in conf['inputs']], axis=2)
    return train_x, train_y, val_x, val_y
def load_and_arrange_data(conf, verbose=False):
    """Prepare past/future/cluster feature dicts for cross-validation training.

    Parameters
    ----------
    conf : dict
        Must contain 'fold_idx', 'input_days', 'window', and the feature
        lists 'past_features', 'future_features' and 'cluster_features'.
        When 'random_seed' is present, a stratified series-id split is used.
    verbose : bool
        Forwarded to ``prepare_data_for_train``.

    Returns
    -------
    train_x, train_y, val_x, val_y
        train_x/val_x are dicts with 'past_features', 'future_features'
        (stacked on axis 2) and 'cluster_features' (stacked on axis 1).
    """
    train, test, _, _ = load_data()
    if 'random_seed' in conf:
        print('Using random seed')
        train_ids, val_ids = stratified_cv_series_id(
            train.series_id.unique(), fold_idx=conf['fold_idx'],
            random_seed=conf['random_seed'])
    else:
        train_ids, val_ids = split_series_id(train.series_id.unique(),
                                             fold_idx=conf['fold_idx'])
    # Chained .reset_index(drop=True) avoids the SettingWithCopyWarning an
    # in-place reset on a boolean-mask view can trigger.
    val = train[train.series_id.isin(val_ids)].reset_index(drop=True)
    train = train[train.series_id.isin(train_ids)]
    # Append the test rows to the training data with a fresh RangeIndex.
    train = pd.concat([train, test], ignore_index=True)
    _train_x, train_y = prepare_data_for_train(
        train, conf['input_days'], conf['window'], verbose=verbose)
    _val_x, val_y = prepare_data_for_train(
        val, conf['input_days'], conf['window'], verbose=verbose)
    train_x, val_x = {}, {}
    # past/future features are stacked on the per-timestep feature axis (2);
    # cluster features are per-series and stack on axis 1. One loop replaces
    # the three duplicated concatenation stanzas.
    for group, template, axis in (('past_features', 'past_%s', 2),
                                  ('future_features', 'future_%s', 2),
                                  ('cluster_features', '%s', 1)):
        train_x[group] = np.concatenate(
            [_train_x[template % key] for key in conf[group]], axis=axis)
        val_x[group] = np.concatenate(
            [_val_x[template % key] for key in conf[group]], axis=axis)
    return train_x, train_y, val_x, val_y
def _load_and_arrange_data(conf):
    """Prepare train/validation feature dicts, dropping unwanted inputs.

    Parameters
    ----------
    conf : dict
        Must contain 'fold_idx', 'input_days', 'window' and
        'remove_inputs' (keys to drop from the feature dicts). When
        'random_seed' is present, a stratified series-id split is used.

    Returns
    -------
    train_x, train_y, val_x, val_y
        train_x/val_x are dicts of feature arrays with the keys listed in
        conf['remove_inputs'] filtered out.
    """
    train, test, _, metadata = load_data()
    if 'random_seed' in conf:
        train_ids, val_ids = stratified_cv_series_id(
            train.series_id.unique(), fold_idx=conf['fold_idx'],
            random_seed=conf['random_seed'])
    else:
        train_ids, val_ids = split_series_id(train.series_id.unique(),
                                             fold_idx=conf['fold_idx'])
    # Chained .reset_index(drop=True) avoids the SettingWithCopyWarning an
    # in-place reset on a boolean-mask view can trigger.
    val = train[train.series_id.isin(val_ids)].reset_index(drop=True)
    train = train[train.series_id.isin(train_ids)]
    # Append the test rows to the training data with a fresh RangeIndex.
    train = pd.concat([train, test], ignore_index=True)
    train_x, train_y = prepare_data_for_train(
        train, metadata, conf['input_days'], conf['window'], verbose=False)
    val_x, val_y = prepare_data_for_train(
        val, metadata, conf['input_days'], conf['window'], verbose=False)
    # Hoist the removal list into a set: O(1) membership per key instead of
    # scanning the list for every feature.
    removed = set(conf['remove_inputs'])
    train_x = {key: value for key, value in train_x.items()
               if key not in removed}
    val_x = {key: value for key, value in val_x.items()
             if key not in removed}
    return train_x, train_y, val_x, val_y
def test_split_series_id(series_id, val_id, train_id, fold_idx):
    """Verify split_series_id yields the expected (train, val) id pair."""
    got_train, got_val = split_series_id(series_id, fold_idx)
    assert got_train == train_id
    assert got_val == val_id