Пример #1
0
def load_and_arrange_data(conf, verbose=False):
    """
    Build the train and validation arrays for one cross-validation fold.

    The series ids found in the train set are divided into a train part and
    a validation part: with ``stratified_cv_series_id`` when ``conf`` has a
    ``'random_seed'`` entry, with ``split_series_id`` otherwise. The rows of
    the test set are appended to the train part before the features are
    prepared.

    Entries read from ``conf``: ``'fold_idx'``, ``'input_days'``,
    ``'window'``, ``'only_working_day'``, ``'inputs'`` and, optionally,
    ``'random_seed'``. ``verbose`` is forwarded to
    ``prepare_data_for_train``.

    Returns the tuple ``(train_x, train_y, val_x, val_y)``; each ``*_x`` is
    the concatenation along axis 2 of the prepared inputs listed in
    ``conf['inputs']``.
    """
    train, test, _, metadata = load_data()

    if 'random_seed' not in conf:
        train_ids, val_ids = split_series_id(train.series_id.unique(),
                                             fold_idx=conf['fold_idx'])
    else:
        train_ids, val_ids = stratified_cv_series_id(
            train.series_id.unique(),
            fold_idx=conf['fold_idx'],
            random_seed=conf['random_seed'])

    # Validation rows must be selected before ``train`` is narrowed down
    val = train[train.series_id.isin(val_ids)].reset_index(drop=True)
    train = train[train.series_id.isin(train_ids)].reset_index(drop=True)

    # The test rows are used for training together with the fold's train part
    train = pd.concat([train, test]).reset_index(drop=True)

    def _prepare(frame):
        return prepare_data_for_train(frame,
                                      metadata,
                                      conf['input_days'],
                                      conf['window'],
                                      conf['only_working_day'],
                                      verbose=verbose)

    def _stack_inputs(features):
        selected = [features[name] for name in conf['inputs']]
        return np.concatenate(selected, axis=2)

    train_features, train_y = _prepare(train)
    val_features, val_y = _prepare(val)

    train_x = _stack_inputs(train_features)
    val_x = _stack_inputs(val_features)
    return train_x, train_y, val_x, val_y
Пример #2
0
def load_and_arrange_data(conf, verbose=False):
    """
    Prepare the train and validation data of one cross-validation fold.

    The series ids found in the train set are divided into a train part and
    a validation part: with ``stratified_cv_series_id`` when ``conf`` has a
    ``'random_seed'`` entry, with ``split_series_id`` otherwise. The rows of
    the test set are appended to the train part before the features are
    prepared.

    Entries read from ``conf``: ``'fold_idx'``, ``'input_days'``,
    ``'window'``, ``'past_features'``, ``'future_features'``,
    ``'cluster_features'`` and, optionally, ``'random_seed'``. ``verbose`` is
    forwarded to ``prepare_data_for_train``.

    Returns the tuple ``(train_x, train_y, val_x, val_y)``. Each ``*_x`` is a
    dict with the keys ``'past_features'``, ``'future_features'`` (both
    concatenated along axis 2) and ``'cluster_features'`` (concatenated
    along axis 1).
    """
    train, test, _, _ = load_data()

    if 'random_seed' not in conf:
        train_ids, val_ids = split_series_id(train.series_id.unique(),
                                             fold_idx=conf['fold_idx'])
    else:
        print('Using random seed')
        train_ids, val_ids = stratified_cv_series_id(
            train.series_id.unique(),
            fold_idx=conf['fold_idx'],
            random_seed=conf['random_seed'])

    # Validation rows must be selected before ``train`` is narrowed down
    val = train[train.series_id.isin(val_ids)].reset_index(drop=True)
    train = train[train.series_id.isin(train_ids)].reset_index(drop=True)

    # The test rows are used for training together with the fold's train part
    train = pd.concat([train, test]).reset_index(drop=True)

    prepared_train, train_y = prepare_data_for_train(
        train, conf['input_days'], conf['window'], verbose=verbose)
    prepared_val, val_y = prepare_data_for_train(
        val, conf['input_days'], conf['window'], verbose=verbose)

    def _stack(prepared, group, axis, key_format=None):
        # Prepared inputs are looked up either by the raw feature name or by
        # the name with a prefix applied through ``key_format``
        arrays = [
            prepared[name if key_format is None else key_format % name]
            for name in conf[group]
        ]
        return np.concatenate(arrays, axis=axis)

    train_x, val_x = {}, {}
    groups = (
        ('past_features', 2, 'past_%s'),
        ('future_features', 2, 'future_%s'),
        ('cluster_features', 1, None),
    )
    for group, axis, key_format in groups:
        train_x[group] = _stack(prepared_train, group, axis, key_format)
        val_x[group] = _stack(prepared_val, group, axis, key_format)

    return train_x, train_y, val_x, val_y
def _load_and_arrange_data(conf):
    """
    Prepare the train and validation data of one cross-validation fold.

    The series ids found in the train set are divided into a train part and
    a validation part: with ``stratified_cv_series_id`` when ``conf`` has a
    ``'random_seed'`` entry, with ``split_series_id`` otherwise. The rows of
    the test set are appended to the train part before the features are
    prepared.

    Entries read from ``conf``: ``'fold_idx'``, ``'input_days'``,
    ``'window'``, ``'remove_inputs'`` and, optionally, ``'random_seed'``.

    Returns the tuple ``(train_x, train_y, val_x, val_y)``; each ``*_x`` is a
    dict holding the prepared inputs whose key is not in
    ``conf['remove_inputs']``.
    """
    train, test, _, metadata = load_data()

    if 'random_seed' not in conf:
        train_ids, val_ids = split_series_id(
            train.series_id.unique(), fold_idx=conf['fold_idx'])
    else:
        train_ids, val_ids = stratified_cv_series_id(
            train.series_id.unique(),
            fold_idx=conf['fold_idx'],
            random_seed=conf['random_seed'])

    # Validation rows must be selected before ``train`` is narrowed down
    val = train[train.series_id.isin(val_ids)].reset_index(drop=True)
    train = train[train.series_id.isin(train_ids)].reset_index(drop=True)

    # The test rows are used for training together with the fold's train part
    train = pd.concat([train, test]).reset_index(drop=True)

    def _prepare(frame):
        return prepare_data_for_train(
            frame, metadata, conf['input_days'], conf['window'], verbose=False)

    def _keep_wanted(inputs):
        return {
            name: value
            for name, value in inputs.items()
            if name not in conf['remove_inputs']
        }

    all_train_x, train_y = _prepare(train)
    all_val_x, val_y = _prepare(val)

    train_x = _keep_wanted(all_train_x)
    val_x = _keep_wanted(all_val_x)
    return train_x, train_y, val_x, val_y
Пример #4
0
def test_split_series_id(series_id, val_id, train_id, fold_idx):
    """
    Check the output of ``split_series_id`` for one fold.

    The first element of the result is compared with ``train_id`` and the
    second one with ``val_id``.
    """
    split = split_series_id(series_id, fold_idx)
    for position, expected in ((0, train_id), (1, val_id)):
        assert split[position] == expected