def create_submission(model_dir):
    """
    Creates the submission file for the model saved on model_dir

    The file is written to
    DATASET_PATH/submissions/seq2seq/<basename of model_dir>.csv
    If that file already exists the function returns without doing anything.

    Only the series that have 'hourly' as prediction_window are predicted, the
    rest of the rows keep the consumption values of the loaded submission.
    """
    submission_path = os.path.join(
        DATASET_PATH, 'submissions', 'seq2seq', '%s.csv' % os.path.basename(model_dir))
    if os.path.exists(submission_path):
        return
    meta_model = _load_meta_model(model_dir)

    _, test, submission, metadata = load_data()

    test_preds = {}
    hourly_series_id = submission[submission.prediction_window == 'hourly'].series_id.unique()
    for series_id in hourly_series_id:
        window = submission[submission.series_id == series_id].prediction_window.values[0]
        # reset_index without drop, the previous index is kept as a column
        df = test[test.series_id == series_id].reset_index()
        if len(df) > 7*24:
            # Keep only the last 7*24 rows of the series
            df = df.loc[len(df)-7*24:]
        x, mean_value, next_day = prepare_x(window, df, metadata, series_id)
        # The output of the model is multiplied by the mean_value given by prepare_x
        pred = meta_model.predict(x, window, next_day)*mean_value
        test_preds[series_id] = pred[0, :, 0]

    new_consumption = submission.consumption.values.copy()
    row_series_id = submission.series_id.values
    for series_id, pred in test_preds.items():
        new_consumption[row_series_id == series_id] = pred

    new_submission = submission.copy()
    new_submission['consumption'] = new_consumption

    # to_csv does not create missing folders, without this all the predictions
    # would be lost on the first run because the write would fail
    output_dir = os.path.dirname(submission_path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    new_submission.to_csv(submission_path, index=False)
    print(os.path.basename(submission_path))
Exemplo n.º 2
0
def load_and_arrange_data(conf, verbose=False):
    """
    Loads the data and arranges it for training on one fold

    The series ids of train are divided into train and validation using
    stratified_cv_series_id if conf has 'random_seed' and split_series_id
    otherwise. The test data is appended to the train partition.

    Returns train_x, train_y, val_x, val_y. Both x are the concatenation over
    axis 2 of the inputs listed on conf['inputs']
    """
    train, test, _, metadata = load_data()

    all_ids = train.series_id.unique()
    if 'random_seed' in conf:
        train_ids, val_ids = stratified_cv_series_id(
            all_ids, fold_idx=conf['fold_idx'], random_seed=conf['random_seed'])
    else:
        train_ids, val_ids = split_series_id(all_ids, fold_idx=conf['fold_idx'])

    val = train[train.series_id.isin(val_ids)].reset_index(drop=True)
    # Test data is also used for training
    train = pd.concat([train[train.series_id.isin(train_ids)], test],
                      ignore_index=True)

    def _prepare(df):
        return prepare_data_for_train(df,
                                      metadata,
                                      conf['input_days'],
                                      conf['window'],
                                      conf['only_working_day'],
                                      verbose=verbose)

    def _join_inputs(x):
        return np.concatenate([x[key] for key in conf['inputs']], axis=2)

    train_x, train_y = _prepare(train)
    val_x, val_y = _prepare(val)
    return _join_inputs(train_x), train_y, _join_inputs(val_x), val_y
def main():
    """
    Adds the is_off and is_holiday columns to train and test and saves them

    The add_* functions are called without using a return value, so presumably
    they modify the dataframe in place (verify against their definition).
    The results are saved on TRAIN_PATH and TEST_PATH without the index.
    """
    print(colored('\tPreparing data', 'blue'))
    train, test, _, metadata = load_data()
    frames = (train, test)

    for frame in frames:
        add_is_off_column(frame, metadata)
    for frame in frames:
        add_is_holiday_column(frame)

    for frame, output_path in zip(frames, (TRAIN_PATH, TEST_PATH)):
        frame.to_csv(output_path, index=False)
def main():
    """
    Finds the clusters of train and test and saves them as json

    For each dataset the clusters returned by get_clusters_with_time_continuity
    are converted with cluster_to_int and dumped to TRAIN_CLUSTERS_PATH and
    TEST_CLUSTERS_PATH respectively.
    """
    print(colored('\tFinding clusters', 'blue'))
    train, test, _, metadata = load_data()

    jobs = ((train, TRAIN_CLUSTERS_PATH), (test, TEST_CLUSTERS_PATH))
    for data, output_path in jobs:
        clusters = list(map(cluster_to_int,
                            get_clusters_with_time_continuity(data, metadata)))
        with open(output_path, 'w') as f:
            json.dump(clusters, f)
Exemplo n.º 5
0
def load_and_arrange_data(conf, verbose=False):
    """
    Prepares the data for training using cross-validation

    The series ids of train are divided into train and validation using
    stratified_cv_series_id if conf has 'random_seed' and split_series_id
    otherwise. The test data is appended to the train partition.

    Returns train_x, train_y, val_x, val_y. Both x are dictionaries with the
    keys 'past_features', 'future_features' and 'cluster_features'
    """
    train, test, _, metadata = load_data()

    all_ids = train.series_id.unique()
    if 'random_seed' in conf:
        print('Using random seed')
        train_ids, val_ids = stratified_cv_series_id(
            all_ids, fold_idx=conf['fold_idx'], random_seed=conf['random_seed'])
    else:
        train_ids, val_ids = split_series_id(all_ids, fold_idx=conf['fold_idx'])

    val = train[train.series_id.isin(val_ids)].reset_index(drop=True)
    # Test data is also used for training
    train = pd.concat([train[train.series_id.isin(train_ids)], test],
                      ignore_index=True)

    raw_train_x, train_y = prepare_data_for_train(
        train, conf['input_days'], conf['window'], verbose=verbose)
    raw_val_x, val_y = prepare_data_for_train(
        val, conf['input_days'], conf['window'], verbose=verbose)

    # (output key, keys to read from the prepared data, concatenation axis)
    feature_groups = (
        ('past_features',
         ['past_%s' % key for key in conf['past_features']], 2),
        ('future_features',
         ['future_%s' % key for key in conf['future_features']], 2),
        ('cluster_features',
         list(conf['cluster_features']), 1),
    )

    train_x, val_x = {}, {}
    for name, keys, axis in feature_groups:
        train_x[name] = np.concatenate([raw_train_x[key] for key in keys], axis=axis)
        val_x[name] = np.concatenate([raw_val_x[key] for key in keys], axis=axis)

    return train_x, train_y, val_x, val_y
def _load_and_arrange_data(conf):
    """
    Loads the data and arranges it for training on one fold

    The series ids of train are divided into train and validation using
    stratified_cv_series_id if conf has 'random_seed' and split_series_id
    otherwise. The test data is appended to the train partition.

    Returns train_x, train_y, val_x, val_y. Both x are dictionaries without
    the keys listed on conf['remove_inputs']
    """
    train, test, _, metadata = load_data()

    all_ids = train.series_id.unique()
    if 'random_seed' in conf:
        train_ids, val_ids = stratified_cv_series_id(
            all_ids, fold_idx=conf['fold_idx'], random_seed=conf['random_seed'])
    else:
        train_ids, val_ids = split_series_id(all_ids, fold_idx=conf['fold_idx'])

    val = train[train.series_id.isin(val_ids)].reset_index(drop=True)
    # Test data is also used for training
    train = pd.concat([train[train.series_id.isin(train_ids)], test],
                      ignore_index=True)

    def _keep_allowed_inputs(inputs):
        return {name: inputs[name] for name in inputs
                if name not in conf['remove_inputs']}

    train_x, train_y = prepare_data_for_train(
        train, metadata, conf['input_days'], conf['window'], verbose=False)
    val_x, val_y = prepare_data_for_train(
        val, metadata, conf['input_days'], conf['window'], verbose=False)
    return _keep_allowed_inputs(train_x), train_y, _keep_allowed_inputs(val_x), val_y
def all_series_ids():
    """
    Returns a list with the unique series ids of train followed by the unique
    series ids of test
    """
    train, test, _, _ = load_data()
    series_ids = []
    for frame in (train, test):
        series_ids.extend(frame.series_id.unique().tolist())
    return series_ids