def get_dset(path_to_data, args=ARGS):

    print('-' * 40)
    print("READING DATASET")

    # Get dataset
    dset_df = get_storms_df(path_to_data)
    print(f"Dataset shape: {dset_df.shape})")
    print(f"Training: {dset_df.train.sum()}| Testing: {dset_df.test.sum()}")

    return dset_df
Example #2
def get_dset(path_to_data, args=ARGS):

    print('-' * 40)
    print("READING DATASET")

    # Get dataset
    dset_df = get_storms_df(path_to_data)
    print(f"Dataset shape: {dset_df.shape})")
    print(f"Training: {dset_df.train.sum()}| Testing: {dset_df.test.sum()}")

    # Point image paths at the preprocessed images, if requested
    if args.preprocessed is not None:
        print(f"USING PREPROCESSED DATA: {args.preprocessed}")
        dset_df['image_path'] = dset_df.apply(
            lambda irow: os.path.join(path_to_data, args.preprocessed,
                                      irow.image_id + ".jpg"),
            axis=1)

    return dset_df
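
Note: `get_storms_df` is defined elsewhere in the project and is not part of these snippets. Judging from how it is used across the examples, it returns one row per image with at least `image_id`, `image_path`, boolean `train`/`test` flags, `storm_id`, `storm_duration`, `ocean` and `wind_speed` columns; treat that schema as an inference rather than a documented contract.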
Example #3
def main(model_id, debug, only_test):
    MODEL_ID = 'L1A_' + model_id if not debug else 'debug_L1A_' + model_id
    print('-' * 50)
    print(f'- Concatenating L1 predictions for {MODEL_ID}')
    print('-' * 50)

    # Dataset
    dset_df = get_storms_df(os.path.join(PATH_TO_ROOT, DATA_DIR))
    dset_df.set_index('image_id', inplace=True, drop=True)
    dset_df = exe.load_kfolds(
        dset_df, os.path.join(PATH_TO_ROOT, DATA_DIR, KFOLDS_FILE))
    dset_df.set_index('image_id', inplace=True, drop=True)

    if not only_test:
        # Concatenate predictions
        files = [
            i for i in os.listdir(os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH))
            if i.startswith(MODEL_ID + "_fold_")
        ]
        train_df = []
        for file in files:
            tmp_df = pd.read_csv(os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH,
                                              file),
                                 index_col=0)
            train_df.append(tmp_df.copy())
            # Evaluate
            y_pred = tmp_df.wind_speed.round().astype(int).values
            y_true = dset_df.loc[tmp_df.index,
                                 'wind_speed'].values.astype(np.float32)
            print(
                f"File: {file} | RMSE: {math.sqrt(mean_squared_error(y_true, y_pred)):.4}"
            )
        train_df = pd.concat(train_df, axis=0)
        train_df = train_df.loc[dset_df.index[dset_df.train]]

        # Evaluate
        y_pred = train_df.wind_speed.round().astype(int).values
        y_true = dset_df.loc[train_df.index,
                             'wind_speed'].values.astype(np.float32)
        print(
            f"Model: {MODEL_ID} | RMSE: {math.sqrt(mean_squared_error(y_true, y_pred)):.4}"
        )

        # Save folds predictions
        filepath = os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH,
                                MODEL_ID + "_folds.csv.gz")
        print(f"Saving FOLDS predictions: {filepath}")
        train_df.to_csv(filepath, index=True)

    # Concatenate predictions
    files = [
        i for i in os.listdir(os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH))
        if i.startswith(MODEL_ID + "_test")
    ]
    test_df = []
    for file in files:
        tmp_df = pd.read_csv(os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH,
                                          file),
                             index_col=0)
        test_df.append(tmp_df.copy())
    test_df = pd.concat(test_df, axis=0)
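    # Average the per-fold test predictions for each image_id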
    test_df = test_df.groupby('image_id').mean()

    # Save test predictions
    filepath = os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH,
                            MODEL_ID + "_test.csv.gz")
    print(f"Saving TEST predictions: {filepath}")
    test_df.to_csv(filepath, index=True)

    # Save submission
    filepath = os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH,
                            MODEL_ID + "_submission.csv")
    sub_df = test_df.copy()
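    # The submission format expects integer wind speeds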
    sub_df = sub_df.round().astype(int)
    print(f"Saving SUBMISSION predictions: {filepath}")
    sub_df.to_csv(filepath, index=True)

    print('')
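
The module header for Example #3 is not part of the scrape; judging from the names used, it needs roughly the imports below. The PATH_TO_ROOT, DATA_DIR, KFOLDS_FILE and PREDICTIONS_PATH constants plus the `exe` helper module and `get_storms_df` are project-specific and assumed to be defined alongside them.

import math
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error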
Example #4
def main(data_folder, avg, num_iterations, predictions_folder):

    # Debug override: uncomment to run with fixed settings instead of the CLI arguments
    # data_folder, avg, num_iterations, predictions_folder = 'data', False, 10000, 'predictions'

    MODEL_TYPE = 'L2A'
    TRAINING_VERSION = ''
    OUTPUT_NAME = f"{MODEL_TYPE}_{MODEL_VERSION}{TRAINING_VERSION}"
    OUTPUT_PATH = predictions_folder

    models_list = MODELS_LIST.copy()
    print('Models: ', models_list)
    print('MAIN Models: ', MAIN_PRED_LIST)

    # Load L1 predictions
    train_L1_pred_df, test_L1_pred_df = load_data(
        data_folder,
        models_list,
        avg=avg,
        predictions_folder=predictions_folder)

    # Load MAIN predictions. These models' predictions will be averaged
    if len(MAIN_PRED_LIST) > 0:
        train_MAIN_pred_df, test_MAIN_pred_df = load_data(
            data_folder,
            MAIN_PRED_LIST,
            avg=True,
            predictions_folder=predictions_folder)
        train_MAIN_pred_df.columns = [
            'MAIN_' + str(s1) for s1 in train_MAIN_pred_df.columns
        ]
        test_MAIN_pred_df.columns = [
            'MAIN_' + str(s1) for s1 in test_MAIN_pred_df.columns
        ]

    # Load main dataset
    dset_df = get_storms_df(data_folder)
    dset_df.set_index('image_id', inplace=True, drop=True)
    # Create storms dataset
    storm_df = dset_df[['storm_id', 'storm_duration']].copy()
    storm_df['image_id'] = storm_df.index
    storm_df.set_index(['storm_id', 'storm_duration'], inplace=True, drop=True)

    # Read prev_data_df: dataframe with the image_id of the previous data for each image
    prev_data_parameters = {
        # Gaps to the previous data, in hours
        'gaps': [1, 2, 4, 6, 8, 10, 12, 16, 20, 24],
        # If previous data doesn't exist, search within +/- margin, in hours
        'margin': 0.5
    }
    prev_data_df = get_prev_data_df(data_folder, dset_df, prev_data_parameters)
    prev_data_df.set_index('image_id', inplace=True, drop=True)

    # Load FOLDS
    folds = pd.read_csv(f"{data_folder}/4Kfolds_202012220741.csv")
    folds.set_index('image_id', drop=True, inplace=True)
    folds = folds.loc[train_L1_pred_df.index]
    folds.sort_index(inplace=True)

    ## Build features dataframe
    # Add previous data
    train_features_df, test_features_df = \
        add_prev_data(prev_data_df, train_L1_pred_df, test_L1_pred_df, verbose=True)
    if len(MAIN_PRED_LIST) > 0:
        # Get MAIN features
        train_main_feat_df, test_main_feat_df = add_prev_data(
            prev_data_df, train_MAIN_pred_df, test_MAIN_pred_df)
        # Add MAIN for train set
        new_train_data = [train_features_df, train_main_feat_df]
        train_features_df = pd.concat(new_train_data, axis=1)
        # Add MAIN for test set
        new_test_data = [test_features_df, test_main_feat_df]
        test_features_df = pd.concat(new_test_data, axis=1)

    train_features_df.sort_index(inplace=True)
    test_features_df.sort_index(inplace=True)

    # Add extra features
    def add_extra_features(features_df):
        features_df['ocean'] = dset_df.loc[features_df.index, 'ocean']
        features_df['storm_duration'] = dset_df.loc[features_df.index,
                                                    'storm_duration']
        return features_df

    train_features_df = add_extra_features(train_features_df)
    test_features_df = add_extra_features(test_features_df)

    # Final columns (the empty list is a placeholder for columns to exclude; none are dropped)
    feature_columns = [s1 for s1 in train_features_df.columns if s1 not in []]

    # Train the model
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'rmse',
        'metric': {'rmse'},
        'num_threads': 8,
        'num_iterations': num_iterations,
        # Core parameters
        'learning_rate': 0.005,
        # Learning control parameters
        'max_depth': 5,
        'feature_fraction': 0.8,  # colsample_bytree
        'bagging_freq': 1,
        'bagging_fraction': 0.6,  # subsample
        'num_leaves': 16,
        'min_data_in_leaf': 15,
        'verbosity': -1,
    }
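
    # Note: num_iterations only caps training; each fold stops early via
    # early_stopping_rounds=50 in lgb.train below, and predictions use best_iteration.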

    all_valid_y = []
    all_valid_field_ids = []
    all_predicts = []
    all_valid_predicts = []
    best_iterations = []
    fe = []
    for fold in sorted(folds.fold.unique()):
        # print(f'Fold - {fold}')
        train_field_ids_i = folds[folds.fold != fold].index.values
        val_field_ids_i = folds[(folds.fold == fold)
                                & folds.for_validation].index.values

        train_data = train_features_df[feature_columns].astype(np.float32)

        X_train, X_valid = train_data.loc[train_field_ids_i, :].values, \
                           train_data.loc[val_field_ids_i, :].values
        y_train, y_valid = dset_df.loc[train_field_ids_i, 'wind_speed'].values, \
                           dset_df.loc[val_field_ids_i, 'wind_speed'].values

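        # Wrap this fold's train/validation splits as LightGBM datasets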
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid)
        model_lgb = lgb.train(params,
                              train_data,
                              valid_sets=[valid_data],
                              early_stopping_rounds=50,
                              verbose_eval=False)
        best_iterations.append(model_lgb.best_iteration)
        print(
            f"Fold - {fold} - {model_lgb.best_score['valid_0']['rmse']:.4}@{model_lgb.best_iteration}"
        )

        # Save model
        #filepath = os.path.join(MODELS_PATH, OUTPUT_NAME + f"_model_{fold}.gbm")
        #model_lgb.save_model(filepath, model_lgb.best_iteration)

        # As a little trick, at prediction time, substitute the actual known values
        # for the predicted past values.

        # Update validation L1 with actual known wind_speed values
        VALID_train_L1_pred_df = train_L1_pred_df.copy()
        for column in VALID_train_L1_pred_df.columns:
            VALID_train_L1_pred_df.loc[train_field_ids_i,
                                       column] = dset_df.loc[train_field_ids_i,
                                                             'wind_speed']

        # Update validation MAIN with actual known wind_speed values
        if len(MAIN_PRED_LIST) > 0:
            VALID_train_MAIN_pred_df = train_MAIN_pred_df.copy()
            VALID_train_MAIN_pred_df.loc[train_field_ids_i,
                                         'MAIN_wind_speed'] = dset_df.loc[
                                             train_field_ids_i, 'wind_speed']

        ## Build features dataframe
        # Add previous data
        VALID_train_features_df, INFER_test_features_df = \
            add_prev_data(prev_data_df, VALID_train_L1_pred_df, test_L1_pred_df, verbose=True)

        # Add MAIN features for the train and test sets
        if len(MAIN_PRED_LIST) > 0:
            train_main_feat_df, test_main_feat_df = add_prev_data(
                prev_data_df, VALID_train_MAIN_pred_df, test_MAIN_pred_df)
            VALID_train_features_df = pd.concat(
                [VALID_train_features_df, train_main_feat_df], axis=1)
            INFER_test_features_df = pd.concat(
                [INFER_test_features_df, test_main_feat_df], axis=1)
        VALID_train_features_df.sort_index(inplace=True)
        INFER_test_features_df.sort_index(inplace=True)

        # Add extra features
        VALID_train_features_df = add_extra_features(VALID_train_features_df)
        INFER_test_features_df = add_extra_features(INFER_test_features_df)

        # Select columns
        VALID_train_features_df = VALID_train_features_df[
            feature_columns].astype(np.float32)
        INFER_test_features_df = INFER_test_features_df[
            feature_columns].astype(np.float32)

        # Make predictions for VALID data set
        y_valid_pred = model_lgb.predict(
            VALID_train_features_df.loc[val_field_ids_i],
            num_iteration=model_lgb.best_iteration)

        # Make predictions for TEST data set
        ypred = model_lgb.predict(INFER_test_features_df,
                                  num_iteration=model_lgb.best_iteration)

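        # Accumulate this fold's validation ids/targets/predictions and its test predictions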
        all_valid_field_ids.extend(val_field_ids_i)
        all_predicts.append(ypred)
        all_valid_y.extend(y_valid)
        all_valid_predicts.append(y_valid_pred)

        fe.append(model_lgb.feature_importance())
    all_valid_predicts = np.concatenate(all_valid_predicts)
    all_valid_y = np.array(all_valid_y)
    print(f'Validation RMSE - {rmse(all_valid_y, all_valid_predicts):.4f}')

    # Save folds predictions
    valid_preds_df = pd.DataFrame(all_valid_predicts,
                                  index=all_valid_field_ids,
                                  columns=['wind_speed'])
    valid_preds_df.index.name = 'image_id'
    filepath = os.path.join(OUTPUT_PATH, OUTPUT_NAME + '_folds.csv.gz')
    print(f"Saving VALIDATION predictions: {filepath}")
    valid_preds_df.to_csv(filepath, index=True)

    # Save test predictions (mean of the per-fold predictions)
    result = np.mean(all_predicts, axis=0)
    test_preds_df = pd.DataFrame(result,
                                 index=test_features_df.index,
                                 columns=['wind_speed'])
    test_preds_df.index.name = 'image_id'
    filepath = os.path.join(OUTPUT_PATH, OUTPUT_NAME + '_test.csv.gz')
    print(f"Saving TEST predictions: {filepath}")
    test_preds_df.to_csv(filepath, index=True)

    # Save submission
    filepath = os.path.join(OUTPUT_PATH, OUTPUT_NAME + "_submission.csv")
    sub_df = test_preds_df.copy()
    sub_df = sub_df.round().astype(int)
    print(f"Saving SUBMISSION predictions: {filepath}")
    sub_df.to_csv(filepath, index=True)
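
Example #4 additionally needs `import lightgbm as lgb` plus the module-level MODEL_VERSION, MODELS_LIST and MAIN_PRED_LIST constants and the load_data, get_prev_data_df and add_prev_data helpers, none of which are in the scrape. It also calls an `rmse` helper that is never defined here; a one-line definition consistent with how it is called (an assumption, since the project's own helper may differ) is:

def rmse(y_true, y_pred):
    # Root-mean-squared error, matching the sklearn-based RMSE prints in Example #3
    return float(np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2)))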