import math
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Module-level constants (ARGS, PATH_TO_ROOT, DATA_DIR, KFOLDS_FILE,
# PREDICTIONS_PATH, MODELS_LIST, MAIN_PRED_LIST, MODEL_VERSION) and project
# helpers (load_data, add_prev_data, get_prev_data_df, exe.load_kfolds) are
# defined elsewhere in the repository.


def get_dset(path_to_data, args=ARGS):
    print('-' * 40)
    print("READING DATASET")
    # Get dataset
    dset_df = get_storms_df(path_to_data)
    print(f"Dataset shape: {dset_df.shape}")
    print(f"Training: {dset_df.train.sum()} | Testing: {dset_df.test.sum()}")
    # Point image paths at a preprocessed copy of the images, if requested
    if args.preprocessed is not None:
        print(f"USING PREPROCESSED DATA: {args.preprocessed}")
        dset_df['image_path'] = dset_df.apply(
            lambda irow: os.path.join(path_to_data, args.preprocessed,
                                      irow.image_id + ".jpg"),
            axis=1)
    return dset_df
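# `get_storms_df` and `rmse` are used throughout this file but defined
# elsewhere in the project. The sketches below only illustrate the interfaces
# this code assumes; they are hypothetical stand-ins, not the project's actual
# implementations ("storms.csv" in particular is a placeholder filename).


def rmse(y_true, y_pred):
    # Root mean squared error, matching the explicit
    # math.sqrt(mean_squared_error(...)) evaluation printouts in this file.
    return math.sqrt(mean_squared_error(y_true, y_pred))


def get_storms_df(path_to_data):
    # Sketch: assumed to return one row per satellite image with at least the
    # columns this file uses: image_id, storm_id, wind_speed, ocean,
    # storm_duration, image_path, and boolean train/test flags.
    df = pd.read_csv(os.path.join(path_to_data, "storms.csv"))
    df['train'] = df.wind_speed.notnull()
    df['test'] = ~df.train
    df['image_path'] = df.image_id.map(
        lambda image_id: os.path.join(path_to_data, "images",
                                      image_id + ".jpg"))
    return df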
def main(model_id, debug, only_test):
    MODEL_ID = ('L1A_' if not debug else 'debug_L1A_') + model_id
    print('-' * 50)
    print(f'- Concatenating L1 predictions for {MODEL_ID}')
    print('-' * 50)

    # Dataset
    dset_df = get_storms_df(os.path.join(PATH_TO_ROOT, DATA_DIR))
    dset_df.set_index('image_id', inplace=True, drop=True)
    dset_df = exe.load_kfolds(
        dset_df, os.path.join(PATH_TO_ROOT, DATA_DIR, KFOLDS_FILE))
    # load_kfolds is assumed to reset the index, so re-set it here
    dset_df.set_index('image_id', inplace=True, drop=True)

    if not only_test:
        # Concatenate per-fold validation predictions
        files = [
            i for i in os.listdir(os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH))
            if i.startswith(MODEL_ID + "_fold_")
        ]
        train_df = []
        for file in files:
            tmp_df = pd.read_csv(os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH,
                                              file),
                                 index_col=0)
            train_df.append(tmp_df.copy())
            # Evaluate this fold
            y_pred = tmp_df.wind_speed.round().astype(int).values
            y_true = dset_df.loc[tmp_df.index,
                                 'wind_speed'].values.astype(np.float32)
            print(f"File: {file} | "
                  f"RMSE: {math.sqrt(mean_squared_error(y_true, y_pred)):.4}")

        train_df = pd.concat(train_df, axis=0)
        train_df = train_df.loc[dset_df.index[dset_df.train]]

        # Evaluate the full out-of-fold predictions
        y_pred = train_df.wind_speed.round().astype(int).values
        y_true = dset_df.loc[train_df.index,
                             'wind_speed'].values.astype(np.float32)
        print(f"Model: {MODEL_ID} | "
              f"RMSE: {math.sqrt(mean_squared_error(y_true, y_pred)):.4}")

        # Save folds predictions
        filepath = os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH,
                                MODEL_ID + "_folds.csv.gz")
        print(f"Saving FOLDS predictions: {filepath}")
        train_df.to_csv(filepath, index=True)

    # Concatenate test predictions (one file per fold model)
    files = [
        i for i in os.listdir(os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH))
        if i.startswith(MODEL_ID + "_test")
    ]
    test_df = []
    for file in files:
        tmp_df = pd.read_csv(os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH,
                                          file),
                             index_col=0)
        test_df.append(tmp_df.copy())
    test_df = pd.concat(test_df, axis=0)
    # Average the per-fold test predictions for each image
    test_df = test_df.groupby('image_id').mean()

    # Save test predictions
    filepath = os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH,
                            MODEL_ID + "_test.csv.gz")
    print(f"Saving TEST predictions: {filepath}")
    test_df.to_csv(filepath, index=True)

    # Save submission (integer wind speeds)
    filepath = os.path.join(PATH_TO_ROOT, PREDICTIONS_PATH,
                            MODEL_ID + "_submission.csv")
    sub_df = test_df.copy()
    sub_df = sub_df.round().astype(int)
    print(f"Saving SUBMISSION predictions: {filepath}")
    sub_df.to_csv(filepath, index=True)
    print('')
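# A hypothetical command-line entry point for the concatenation step above,
# assuming this function lives in its own script; the flag names are
# illustrative, and the project may invoke `main` differently.

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description="Concatenate per-fold L1 predictions into "
                    "out-of-fold, test and submission files")
    parser.add_argument('--model_id', required=True,
                        help="L1 model identifier (appended to the L1A_ prefix)")
    parser.add_argument('--debug', action='store_true',
                        help="Use the debug_ prefix instead of the regular one")
    parser.add_argument('--only_test', action='store_true',
                        help="Skip fold concatenation; only build test files")
    cli_args = parser.parse_args()
    main(cli_args.model_id, cli_args.debug, cli_args.only_test)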
def main(data_folder, avg, num_iterations, predictions_folder):
    MODEL_TYPE = 'L2A'
    TRAINING_VERSION = ''
    OUTPUT_NAME = f"{MODEL_TYPE}_{MODEL_VERSION}{TRAINING_VERSION}"
    OUTPUT_PATH = predictions_folder

    models_list = MODELS_LIST.copy()
    print('Models: ', models_list)
    print('MAIN Models: ', MAIN_PRED_LIST)

    # Load L1 predictions
    train_L1_pred_df, test_L1_pred_df = load_data(
        data_folder,
        models_list,
        avg=avg,
        predictions_folder=predictions_folder)

    # Load MAIN predictions. These models' predictions will be averaged.
    if len(MAIN_PRED_LIST) > 0:
        train_MAIN_pred_df, test_MAIN_pred_df = load_data(
            data_folder,
            MAIN_PRED_LIST,
            avg=True,
            predictions_folder=predictions_folder)
        train_MAIN_pred_df.columns = [
            'MAIN_' + str(s1) for s1 in train_MAIN_pred_df.columns
        ]
        test_MAIN_pred_df.columns = [
            'MAIN_' + str(s1) for s1 in test_MAIN_pred_df.columns
        ]

    # Load main dataset
    dset_df = get_storms_df(data_folder)
    dset_df.set_index('image_id', inplace=True, drop=True)

    # Create storms dataset
    storm_df = dset_df[['storm_id', 'storm_duration']].copy()
    storm_df['image_id'] = storm_df.index
    storm_df.set_index(['storm_id', 'storm_duration'], inplace=True, drop=True)

    # Read prev_data_df: dataframe with the image_id of previous data points
    prev_data_parameters = {
        # Gaps to the previous data points, in hours
        'gaps': [1, 2, 4, 6, 8, 10, 12, 16, 20, 24],
        # If a previous data point doesn't exist, search within +/- margin hours
        'margin': 0.5
    }
    prev_data_df = get_prev_data_df(data_folder, dset_df, prev_data_parameters)
    prev_data_df.set_index('image_id', inplace=True, drop=True)

    # Load folds
    folds = pd.read_csv(os.path.join(data_folder, "4Kfolds_202012220741.csv"))
    folds.set_index('image_id', drop=True, inplace=True)
    folds = folds.loc[train_L1_pred_df.index]
    folds.sort_index(inplace=True)

    ## Build features dataframe
    # Add previous data
    train_features_df, test_features_df = add_prev_data(
        prev_data_df, train_L1_pred_df, test_L1_pred_df, verbose=True)

    if len(MAIN_PRED_LIST) > 0:
        # Get MAIN features and add them to the train and test sets
        train_main_feat_df, test_main_feat_df = add_prev_data(
            prev_data_df, train_MAIN_pred_df, test_MAIN_pred_df)
        train_features_df = pd.concat([train_features_df, train_main_feat_df],
                                      axis=1)
        test_features_df = pd.concat([test_features_df, test_main_feat_df],
                                     axis=1)

    train_features_df.sort_index(inplace=True)
    test_features_df.sort_index(inplace=True)

    # Add extra features
    def add_extra_features(features_df):
        features_df['ocean'] = dset_df.loc[features_df.index, 'ocean']
        features_df['storm_duration'] = dset_df.loc[features_df.index,
                                                    'storm_duration']
        return features_df

    train_features_df = add_extra_features(train_features_df)
    test_features_df = add_extra_features(test_features_df)

    # Final columns (no columns are excluded)
    feature_columns = list(train_features_df.columns)

    # Train the model
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'rmse',
        'metric': {'rmse'},
        'num_threads': 8,
        'num_iterations': num_iterations,
        # Core parameters
        'learning_rate': 0.005,
        # Learning control parameters
        'max_depth': 5,
        'feature_fraction': 0.8,  # colsample_bytree
        'bagging_freq': 1,
        'bagging_fraction': 0.6,  # subsample
        'num_leaves': 16,
        'min_data_in_leaf': 15,
        'verbosity': -1,
    }

    all_valid_y = []
    all_valid_field_ids = []
    all_predicts = []
    all_valid_predicts = []
    best_iterations = []
    fe = []
    for fold in sorted(folds.fold.unique()):
        train_field_ids_i = folds[folds.fold != fold].index.values
        val_field_ids_i = folds[(folds.fold == fold)
                                & folds.for_validation].index.values

        features_data = train_features_df[feature_columns].astype(np.float32)
        X_train = features_data.loc[train_field_ids_i, :].values
        X_valid = features_data.loc[val_field_ids_i, :].values
        y_train = dset_df.loc[train_field_ids_i, 'wind_speed'].values
        y_valid = dset_df.loc[val_field_ids_i, 'wind_speed'].values

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid)
        model_lgb = lgb.train(params,
                              train_data,
                              valid_sets=[valid_data],
                              early_stopping_rounds=50,
                              verbose_eval=False)
        best_iterations.append(model_lgb.best_iteration)
        print(f"Fold - {fold} - "
              f"{model_lgb.best_score['valid_0']['rmse']:.4}"
              f"@{model_lgb.best_iteration}")

        # Save model
        #filepath = os.path.join(MODELS_PATH, OUTPUT_NAME + f"_model_{fold}.gbm")
        #model_lgb.save_model(filepath, model_lgb.best_iteration)

        # As a little trick, at prediction time, substitute predicted past
        # values with the actual values wherever these are known.
        # Update validation L1 features with the known wind_speed values
        VALID_train_L1_pred_df = train_L1_pred_df.copy()
        for column in VALID_train_L1_pred_df.columns:
            VALID_train_L1_pred_df.loc[train_field_ids_i, column] = \
                dset_df.loc[train_field_ids_i, 'wind_speed']

        # Update validation MAIN features with the known wind_speed values
        if len(MAIN_PRED_LIST) > 0:
            VALID_train_MAIN_pred_df = train_MAIN_pred_df.copy()
            VALID_train_MAIN_pred_df.loc[train_field_ids_i, 'MAIN_wind_speed'] = \
                dset_df.loc[train_field_ids_i, 'wind_speed']

        ## Build features dataframe
        # Add previous data
        VALID_train_features_df, INFER_test_features_df = add_prev_data(
            prev_data_df, VALID_train_L1_pred_df, test_L1_pred_df,
            verbose=True)

        if len(MAIN_PRED_LIST) > 0:
            # Get MAIN features and add them to the train and test sets
            train_main_feat_df, test_main_feat_df = add_prev_data(
                prev_data_df, VALID_train_MAIN_pred_df, test_MAIN_pred_df)
            VALID_train_features_df = pd.concat(
                [VALID_train_features_df, train_main_feat_df], axis=1)
            INFER_test_features_df = pd.concat(
                [INFER_test_features_df, test_main_feat_df], axis=1)

        VALID_train_features_df.sort_index(inplace=True)
        INFER_test_features_df.sort_index(inplace=True)

        # Add extra features
        VALID_train_features_df = add_extra_features(VALID_train_features_df)
        INFER_test_features_df = add_extra_features(INFER_test_features_df)

        # Select columns
        VALID_train_features_df = VALID_train_features_df[
            feature_columns].astype(np.float32)
        INFER_test_features_df = INFER_test_features_df[
            feature_columns].astype(np.float32)

        # Make predictions for the VALIDATION set
        y_valid_pred = model_lgb.predict(
            VALID_train_features_df.loc[val_field_ids_i],
            num_iteration=model_lgb.best_iteration)

        # Make predictions for the TEST set
        ypred = model_lgb.predict(INFER_test_features_df,
                                  num_iteration=model_lgb.best_iteration)

        all_valid_field_ids.extend(val_field_ids_i)
        all_predicts.append(ypred)
        all_valid_y.extend(y_valid)
        all_valid_predicts.append(y_valid_pred)
        fe.append(model_lgb.feature_importance())  # per-fold feature importances

    all_valid_predicts = np.concatenate(all_valid_predicts)
    all_valid_y = np.array(all_valid_y)
    print(f'Validation RMSE - {rmse(all_valid_y, all_valid_predicts):.4f}')

    # Save folds predictions
    valid_preds_df = pd.DataFrame(all_valid_predicts,
                                  index=all_valid_field_ids,
                                  columns=['wind_speed'])
    valid_preds_df.index.name = 'image_id'
    filepath = os.path.join(OUTPUT_PATH, OUTPUT_NAME + '_folds.csv.gz')
    print(f"Saving VALIDATION predictions: {filepath}")
    valid_preds_df.to_csv(filepath, index=True)

    # Save test predictions, averaged over the fold models
    result = np.mean(all_predicts, axis=0)
    test_preds_df = pd.DataFrame(result,
                                 index=test_features_df.index,
                                 columns=['wind_speed'])
    test_preds_df.index.name = 'image_id'
    filepath = os.path.join(OUTPUT_PATH, OUTPUT_NAME + '_test.csv.gz')
    print(f"Saving TEST predictions: {filepath}")
    test_preds_df.to_csv(filepath, index=True)

    # Save submission (integer wind speeds)
    filepath = os.path.join(OUTPUT_PATH, OUTPUT_NAME + "_submission.csv")
    sub_df = test_preds_df.copy()
    sub_df = sub_df.round().astype(int)
    print(f"Saving SUBMISSION predictions: {filepath}")
    sub_df.to_csv(filepath, index=True)
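# A minimal driver for the L2A stacking step, assuming this function lives in
# its own script. The argument values below are the ones used in this file;
# how the project actually invokes `main` is an assumption.

if __name__ == '__main__':
    main(data_folder='data',
         avg=False,
         num_iterations=10000,
         predictions_folder='predictions')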