def aplicaRUS(X_train, y_train):
    """Randomly undersample a training set and return it as one DataFrame.

    The sampler is seeded with a fixed ``random_state`` so repeated calls on
    the same input select the same rows. The number of remaining instances
    and the per-class counts are printed to stdout.

    Args:
        X_train: Feature table (must support ``reset_index``, i.e. a pandas
            DataFrame once resampled).
        y_train: Class labels, one per row of ``X_train``.

    Returns:
        A DataFrame holding the resampled feature columns followed by the
        resampled label column(s), with a fresh 0..n-1 index.
    """
    X_res, y_res = RandomUnderSampler(random_state=77145416).fit_resample(
        X_train, y_train)

    print(f"Numero de instancias: {len(X_res)}")
    print("Instancias por clase:")
    print(np.unique(y_res, return_counts=True))

    # pd.concat(axis=1) aligns on the index, so both pieces need the same
    # positional index; otherwise rows are mismatched and filled with NaN.
    # Labels that are not pandas objects are wrapped so they can be
    # concatenated at all.
    if not isinstance(y_res, (pd.Series, pd.DataFrame)):
        y_res = pd.Series(y_res)
    return pd.concat(
        [X_res.reset_index(drop=True), y_res.reset_index(drop=True)],
        axis=1,
    )
def _resample_training_frame(sampler, features, labels, columns):
    """Run ``sampler`` on the training data and return positionally indexed results.

    Args:
        sampler: An object exposing ``fit_resample(X, y)``.
        features: Training feature table.
        labels: Training labels, one per row of ``features``.
        columns: Column names for the rebuilt feature DataFrame.

    Returns:
        ``(features, labels)`` after resampling; ``features`` is a DataFrame
        with ``columns`` and a fresh 0..n-1 index.
    """
    features, labels = sampler.fit_resample(features, labels)
    # The sampler may hand back either arrays or pandas objects, and pandas
    # results do not necessarily share an index. Normalise both to a
    # positional index so a later column assignment pairs row i with label i.
    features = pd.DataFrame(data=features, columns=columns).reset_index(drop=True)
    if isinstance(labels, (pd.Series, pd.DataFrame)):
        labels = labels.reset_index(drop=True)
    return features, labels


def load_data_for_deepfm(country, test_only=False, undersample=False,
                         oversample=False, overunder=False):
    """Load a country's data in the layout expected by the DeepFM data parser.

    The labels are stored in a ``"target"`` column and the frame index is
    exposed as an ``"id"`` column. Feature columns are every test column
    except ``"id"``/``"target"`` and those listed in ``config.IGNORE_COLS``.

    Args:
        country: Identifier passed to ``get_country_filepaths``.
        test_only: If true, skip the training set entirely.
        undersample: Apply ``RandomUnderSampler`` to the training set.
        oversample: Apply ``SMOTE`` to the training set.
        overunder: Apply ``SMOTEENN`` to the training set.
            The resampling flags are independent; each enabled one is applied
            in the order listed above.

    Returns:
        If ``test_only`` is false:
        ``(dfTrain, dfTest, X_train, y_train, X_test, ids_test,
        cat_features_indices)``; otherwise
        ``(dfTest, X_test, ids_test, cat_features_indices)``.
        ``cat_features_indices`` holds the positions, within the feature
        columns, of the columns named in ``config.CATEGORICAL_COLS``.
    """
    TRAIN_PATH, TEST_PATH, _ = get_country_filepaths(country)

    dfTest, y_test, _ = load_data(TEST_PATH)
    dfTest["target"] = y_test
    # rename "hid" --> "id" for data parser
    dfTest.index.name = "id"
    dfTest = dfTest.reset_index()

    # Feature columns: everything but the bookkeeping and ignored columns.
    cols = [
        c for c in dfTest.columns
        if c not in ("id", "target") and c not in config.IGNORE_COLS
    ]

    if not test_only:
        dfTrain, y_train, _ = load_data(TRAIN_PATH)

        if undersample or oversample or overunder:
            print("X shape before resampling: ", dfTrain.shape)
        if undersample:
            dfTrain, y_train = _resample_training_frame(
                RandomUnderSampler(), dfTrain, y_train, cols)
            print("X shape after undersampling: ", dfTrain.shape)
        if oversample:
            dfTrain, y_train = _resample_training_frame(
                SMOTE(), dfTrain, y_train, cols)
            print("X shape after oversampling: ", dfTrain.shape)
        if overunder:
            dfTrain, y_train = _resample_training_frame(
                SMOTEENN(), dfTrain, y_train, cols)
            print("X shape after SMOTEENN: ", dfTrain.shape)

        # rename label "poor" --> "target"
        # also combine label into features df for data parser
        dfTrain["target"] = y_train
        dfTrain.index.name = "id"
        dfTrain = dfTrain.reset_index()

        X_train = dfTrain[cols].values
        y_train = dfTrain["target"].values

    X_test = dfTest[cols].values
    ids_test = dfTest["id"].values

    # get col index if categorical
    cat_features_indices = [
        i for i, c in enumerate(cols) if c in config.CATEGORICAL_COLS
    ]

    if test_only:
        return dfTest, X_test, ids_test, cat_features_indices
    return (dfTrain, dfTest, X_train, y_train, X_test, ids_test,
            cat_features_indices)