예제 #1
0
def aplicaRUS(X_train, y_train):
    """Randomly under-sample a training set and return it as a single frame.

    A ``RandomUnderSampler`` with a fixed ``random_state`` is fitted on the
    given features and labels. The number of resampled rows and the
    per-class counts are printed, then features and labels are joined
    column-wise.

    Args:
        X_train: Feature table; the resampled result must support
            ``reset_index`` (i.e. behave like a pandas DataFrame).
        y_train: Labels aligned with ``X_train``.

    Returns:
        A DataFrame holding the resampled features (index reset) with the
        resampled labels appended as the last column(s).
    """
    sampler = RandomUnderSampler(random_state=77145416)
    features, labels = sampler.fit_resample(X_train, y_train)

    print(f"Numero de instancias: {len(features)}")
    print("Instancias por clase:")
    class_counts = np.unique(labels, return_counts=True)
    print(class_counts)

    # Only the feature index is reset; labels are used as the sampler
    # returned them.
    features = features.reset_index(drop=True)
    return pd.concat([features, labels], axis=1)
예제 #2
0
def _resample_training_frame(sampler, X, y, keep_cols, message):
    """Resample ``X``/``y`` with ``sampler`` and return a labelled DataFrame.

    Args:
        sampler: Resampler object exposing ``fit_resample`` (or the legacy
            ``fit_sample`` on old imbalanced-learn releases).
        X: Training features as a DataFrame.
        y: Training labels aligned with ``X``.
        keep_cols: Column names the returned frame is restricted to.
        message: Text printed in front of the resampled frame's shape.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` where ``X_resampled`` is a
        DataFrame with exactly the columns in ``keep_cols``.
    """
    # Capture the real column names first: some sampler versions hand back a
    # bare array, and it has one column per *input* column, not per keep_col.
    feature_names = list(X.columns)

    # ``fit_sample`` was replaced by ``fit_resample``; prefer the new name and
    # only fall back when running against an old release.
    fit = getattr(sampler, "fit_resample", None)
    if fit is None:
        fit = sampler.fit_sample
    X_res, y_res = fit(X, y)

    X_res = pd.DataFrame(data=X_res, columns=feature_names)
    X_res = X_res.reindex(columns=keep_cols)
    print(message, X_res.shape)
    return X_res, y_res


def load_data_for_deepfm(country,
                         test_only=False,
                         undersample=False,
                         oversample=False,
                         overunder=False):
    """Load a country's data in the layout used by the DeepFM data parser.

    The test set is always loaded. Unless ``test_only`` is set, the training
    set is loaded too and optionally resampled. In both frames the label is
    stored in a ``"target"`` column and the index is moved into an ``"id"``
    column.

    Args:
        country: Passed to ``get_country_filepaths`` to locate the files.
        test_only: When true, skip the training data entirely.
        undersample: Apply ``RandomUnderSampler`` to the training data.
        oversample: Apply ``SMOTE`` to the training data.
        overunder: Apply ``SMOTEENN`` to the training data.
            The three resampling flags are independent; every flag that is
            set is applied, in the order listed above.

    Returns:
        If ``test_only`` is false:
            ``(dfTrain, dfTest, X_train, y_train, X_test, ids_test,
            cat_features_indices)``.
        Otherwise:
            ``(dfTest, X_test, ids_test, cat_features_indices)``.
        ``X_*`` are value arrays of the feature columns (everything except
        ``"id"``, ``"target"`` and ``config.IGNORE_COLS``), and
        ``cat_features_indices`` lists the positions within those feature
        columns that appear in ``config.CATEGORICAL_COLS``.
    """
    TRAIN_PATH, TEST_PATH, _ = get_country_filepaths(country)

    # The third value returned by load_data is not needed here.
    dfTest, y_test, _ = load_data(TEST_PATH)
    dfTest["target"] = y_test
    # rename "hid" --> "id" for data parser
    dfTest.index.name = "id"
    dfTest = dfTest.reset_index()

    # Feature columns: drop the bookkeeping columns and the configured ignores.
    cols = [
        c for c in dfTest.columns
        if c not in ("id", "target") and c not in config.IGNORE_COLS
    ]

    if not test_only:
        dfTrain, y_train, _ = load_data(TRAIN_PATH)

        if undersample or oversample or overunder:
            print("X shape before resampling: ", dfTrain.shape)
        if undersample:
            dfTrain, y_train = _resample_training_frame(
                RandomUnderSampler(), dfTrain, y_train, cols,
                "X shape after undersampling: ")
        if oversample:
            dfTrain, y_train = _resample_training_frame(
                SMOTE(), dfTrain, y_train, cols,
                "X shape after oversampling: ")
        if overunder:
            dfTrain, y_train = _resample_training_frame(
                SMOTEENN(), dfTrain, y_train, cols,
                "X shape after SMOTEENN: ")

        # rename label "poor" --> "target"
        # also combine label into features df for data parser
        dfTrain["target"] = y_train
        dfTrain.index.name = "id"
        dfTrain = dfTrain.reset_index()

        X_train = dfTrain[cols].values
        y_train = dfTrain["target"].values

    X_test = dfTest[cols].values
    ids_test = dfTest["id"].values

    # get col index if categorical
    cat_features_indices = [
        i for i, c in enumerate(cols) if c in config.CATEGORICAL_COLS
    ]

    if test_only:
        return dfTest, X_test, ids_test, cat_features_indices
    return dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices