Example #1

# Imports required by the code below
import time

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Part 1 - Data Preprocessing
# Importing the dataset
dataset = pd.read_csv('train.csv')
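
get_cached_features is referenced below but not defined in this listing; the
hypothetical stand-in here keeps the snippet self-contained (the real project
presumably looks up precomputed information-gain rankings):

def get_cached_features(selection_params):
    # placeholder: a real implementation would return the top-k column
    # indices ranked by information gain; here we just take the first k
    # feature columns after the id/target columns
    k = selection_params["number_of_features"]
    return list(range(2, 2 + k))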

def preproc(dataset,
            mode,
            oneHot,
            scale=False,
            scaler=None,
            feature_selection=False):
    """Select feature columns, impute missing values, optionally one-hot
    encode the categorical features and scale. Returns (X, y), plus the
    fitted StandardScaler when scale=True and no scaler is passed in."""
    # categorical and binary features
    categorical_features = [
        3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 23, 24, 25, 26, 27,
        28, 29, 30, 31, 32, 33, 53, 54, 55, 56, 57, 58
    ]

    # continuous values
    continuous_values = [
        2, 4, 15, 16, 20, 21, 22, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
        45, 46, 47, 48, 49, 50, 51, 52
    ]

    if feature_selection:
        feat = "infogain"
    else:
        feat = "none"

    number_of_features = 10
    alpha = 32
    max_depth = 4
    n_estimators = 100
    loss = "rank:pairwise"
    parameters = {
        "feature_selection": {
            "name": feat,
            "number_of_features": number_of_features
        },
        "classifier": {
            "name": "xgboost",
            "loss": {
                "name": loss,
                "alpha": alpha
            },
            "max_depth": max_depth,
            "n_estimators": n_estimators
        }
    }

    if feat == "infogain":
        categorical_features = get_cached_features(
            parameters["feature_selection"])
        continuous_values = []

    categorical_features_count = len(categorical_features)

    # Whether or not feature selection is active, the concatenation below is
    # correct: with infogain selection, continuous_values was emptied above.
    if mode == 'train':
        selected_features = categorical_features + continuous_values
    elif mode == 'test':
        selected_features = categorical_features + continuous_values
        # the test file lacks the target column, so every index shifts by one
        selected_features = np.array(selected_features) - 1
    else:
        raise ValueError("mode must be 'train' or 'test'; any other value "
                         "would leave selected_features unassigned")

    X = dataset.iloc[:, selected_features].values
    y = dataset.iloc[:, 1].values
    column_ranges = []

    print("replacing missing values")
    t0 = time.time()
    print("number of examples: " + str(len(X[:, 0])))
    for i in range(len(X[0, :])):
        # columns [0, categorical_features_count) hold categorical features
        if i < categorical_features_count:
            # for a categorical column, replace missing entries with the most
            # frequent non-negative value
            (values, counts) = np.unique(X[:, i], return_counts=True)
            counts = [
                counts[k] if values[k] >= 0 else 0 for k in range(len(values))
            ]
            ind = np.argmax(counts)
            column_ranges.append(max(values))
            replacement_value = values[ind]
        else:
            # otherwise simply use the column mean
            replacement_value = np.mean(X[:, i])

        # missing entries appear to be encoded as negative values here
        for j in range(len(X[:, i])):
            if X[j, i] < -0.5:
                X[j, i] = replacement_value

    if oneHot:
        print("One hot encoding")
        # NB: the new indicator columns are inserted just ahead of the
        # remaining columns. The categorical_features argument was removed in
        # scikit-learn 0.22, so this snippet assumes an older release.
        onehotencoder = OneHotEncoder(
            categorical_features=range(categorical_features_count))
        X = onehotencoder.fit_transform(X).toarray()

        # to avoid the dummy-variable trap, drop one column for each encoded
        # feature; strictly binary features never needed extra columns in the
        # first place (a detail worth revisiting in a later pass)

        to_delete = []
        t = 0

        for i in range(categorical_features_count):
            to_delete.append(t)
            t += column_ranges[i]

        mask = []

        for s in range(len(X[0, :])):
            if s not in to_delete:
                mask.append(s)

        X = X[:, mask]

    if scale:
        if scaler is None:
            sc = StandardScaler()
            X = sc.fit_transform(X)
            return X, y, sc
        else:
            X = scaler.transform(X)
    return X, y
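
A minimal usage sketch, assuming a test.csv laid out like train.csv without
the target column (the file name is an assumption, not part of the original):

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# fit imputation, encoding and scaling on the training data, then reuse the
# fitted scaler on the test data
X_train, y_train, sc = preproc(train_df, mode='train', oneHot=True, scale=True)
X_test, _ = preproc(test_df, mode='test', oneHot=True, scale=True, scaler=sc)

# caveat: the one-hot encoder is refit on each call, so the test columns only
# line up with the training columns when both files contain the same category
# values for every feature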