# Example no. 1
0
def RemoveMultiOutliers(dataset_X, dataset_Y, col1, col2):
    """Drop bivariate outliers detected by Local Outlier Factor, in place.

    A ``LocalOutlierFactor`` model (20 neighbours, 1% contamination) is fitted
    on the ``(col1, col2)`` value pairs of ``dataset_X``. Every row of
    ``dataset_X`` whose pair is exactly equal to a pair labelled as an outlier
    (label ``-1``) is dropped, and the same index labels are then dropped
    from ``dataset_Y``.

    Args:
        dataset_X: Table providing ``col1`` and ``col2``; modified in place.
        dataset_Y: Object that is dropped by the same index labels; modified
            in place. Presumably the targets aligned with ``dataset_X`` --
            verify against the caller.
        col1: Name of the first column used for outlier detection.
        col2: Name of the second column used for outlier detection.

    Returns:
        None. Both inputs are mutated; nothing is returned.
    """
    points = list(
        zip(dataset_X[col1].values.tolist(), dataset_X[col2].values.tolist()))
    if not points:
        # Nothing to fit on and nothing to remove.
        return

    # fit the model for outlier detection (default)
    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.01)
    # Use the public fit_predict to label the training samples (when LOF is
    # used for outlier detection, the estimator has no predict,
    # decision_function and score_samples methods).
    labels = clf.fit_predict(points)

    # Distinct (col1, col2) pairs that were flagged as outliers.
    outlier_points = {
        point for point, label in zip(points, labels) if label == -1
    }

    # Match rows on BOTH values exactly, so that rows which merely share col1
    # with an outlier are kept. The set removes repeated index labels.
    to_remove = list({
        idx for idx, point in zip(dataset_X.index, points)
        if point in outlier_points
    })
    dataset_X.drop(to_remove, inplace=True)
    dataset_Y.drop(to_remove, inplace=True)