Пример #1
0
def train_model(X_train, characteristic, n_threads):
    """Build an isolation forest (``ExtensionLevel=1``) from the training data.

    The input is wrapped in a ``pandas.DataFrame`` and passed through
    ``lp.load_data_float`` before the forest is constructed.

    Args:
        X_train: Training data; anything ``pd.DataFrame`` accepts.
        characteristic: Object exposing ``ntrees`` and ``sample_size``; only
            read when the prepared data has a length above 1000.
        n_threads: Forwarded to ``iso.iForest`` as ``n_threads`` in the
            large-data case only.

    Returns:
        The constructed ``iso.iForest`` instance.
    """
    prepared = lp.load_data_float(pd.DataFrame(X_train))
    n_samples = len(prepared)

    # Small inputs: fixed 5000 trees, each one sampling the whole data set.
    if n_samples <= 1000:
        return iso.iForest(
            prepared,
            ntrees=5000,
            sample_size=n_samples,
            ExtensionLevel=1,
        )

    # Larger inputs: tree count and sample size come from `characteristic`.
    return iso.iForest(
        prepared,
        ntrees=characteristic.ntrees,
        sample_size=characteristic.sample_size,
        ExtensionLevel=1,
        n_threads=n_threads,
    )
Пример #2
0
    def transform(self, X):
        """Score rows with an extended isolation forest and record the outliers.

        A new ``eif.iForest`` is built on ``X[self.columns]`` on every call and
        the same values are scored with ``compute_paths``. The index labels of
        the ``ceil(self.anomalies_ratio * X.shape[0])`` rows with the highest
        scores are stored in ``self.outliers``.

        Args:
            X: DataFrame-like input containing at least ``self.columns``.

        Returns:
            ``X`` itself, or, when ``self.drop_outliers`` is truthy, ``X``
            with the outlier rows dropped and its index reset.
        """
        features = X[self.columns].values

        # Build the forest and score the very same feature matrix.
        forest = eif.iForest(
            X=features,
            ntrees=self.n_trees,
            sample_size=self.sample_size,
            ExtensionLevel=self.extension_level,
        )
        scores = forest.compute_paths(X_in=features)

        # One score per observation, highest first.
        score_column = "anomaly score"
        ranked = pd.DataFrame(scores, index=X.index, columns=[score_column])
        ranked = ranked.sort_values([score_column], ascending=False)

        # Keep the index labels of the top (anomalies_ratio * total rows)
        # observations, rounded up.
        n_outliers = int(np.ceil(self.anomalies_ratio * X.shape[0]))
        self.outliers = np.array(ranked[:n_outliers].index)

        if not self.drop_outliers:
            return X

        # Optionally remove the flagged observations from the input.
        return X.drop(self.outliers, axis=0).reset_index(drop=True)
Пример #3
0
    def extended_isolation_forest(self, contamination):
        """Score ``self.training`` with an extended isolation forest.

        Appends ``'extended_isolation_forest'`` to ``self.report``, builds an
        ``iso.iForest`` (100 trees, sample size 256, ``ExtensionLevel=2``) on
        ``self.training`` cast to ``float64`` and scores that same matrix.

        Args:
            contamination: Not used by this method's body.

        Returns:
            The result of ``self.uni_boxplot_outlier_det`` applied to the
            anomaly scores, passed as a ``pd.Series`` carrying the index of
            ``self.training``.
        """
        self.report.append('extended_isolation_forest')

        # Cast once and reuse: the forest is fitted and scored on the same
        # float64 matrix instead of scoring the un-cast ``.values``.
        features = self.training.astype('float64').values
        if_eif = iso.iForest(features,
                             ntrees=100,
                             sample_size=256,
                             ExtensionLevel=2)
        anomaly_scores = if_eif.compute_paths(X_in=features)
        anomaly_scores = pd.Series(anomaly_scores)
        anomaly_scores.index = self.training.index
        return self.uni_boxplot_outlier_det(anomaly_scores)
Пример #4
0
 def _construct(self, X):
     """Extend the parent's model with an ``eif.iForest`` built on ``X``.

     The model returned by ``super()._construct(X)`` receives ``psi``
     (``self.psi`` capped at the number of rows of ``X``), ``t`` and
     ``random_state``, and a ``forest`` attribute holding the forest.
     The extension level is one less than the number of columns of ``X``;
     ``self.eif_params`` is forwarded as extra keyword arguments.

     Returns:
         The augmented model object.
     """
     import eif

     model = super()._construct(X)

     # The sample size cannot exceed the number of available rows.
     model.psi = min(self.psi, X.shape[0])
     model.t = self.t
     model.random_state = self.random_state

     extension_level = X.shape[1] - 1
     model.forest = eif.iForest(
         X,
         ntrees=model.t,
         sample_size=model.psi,
         seed=model.random_state,
         ExtensionLevel=extension_level,
         **self.eif_params,
     )
     return model
Пример #5
0
 def learning_process_prediction_ext_iso_f(self):
     """Build the extended isolation forest on ``self.X`` and store its scores.

     Calls ``get_time()`` and ``self.get_x_values()``, constructs the forest
     from ``self.params`` (``'num_of_trees'``, ``'sample_size'``) with an
     extension level of ``len(self.features) - 1``, writes the scores of
     ``self.X`` into ``self.data`` under the column named by
     ``self.model_params['args']['pred_field']``, and finally calls
     ``self.train_test_split()``. Progress is reported with ``print``.
     """
     print("Extended isolation forest train process is initialized!!")
     get_time()
     self.get_x_values()

     extension_level = len(self.features) - 1
     self.model_e_iso_f = iso.iForest(
         self.X,
         ntrees=self.params['num_of_trees'],
         sample_size=self.params['sample_size'],
         ExtensionLevel=extension_level,
     )

     # Score the same matrix the forest was built on and keep the result
     # next to the data under the configured prediction column.
     scores = self.model_e_iso_f.compute_paths(X_in=self.X)
     pred_field = self.model_params['args']['pred_field']
     self.data[pred_field] = scores

     self.train_test_split()
     print("Extended Isolation Forest Model Train Process Done!")
Пример #6
0
        IsolationForest(
            n_estimators=500,
            behaviour="new",
            contamination=outliers_fraction,
            random_state=42,
        ),
    ),
    (
        "Local Outlier Factor",
        LocalOutlierFactor(
            n_neighbors=35, contamination=outliers_fraction, novelty=False
        ),
    ),
    (
        "Extended IF",
        iso.iForest(datasets3D[0], ntrees=500, sample_size=255, ExtensionLevel=1),
    ),
    (
        "USPORF",
        UnsupervisedRandomForest(
            feature_combinations="auto",
            max_depth=None,
            max_features="auto",
            min_samples_split="auto",
            n_estimators=500,
            n_jobs=None,
            projection_matrix="RerF",
        ),
    ),
]
Пример #7
0
def AnomalyDetection(df,
                     chamber,
                     model,
                     percent,
                     x_train,
                     x_test,
                     scoring=True,
                     contamination=0.001,
                     show_params=False,
                     show=True,
                     save=False):
    """Build the detector named by ``model`` and score ``x_train``/``x_test``.

    Args:
        df: Only its length is used, to compute ``slicing``.
        chamber: Not referenced in the visible body.
        model: One of ``"extendedIsolationForest"``, ``"IsolationForest"``,
            ``"LocalOutlierFactor"`` or ``"OneClassSVM"``. Any other value
            prints ``"model selection error"`` and returns ``None``.
        percent: Multiplied by ``len(df)`` to compute ``slicing``, which is
            not read again in the visible body.
        x_train: Training data. The extended-isolation-forest branch reads
            ``x_train.values``; the scikit-learn branches pass it as-is.
        x_test: Test data, handled like ``x_train``.
        scoring: scikit-learn branches only. When ``scoring == True`` the
            negated ``score_samples`` output is used, otherwise the negated
            ``predict`` output.
        contamination: Passed to ``IsolationForest`` only.
        show_params: Not referenced in the visible body.
        show: Not referenced in the visible body.
        save: Not referenced in the visible body.

    Returns:
        ``(r, theta)`` according to the final statement.
        NOTE(review): neither ``r`` nor ``theta`` is assigned anywhere in the
        visible body, and ``train_pred``/``test_pred`` are never returned.
        The last two statements look like they were spliced in from another
        function -- confirm against the original source.
    """
    # Row count corresponding to `percent` of df; unused below.
    slicing = int(len(df) * percent)

    if model == "extendedIsolationForest":
        import eif as iso
        # ExtensionLevel=0 is the same as regular Isolation Forest
        clf = iso.iForest(x_train.values,
                          ntrees=200,
                          sample_size=256,
                          ExtensionLevel=1)
        print("fitting finished")
        # Scores come straight from compute_paths; no sign flip in this branch.
        train_pred = clf.compute_paths(X_in=x_train.values)
        test_pred = clf.compute_paths(X_in=x_test.values)
        print("scoring finished")

    else:
        if model == "IsolationForest":
            from sklearn.ensemble import IsolationForest
            # contamination : the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the sample
            clf = IsolationForest(n_estimators=50,
                                  contamination=contamination,
                                  random_state=0)

        elif model == "LocalOutlierFactor":
            from sklearn.neighbors import LocalOutlierFactor
            # If you really want to use neighbors.LocalOutlierFactor for novelty detection,
            # i.e. predict labels or compute the score of abnormality of new unseen data,
            # you can instantiate the estimator with the novelty parameter set to True before fitting the estimator.
            clf = LocalOutlierFactor(n_neighbors=5, novelty=True)

        elif model == "OneClassSVM":
            from sklearn.svm import OneClassSVM
            clf = OneClassSVM(gamma='auto')
        else:
            # Unknown model name: report it and bail out with None.
            clf = None
            print("model selection error")
            return

        clf.fit(x_train)
        print("fitting finished")

        # pred = -(clf.predict(x_test)) # predict: Returns 1 for outliers and -1 for inliers
        # pred = -(clf.score_samples(x_test[features])) # score_samples : Returns anomaly score 0~1 (0 for normal, 1 for anomal)

        # Both variants negate the estimator's output before storing it.
        if scoring == True:
            train_pred = -(clf.score_samples(x_train))
            test_pred = -(clf.score_samples(x_test))
        else:
            train_pred = -(clf.predict(x_train))
            test_pred = -(clf.predict(x_test))

        print("scoring finished")
        # NOTE(review): `r` is read here before any assignment in this
        # function -- this line presumably belongs to a different snippet.
        r = np.sort(np.array(r))
    # NOTE(review): `theta` is never assigned in the visible body, and in the
    # extendedIsolationForest branch `r` is not assigned either.
    return r, theta

# Script entry point: for every orbit index, load a data set, build an
# isolation forest on it, score the same data and print one summary ratio.
if __name__ == "__main__":
    # `confusion_matrices` is not used again in the visible part of the script.
    confusion_matrices = []
    All_orbits = []
    # Both buffers are re-bound from Dataset_order's return value on every
    # iteration below.
    X_buffer = []
    Y_buffer = []
    # Flags handed to Dataset_order unchanged on every iteration.
    buffer = False
    binary_set = True
    use_previously_saved_models = False
    categorical_num = True
    
    for index in range(SET_PARAMS.Number_of_multiple_orbits):
        Y, Y_buffer, X, X_buffer, Orbit = Dataset_order(index, binary_set, buffer, categorical_num, use_previously_saved_models)
        All_orbits.append(Orbit)

        # Forest of 500 trees, 1000 samples per tree, extension level 1.
        F1 = iso.iForest(X, ntrees = 500, sample_size = 1000, ExtensionLevel=1)

        # Path computation for a single 2-element point on tree 0; the result
        # (SL0) is not used afterwards in the visible code.
        xxx = np.array([[0,0.]])
        SL0 = F1.compute_paths_single_tree(xxx, 0)

        # Score the data the forest was built on.
        S1 = F1.compute_paths(X_in=X)

        # Indices that sort the scores in ascending order.
        ss1=np.argsort(S1)

        # Number of entries of Y that are odd.
        number_of_errors = np.sum(Y % 2 == 1)
        # Mean of Y over the `number_of_errors` LOWEST-scoring rows.
        # NOTE(review): if higher scores mean "more anomalous", the highest
        # scores (ss1[-number_of_errors:]) may have been intended -- confirm.
        # NOTE(review): divides by zero when Y has no odd entries.
        print(np.sum(Y[ss1[:number_of_errors]])/number_of_errors, index)

"""
To determine whether a single point within
# Flat script: load wind-speed/power readings, score them with an isolation
# forest, flag the top-scoring 2 % as outliers and write the remaining rows
# to "EIF4.csv".
data = pd.read_csv(
    "C:/Users/Reinis Fisers/PycharmProjects/TF_TEST/HalfYearFilteredNoNAN.csv")
# Keep only the last 100000 rows.
data = data.tail(100000)
x = data['WindSpeed_mps']
y = data['Power_kW']

###  Create a two-dimensional array from the dataset  ###
z = np.array((list(zip(x, y))))

###  Create the dataframe  ###
new_data = pd.DataFrame(np.array(z), columns=['A', 'B'])

###  Fitting into Extended Isolation Forest Model  ###
anomalies_ratio = 0.02
# NOTE(review): ExtensionLevel=0.9 is not an integer, unlike every other
# iForest call in this file -- confirm whether 0 or 1 was intended.
# NOTE(review): the variable name `eif` would shadow a module imported under
# that name, if there is one in this file -- verify.
eif = iso.iForest(new_data.values,
                  ntrees=3000,
                  sample_size=100,
                  ExtensionLevel=0.9)
anomaly_scores = eif.compute_paths(X_in=new_data.values)
# Ascending order, so the highest scores sit at the end.
anomaly_scores_sorted = np.argsort(anomaly_scores)
# Positions of the ceil(anomalies_ratio * n_rows) highest-scoring rows.
indices_with_preds = anomaly_scores_sorted[
    -int(np.ceil(anomalies_ratio * new_data.shape[0])):]
# 0/1 marker per row: 1 marks a flagged outlier.
outliers = np.zeros_like(y)
outliers[indices_with_preds] = 1

###  Getting the data cleaned of outliers  ###
x_cleaned = data[np.where(outliers != 1, True, False)]
x_cleaned.to_csv("EIF4.csv")

###  Loading the created dataset  ###
# NOTE(review): the file is written with a relative path above but read back
# with an absolute one; they match only if the working directory is TF_TEST.
data1 = pd.read_csv("C:/Users/Reinis Fisers/PycharmProjects/TF_TEST/EIF4.csv")
x1 = data1['WindSpeed_mps']
Пример #10
0
    ax1.set_xlabel("Anomaly")
    ax1.set_ylim(0, forest.limit)

    ax1.axes.get_xaxis().set_visible(False)
    ax1.axes.get_yaxis().set_visible(False)
    plt.show()


# Script entry point: build an isolation forest on the training split, persist
# it, score the test split and compute the ROC curve / AUC of those scores.
if __name__ == "__main__":

    X_train, X_test, y_train, y_test = load_data()
    print("training sample nums: ", len(X_train))

    # NOTE(review): `n_jobs` is passed both here and to compute_paths below;
    # confirm that the `iso` implementation in use accepts this keyword.
    eifmodel = iso.iForest(X_train.values,
                           ntrees=100,
                           sample_size=256,
                           ExtensionLevel=1,
                           n_jobs=4)

    #  save model
    joblib.dump(eifmodel, './eiforest.pkl')
    # eifmodel = joblib.load('./eiforest.pkl')
    print("test sample nums: ", len(X_test))
    # Sum of the labels; equals the anomaly count if y_test is 0/1-encoded
    # (presumably -- verify against load_data).
    print("test anoamly sample nums: ", sum(y_test))

    # Wall-clock time of scoring the test split.
    stime = time.time()
    y_pred_test = eifmodel.compute_paths(X_test.values, n_jobs=4)
    ctime = time.time() - stime
    print("cost time is: {:.4f} ".format(ctime))
    # The raw scores are used directly as the ranking for the ROC curve.
    fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_test)
    areaUnderROC = auc(fpr, tpr)