Example #1
def isolation_forest(df,
                     df_class,
                     df_train,
                     df_train_class,
                     maximize_score=F1_SCORE,
                     n_jobs=-1,
                     bootstrap=True,
                     n_estimators=40,
                     train_size=0.5,
                     random_seed=10):
    """Isolation-forest detector: fit on the training split, tune the
    decision threshold on the validation split to maximize the requested
    score, then classify the full dataset against that threshold."""

    df_train_common_class = df_anomaly_instances(df_train_class)
    df_train_with_common_class = df_train.join(df_train_common_class)

    df_common_class = df_anomaly_instances(df_class)

    train, valid = get_train_valid_sets(df_train_with_common_class,
                                        train_size=train_size,
                                        random_seed=random_seed)

    model = IsolationForest(random_state=random_seed,
                            n_jobs=n_jobs,
                            max_samples=train.shape[0],
                            bootstrap=bootstrap,
                            n_estimators=n_estimators)
    model.fit(train.drop('class', axis=1).values)

    thresholds = np.linspace(0, 1, 200)
    thresholds = np.round(thresholds, 7)  # round thresholds

    y_scores = min_max_normalization(
        model.decision_function(valid.drop('class', axis=1).values))
    y_scores = np.round(y_scores, 7)  # round scores

    training_threshold_scores = get_threshold_scores(thresholds, y_scores,
                                                     valid['class'])
    selected_index = get_max_score_index_for_score_type(
        training_threshold_scores, maximize_score)
    selected_threshold = thresholds[selected_index]

    # detection on dataset
    scores = min_max_normalization(model.decision_function(df.values))
    scores = np.round(scores, 7)  # round scores

    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = df_common_class['class'].values.astype(int)
    detection_threshold_scores = get_threshold_scores(thresholds, scores,
                                                      df_common_class['class'])
    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)

    info['thresholds'] = thresholds.tolist()
    info['training_threshold_scores'] = training_threshold_scores.tolist()
    info['detection_threshold_scores'] = detection_threshold_scores.tolist()

    return scores, y_hat_results, df_common_class, info
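The helpers above (df_anomaly_instances, get_train_valid_sets, min_max_normalization, get_threshold_scores, get_max_score_index_for_score_type, get_detection_meta) belong to the surrounding project and are not shown. Below is a minimal self-contained sketch of the same recipe on synthetic data, with an inline min-max rescale and an F1 sweep standing in for those helpers; data shapes and values are illustrative assumptions, not the project's.

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score

rng = np.random.default_rng(10)
X_train = rng.normal(size=(200, 3))                     # mostly normal points
X_valid = np.vstack([rng.normal(size=(90, 3)),
                     rng.normal(loc=6, size=(10, 3))])  # 10 injected anomalies
y_valid = np.array([0] * 90 + [1] * 10)

model = IsolationForest(n_estimators=40, bootstrap=True,
                        max_samples=X_train.shape[0], random_state=10)
model.fit(X_train)

# decision_function is lower for more anomalous points; rescale into [0, 1]
raw = model.decision_function(X_valid)
scores = (raw - raw.min()) / (raw.max() - raw.min())

# sweep candidate thresholds and keep the one with the best validation F1,
# mirroring get_threshold_scores + get_max_score_index_for_score_type
thresholds = np.round(np.linspace(0, 1, 200), 7)
f1s = [f1_score(y_valid, (scores < t).astype(int), zero_division=0)
       for t in thresholds]
best_t = thresholds[int(np.argmax(f1s))]
print('selected threshold:', best_t)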
Example #2
def cluster_gaussian_mixture(df,
                             df_class,
                             df_train,
                             df_train_class,
                             maximize_score=F1_SCORE,
                             n_components=3,
                             n_init=3,
                             train_size=0.5,
                             random_seed=10):
    """Gaussian-mixture detector: per-sample log-likelihood is the anomaly
    score; samples below the tuned threshold are flagged."""

    df_train_common_class = df_anomaly_instances(df_train_class)
    df_train_with_common_class = df_train.join(df_train_common_class)

    df_common_class = df_anomaly_instances(df_class)

    train, valid = get_train_valid_sets(df_train_with_common_class,
                                        train_size=train_size,
                                        random_seed=random_seed)

    gmm = GaussianMixture(n_components=n_components,
                          n_init=n_init,
                          random_state=random_seed)
    gmm.fit(train.drop('class', axis=1).values)

    thresholds = np.linspace(0, 1, 200)
    thresholds = np.round(thresholds, 7)  # round thresholds

    y_scores = min_max_normalization(
        gmm.score_samples(valid.drop('class', axis=1).values))
    y_scores = np.round(y_scores, 7)  # round scores

    training_threshold_scores = get_threshold_scores(thresholds, y_scores,
                                                     valid['class'])
    selected_index = get_max_score_index_for_score_type(
        training_threshold_scores, maximize_score)
    selected_threshold = thresholds[selected_index]

    # detection on dataset
    scores = min_max_normalization(gmm.score_samples(df.values))
    scores = np.round(scores, 7)  # round scores

    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = df_common_class['class'].values.astype(int)
    detection_threshold_scores = get_threshold_scores(thresholds, scores,
                                                      df_common_class['class'])
    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)

    info['thresholds'] = thresholds.tolist()
    info['training_threshold_scores'] = training_threshold_scores.tolist()
    info['detection_threshold_scores'] = detection_threshold_scores.tolist()

    return scores, y_hat_results, df_common_class, info
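Relative to Example #1, only the scorer changes: GaussianMixture.score_samples returns the per-sample log-likelihood, so poorly explained points get low scores and the same `scores < threshold` rule applies. A minimal sketch of that step, with the threshold sweep elided because it is identical to the sketch above:

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(10)
X_train = rng.normal(size=(300, 2))
X_new = np.vstack([rng.normal(size=(95, 2)),
                   rng.normal(loc=8, size=(5, 2))])  # 5 injected outliers

gmm = GaussianMixture(n_components=3, n_init=3, random_state=10)
gmm.fit(X_train)

# low log-likelihood = poorly explained by the mixture = anomalous
loglik = gmm.score_samples(X_new)
scores = (loglik - loglik.min()) / (loglik.max() - loglik.min())
print('most anomalous rows:', np.argsort(scores)[:5])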
Example #3
def lisa_geo(df, df_class, time_series_id, maximize_score=F1_SCORE):
    """LISA detector with geographic distance-based weights for a single
    time series; the threshold is tuned directly on the labelled data."""

    df_corr_dist = get_df_corr_geo_distance(df)

    df_class_copy = df_class.copy()
    df_class_copy = df_class_copy.rename(columns={time_series_id: 'class'})

    # copy of the data frame with the mean of each row appended as a column
    df_val_mean = df_copy_with_mean(df)

    # LISA Time Series
    df_results = df_lisa_time_series(time_series_id, df_val_mean, df_corr_dist, global_correlation=True)

    thresholds = np.linspace(0, 1, 200)
    thresholds = np.round(thresholds, 7)  # round thresholds

    scores = min_max_normalization(df_results[time_series_id].values)
    scores = np.round(scores, 7)  # round scores

    threshold_scores = get_threshold_scores(thresholds, scores, df_class_copy['class'])
    selected_index = get_max_score_index_for_score_type(threshold_scores, maximize_score)
    selected_threshold = thresholds[selected_index]

    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = df_class_copy['class'].values.astype(int)
    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)

    info['thresholds'] = thresholds.tolist()
    info['detection_threshold_scores'] = threshold_scores.tolist()

    return scores, y_hat_results, info
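get_df_corr_geo_distance and df_lisa_time_series are project code. As a rough illustration of the statistic they presumably compute, here is a textbook local Moran's I (the classic LISA) on a toy set of stations; the inverse-distance weighting is an assumption for the demo, not the project's exact formula.

import numpy as np

def local_morans_i(x, w):
    # standardize values, row-normalize weights, then I_i = z_i * sum_j w_ij z_j
    z = (x - x.mean()) / x.std()
    w = w.copy()
    np.fill_diagonal(w, 0.0)                  # no self-neighbourhood
    w = w / w.sum(axis=1, keepdims=True)
    return z * (w @ z)

# four stations; station 3 is far off the common level
values = np.array([1.0, 1.1, 0.9, 5.0])
dist = np.array([[0., 1., 2., 3.],
                 [1., 0., 1., 2.],
                 [2., 1., 0., 1.],
                 [3., 2., 1., 0.]])
weights = 1.0 / (1.0 + dist)                  # inverse-distance weights
print(local_morans_i(values, weights))        # strongly negative at index 3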
Example #4
def histogram(df, df_class, df_train, df_train_class, maximize_score=F1_SCORE, train_size=0.5, random_seed=10):
    """Histogram-based detector: bin densities learned on the training split
    act as a density estimate; low-scoring points are flagged."""

    df_train_common_class = df_anomaly_instances(df_train_class)
    df_train_with_common_class = df_train.join(df_train_common_class)

    df_common_class = df_anomaly_instances(df_class)

    train, valid = get_train_valid_sets(df_train_with_common_class, train_size=train_size, random_seed=random_seed)

    # use the square root of the number of training instances as the bin count
    num_bins = int(np.sqrt(train.shape[0]))
    logging.debug('Number of bins %d', num_bins)

    # create and train model
    model = hist_model(bins=num_bins)
    model.fit(train.drop('class', axis=1).values)

    thresholds = np.linspace(0, 1, 200)
    thresholds = np.round(thresholds, 7)  # round thresholds

    y_scores = min_max_normalization(model.predict(valid.drop('class', axis=1).values))
    y_scores = np.round(y_scores, 7)  # round scores

    training_threshold_scores = get_threshold_scores(thresholds, y_scores, valid['class'])
    selected_index = get_max_score_index_for_score_type(training_threshold_scores, maximize_score)
    selected_threshold = thresholds[selected_index]

    # detection on dataset
    scores = min_max_normalization(model.predict(df.values))
    scores = np.round(scores, 7)  # round scores

    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = df_common_class['class'].values.astype(int)
    detection_threshold_scores = get_threshold_scores(thresholds, scores, df_common_class['class'])
    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)

    info['thresholds'] = thresholds.tolist()
    info['training_threshold_scores'] = training_threshold_scores.tolist()
    info['detection_threshold_scores'] = detection_threshold_scores.tolist()

    return scores, y_hat_results, df_common_class, info
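hist_model is a project class; its predict presumably returns a likelihood-style score that is low in sparse bins, which is what makes `scores < threshold` flag anomalies. A minimal HBOS-style stand-in under that assumption:

import numpy as np

class HistScorer:
    """Stand-in for hist_model: per-feature histograms as a density
    estimate; points falling in sparse bins receive low scores."""
    def __init__(self, bins):
        self.bins = bins
    def fit(self, X):
        self.hists = [np.histogram(X[:, j], bins=self.bins, density=True)
                      for j in range(X.shape[1])]
        return self
    def predict(self, X):
        score = np.zeros(X.shape[0])
        for dens, edges in self.hists:
            j = self.hists.index((dens, edges))
            idx = np.clip(np.searchsorted(edges, X[:, j]) - 1, 0, len(dens) - 1)
            score += np.log(dens[idx] + 1e-9)   # sparse bin -> low score
        return score

rng = np.random.default_rng(10)
X = rng.normal(size=(400, 2))
num_bins = int(np.sqrt(X.shape[0]))             # same sqrt(n) rule as above
scorer = HistScorer(bins=num_bins).fit(X)
print(scorer.predict(np.array([[0.0, 0.0], [6.0, 6.0]])))  # outlier scores lower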
Example #5
def svm(df, df_class, df_train, df_train_class, maximize_score=F1_SCORE, nu=0.5, kernel='rbf', train_size=0.5, random_seed=10):
    """One-class SVM detector: points outside the learned boundary receive
    low normalized scores and are flagged below the tuned threshold."""

    df_train_common_class = df_anomaly_instances(df_train_class)
    df_train_with_common_class = df_train.join(df_train_common_class)

    df_common_class = df_anomaly_instances(df_class)

    train, valid = get_train_valid_sets(df_train_with_common_class, train_size=train_size, random_seed=random_seed)

    model = OneClassSVM(gamma='scale', nu=nu, kernel=kernel)
    model.fit(train.drop('class', axis=1).values)

    thresholds = np.linspace(0, 1, 200)
    thresholds = np.round(thresholds, 7)  # round thresholds

    y_scores = min_max_normalization(model.decision_function(valid.drop('class', axis=1).values))
    y_scores = np.round(y_scores, 7)  # round scores

    training_threshold_scores = get_threshold_scores(thresholds, y_scores, valid['class'])
    selected_index = get_max_score_index_for_score_type(training_threshold_scores, maximize_score)
    selected_threshold = thresholds[selected_index]

    # detection on dataset
    scores = min_max_normalization(model.decision_function(df.values))
    scores = np.round(scores, 7)  # round scores

    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = df_common_class['class'].values.astype(int)
    detection_threshold_scores = get_threshold_scores(thresholds, scores, df_common_class['class'])
    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)

    info['thresholds'] = thresholds.tolist()
    info['training_threshold_scores'] = training_threshold_scores.tolist()
    info['detection_threshold_scores'] = detection_threshold_scores.tolist()

    return scores, y_hat_results, df_common_class, info
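For OneClassSVM, nu is an upper bound on the fraction of training points treated as outliers, and decision_function is negative outside the learned boundary. A short sketch of that sign convention, which is why `scores < threshold` flags anomalies after the min-max rescale; the query points are illustrative:

import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.default_rng(10)
X_train = rng.normal(size=(200, 2))

model = OneClassSVM(gamma='scale', nu=0.5, kernel='rbf')
model.fit(X_train)

# negative outside the boundary, positive inside; after rescaling into [0, 1]
# the distant point ends up near 0
raw = model.decision_function(np.array([[0.0, 0.0], [5.0, 5.0]]))
scores = (raw - raw.min()) / (raw.max() - raw.min())
print(raw, scores)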
Example #6
def lisa_dtw(df, df_class, time_series_id, maximize_score=F1_SCORE, window_size=10, distance_function=EUCLIDEAN):
    """LISA detector using DTW-based Pearson correlations computed over a
    sliding window for a single time series."""

    df_correlation = dtw_pearson(df, time_series_id, distance_function, window_size=window_size)

    # once the correlations have been computed, the first window_size - 1 rows
    # are dropped from the data frames, since no correlation exists for them
    offset = window_size - 1
    df_correlation = df_correlation.iloc[offset:]
    df = df.iloc[offset:]
    df_class = df_class.iloc[offset:]

    # copy of the data frame with the mean of each row appended as a column
    df_mean = df_copy_with_mean(df)

    df_class_copy = df_class.copy()
    df_class_copy = df_class_copy.rename(columns={time_series_id: 'class'})

    # LISA Time Series
    df_results = df_lisa_time_series(time_series_id, df_mean, df_correlation)

    thresholds = np.linspace(0, 1, 200)
    thresholds = np.round(thresholds, 7)  # round thresholds

    scores = min_max_normalization(df_results[time_series_id].values)
    scores = np.round(scores, 7)  # round scores

    threshold_scores = get_threshold_scores(thresholds, scores, df_class_copy['class'])
    selected_index = get_max_score_index_for_score_type(threshold_scores, maximize_score)
    selected_threshold = thresholds[selected_index]

    y_hat_results = (scores < selected_threshold).astype(int)
    y_truth = df_class_copy['class'].values.astype(int)
    info = get_detection_meta(selected_threshold, y_hat_results, y_truth)

    info['thresholds'] = thresholds.tolist()
    info['detection_threshold_scores'] = threshold_scores.tolist()

    return scores, y_hat_results, info, df, df_class
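dtw_pearson is project code that correlates series under dynamic time warping inside a sliding window. As an illustration of the DTW building block it presumably relies on, here is a textbook dynamic-programming DTW distance; the sliding-window and Pearson parts are omitted.

import numpy as np

def dtw_distance(a, b):
    # classic O(n*m) dynamic program with absolute-difference point cost
    n, m = len(a), len(b)
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = abs(a[i - 1] - b[j - 1])
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[n, m]

# a phase-shifted copy: DTW can warp the peaks into alignment, so its cost
# stays far below the pointwise absolute difference
t = np.linspace(0, 2 * np.pi, 50)
a, b = np.sin(t), np.sin(t + 0.3)
print(dtw_distance(a, b), np.abs(a - b).sum())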
Example #7
def robust_pca_huber_loss(df,
                          df_class,
                          df_train,
                          df_train_class,
                          delta=1,
                          n_components=2,
                          maximize_score=F1_SCORE,
                          train_size=0.5,
                          random_seed=10):
    """Robust-PCA detector: Huber-loss PCA reconstruction error is the
    anomaly score; HIGH scores are anomalous (hence upper_boundary=True)."""

    df_train_common_class = df_anomaly_instances(df_train_class)
    df_train_with_common_class = df_train.join(df_train_common_class)
    df_common_class = df_anomaly_instances(df_class)

    # for supervised detection(!): the stratify parameter splits the data so
    # that each split preserves the class proportions of the array passed to it
    #X_train, X_test, y_train, y_test = train_test_split(df_train, df_train_common_class, train_size=train_size, random_state=random_seed, stratify=df_train_common_class)

    train, valid = get_train_valid_sets(df_train_with_common_class,
                                        train_size=train_size,
                                        random_seed=random_seed)

    X_train = train.drop('class', axis=1)
    X_test = valid.drop('class', axis=1)

    # Dimensionality reduction with Robust PCA and Huber Loss Function
    huber_loss = loss.HuberLoss(delta=delta)
    M_rpca = MRobustPCA(n_components, huber_loss)

    # Fit R-PCA on Train Set
    M_rpca.fit(X_train)

    # R-PCA on Test Set
    X_test_reduced = M_rpca.transform(X_test)
    X_test_reduced = pd.DataFrame(data=X_test_reduced, index=X_test.index)
    X_test_reconstructed = M_rpca.inverse_transform(X_test_reduced)
    X_test_reconstructed = pd.DataFrame(data=X_test_reconstructed,
                                        index=X_test.index)

    y_test_scores = normalized_anomaly_scores(X_test, X_test_reconstructed)
    y_test_scores = np.round(y_test_scores, 7)  # round scores

    # computed scores are always in between 0-1 due to min max normalization
    thresholds = np.linspace(0, 1, 200)
    thresholds = np.round(thresholds, 7)  # round thresholds

    training_threshold_scores = get_threshold_scores(
        thresholds, y_test_scores, valid['class'], upper_boundary=True
    )  # or replace valid['class'] with y_test for supervised detection
    selected_index = get_max_score_index_for_score_type(
        training_threshold_scores, maximize_score)
    selected_threshold = thresholds[selected_index]

    # Run on Dataset
    X_df_reduced = M_rpca.transform(df)
    X_df_reduced = pd.DataFrame(data=X_df_reduced, index=df.index)
    X_df_reconstructed = M_rpca.inverse_transform(X_df_reduced)
    X_df_reconstructed = pd.DataFrame(data=X_df_reconstructed, index=df.index)

    # detection on dataset
    scores = normalized_anomaly_scores(df, X_df_reconstructed)
    scores = np.round(scores, 7)  # round scores

    y_hat_results = (scores > selected_threshold).astype(int)
    y_truth = df_common_class['class'].values.astype(int)
    detection_threshold_scores = get_threshold_scores(thresholds,
                                                      scores,
                                                      df_common_class['class'],
                                                      upper_boundary=True)
    info = get_detection_meta(selected_threshold,
                              y_hat_results,
                              y_truth,
                              upper_boundary=True)

    info['thresholds'] = thresholds.tolist()
    info['training_threshold_scores'] = training_threshold_scores.tolist()
    info['detection_threshold_scores'] = detection_threshold_scores.tolist()

    return scores, y_hat_results, df_common_class, info
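MRobustPCA and HuberLoss come from the project's robust-PCA dependency. Below is a sketch of the same reconstruction-error recipe with ordinary sklearn PCA standing in for the robust variant; the synthetic low-rank data is an assumption of the demo. Note that, unlike the likelihood-style scorers above, HIGH reconstruction error marks anomalies, hence `scores > threshold` and upper_boundary=True. The robust variant matters when corruption is heavy enough to pull the components toward the corrupted rows; plain PCA suffices here only because the corruption is small relative to the signal.

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(10)
W = rng.normal(size=(2, 5))                       # true 2-D latent structure
X = rng.normal(size=(300, 2)) @ W + 0.1 * rng.normal(size=(300, 5))
X[:3] += rng.normal(scale=6.0, size=(3, 5))       # corrupt the first three rows

pca = PCA(n_components=2).fit(X)                  # stand-in for MRobustPCA
X_rec = pca.inverse_transform(pca.transform(X))

# per-row reconstruction error, min-max rescaled into [0, 1]
err = np.linalg.norm(X - X_rec, axis=1)
scores = (err - err.min()) / (err.max() - err.min())
print(np.argsort(scores)[-3:])                    # the corrupted rows score highest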