Example #1
from sklearn.covariance import EllipticEnvelope


def execute_ee(df, **kwargs):
    """
    Run Elliptic Envelope and return inlier/outlier labels.

    Parameters
    ----------
    df : pandas.DataFrame of shape (n_objects, n_features)
        The m_ap30 table.

    Other Parameters
    ----------------
    See
    https://scikit-learn.org/stable/modules/generated
        /sklearn.covariance.EllipticEnvelope.html
    for details.
    support_fraction : float, default 0.9
    contamination : float, default 0.01
    random_state : int or RandomState instance, default 42

    Returns
    -------
    y_pred : numpy.ndarray
        Inlier/outlier labels (1 = inlier, -1 = outlier).
    """

    support_fraction = kwargs.get('support_fraction', 0.9)
    contamination = kwargs.get('contamination', 0.01)
    random_state = kwargs.get('random_state', 42)

    clf = EllipticEnvelope(support_fraction=support_fraction,
                           contamination=contamination,
                           random_state=random_state)
    y_pred = clf.fit_predict(df)
    return y_pred
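A minimal usage sketch for the function above; the random DataFrame is only a stand-in for the m_ap30 table mentioned in the docstring, and all names below are illustrative:

import numpy as np
import pandas as pd

# Hypothetical stand-in for the m_ap30 table (200 objects, 3 features).
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(200, 3)), columns=['f1', 'f2', 'f3'])

y_pred = execute_ee(df, contamination=0.02)  # overrides the 0.01 default
print((y_pred == -1).sum(), 'objects flagged as anomalous')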
Example #2
def ee_outliers(col_name):
    # Relies on module-level df_processed, numpy (np) and
    # sklearn.covariance.EllipticEnvelope being in scope.
    ee = EllipticEnvelope()
    ee_preds = ee.fit_predict(np.array(df_processed[col_name]).reshape(-1, 1))

    ee_preds_class = ["ok" if i == 1 else "ee_outlier" for i in ee_preds]
    df_processed["ee_outlier"] = ee_preds_class
Example #3
    def DetectOutliersUsingEnvelope(self):

        data = self.__df[[self.__x, self.__y]].values
        clf = EllipticEnvelope()
        x_min_value, x_max_value = min(
            self.__df[self.__x].values) - self.__factor, max(
                self.__df[self.__x].values) + self.__factor
        y_min_value, y_max_value = min(
            self.__df[self.__y].values) - self.__factor, max(
                self.__df[self.__y].values) + self.__factor

        xx, yy = np.meshgrid(np.linspace(x_min_value, x_max_value, 500),
                             np.linspace(y_min_value, y_max_value, 500))
        clf.fit(data)
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        pred = clf.predict(data)  # clf is already fitted above; outliers = -1, inliers = 1
        if self.__drop_outliers:  # Drop outliers from the dataset
            # Note: dropping by enumerate position assumes a default RangeIndex.
            for index, outlier in enumerate(pred):
                if outlier == -1:
                    self.__df = self.__df.drop(index, axis=0)
            return self.__df

        else:
            return xx, yy, Z
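Since dropping rows one by one assumes a default RangeIndex, here is a hedged, self-contained sketch of the same filtering done with a single boolean mask (the data below is synthetic):

import numpy as np
import pandas as pd
from sklearn.covariance import EllipticEnvelope

df = pd.DataFrame(np.random.randn(200, 2), columns=['x', 'y'])
pred = EllipticEnvelope().fit_predict(df[['x', 'y']].values)  # 1 = inlier, -1 = outlier
df_clean = df.loc[pred == 1]  # keep inliers in one vectorized, index-safe step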
Example #4
def outlier_detector(df, percentage=0.01):

    model_detection = EllipticEnvelope(
        contamination=percentage,  # expected outlier fraction
        random_state=0)

    predict_x = model_detection.fit_predict(
        df)  # prediction [1 or -1] # -1 are outliers

    outlier = np.where(predict_x == -1, 1,
                       0)  # convert to 1 or 0 # work as boolean

    # Add outlier column in DF
    df.loc[df.index, "outlier"] = outlier
    df["outlier"] = df["outlier"].astype("bool")

    # print summary
    df_check = df.copy()
    df_check["count"] = 1
    df_check = df_check.groupby(["outlier"])["count"].count()
    df_check = df_check.to_frame()
    sum_total = df_check.iloc[:, 0].sum()  # add results in percentage of total
    df_check["%_of_Total"] = df_check.iloc[:, 0].apply(
        lambda x: x / sum_total)  # .astype(str) + '%'
    print(df_check)

    df_no_outlier = df[df["outlier"] == False]
    df_no_outlier = df_no_outlier.iloc[:, :-1]

    print(
        f"\nAmount of Outliers Removed: {len(df.index) - len(df_no_outlier.index)}"
    )

    return df_no_outlier
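A short usage sketch for outlier_detector, assuming a purely numeric DataFrame (EllipticEnvelope does not accept non-numeric columns):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(500, 4), columns=list('abcd'))
df_clean = outlier_detector(df)  # prints the inlier/outlier summary, returns inliers only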
Example #5
import json
import sys

import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import scale


def main():
    '''
    The procedure contains two simple steps:
        - Scale the data to the standard distribution with mean 0 and unit variance.
          This might be too simplistic.
        - Apply the elliptic envelope.  The contamination level is set manually.
    '''
    domains = []
    raw = []

    with open(sys.argv[1]) as fhandle:
        for line in fhandle:
            record = json.loads(line.strip())

            for analyser in record['analysers']:
                if analyser['analyser'] == 'FeaturesGenerator':
                    raw.extend(analyser['output'])

                if analyser['analyser'] == 'WordSegmentation':
                    domains.extend(analyser['output'].keys())

            if len(raw) != len(domains):
                print(record)
                sys.exit(0)

    x_samples = scale(np.array(raw))

    engine = EllipticEnvelope(contamination=0.015, support_fraction=1.0)
    y_samples = engine.fit_predict(x_samples)

    for index, y_sample in enumerate(y_samples):
        if y_sample == -1:
            print(domains[index])
Example #6
def outliers_EllipticEnvelope(df, contamination):
    dataset = df.copy()
    clf = EllipticEnvelope(contamination=contamination)

    df_with_ellip = dataset.join(pd.DataFrame(clf.fit_predict(dataset),
                                              index=dataset.index, columns=['elliptic']), how='left')

    return df_with_ellip.loc[df_with_ellip['elliptic'] != 1].index
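The function returns the index labels of the flagged rows, so a typical follow-up, sketched here with synthetic data, is to drop them:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(300, 2), columns=['a', 'b'])
outlier_idx = outliers_EllipticEnvelope(df, contamination=0.05)
df_clean = df.drop(outlier_idx)  # remove the flagged rows by index label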
Example #7
def elliptic_envelope(df, modelDir, norm_confidence=0.95):
	from sklearn.covariance import EllipticEnvelope
	from scipy.stats import normaltest

	if "ds" in df.columns:
		del df["ds"]
	model = EllipticEnvelope()
	test_stats, p_vals = normaltest(df.values, axis=0)
	normal_cols = p_vals >= (1 - norm_confidence)
	df = df.loc[:, normal_cols]
	if df.shape[1] == 0:
		return None
	df["outlier"] = model.fit_predict(df.values)  # 1 = inlier, -1 = outlier
	df["outlier"] = df["outlier"] < 0  # True marks an outlier
	return df
Example #8
def elliptic_envelope(X_train, y_names):
    cov = EllipticEnvelope(contamination=0.05)
    y_pred = cov.fit_predict(X_train)

    plot(X_train, y_pred, "Covariance")

    outlier_list = []
    for i in range(0, len(y_pred)):
        if y_pred[i] == -1:
            outlier_list.append(i)

    print("Covariance")
    for i in range(0, len(outlier_list)):
        print(outlier_list[i], y_names[outlier_list[i]])
    print("Number of outliers:", len(outlier_list))
Example #9
def elliptic_envelope(df, modelDir, norm_confidence=0.95):
    from sklearn.covariance import EllipticEnvelope
    from scipy.stats import normaltest

    if "ds" in df.columns:
        del df["ds"]
    model = EllipticEnvelope()
    test_stats, p_vals = normaltest(df.values, axis=0)
    normal_cols = p_vals >= (1 - norm_confidence)
    df = df.loc[:, normal_cols]
    if df.shape[1] == 0:
        return None
    df["outlier"] = model.fit_predict(df.values)  # 1 = inlier, -1 = outlier
    df["outlier"] = df["outlier"] < 0  # True marks an outlier
    return df
Example #10
def detect_anomoly():
    # Renamed the local so it does not shadow the imported firebase module.
    fb = firebase.FirebaseApplication(
        'https://hackmit-df9ea.firebaseio.com', None)
    result = fb.get('/Test/Double', None)
    items = result.items()  # unused in the rest of the function
    result = np.array(list(result.values())[-60:]).reshape(-1, 1)

    # normalize result
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    result2 = scaler.fit_transform(result)
    from sklearn.covariance import EllipticEnvelope
    clf = EllipticEnvelope(contamination=.3)
    y_Pred = clf.fit_predict(result2)

    an_outlier = len(list(filter(lambda x: (x < 0), y_Pred))) > 0
    if clf.precision_ > 100 and an_outlier:
        return True
    return False
Example #11
def filter_outliers(points,
                    n_estimators=100,
                    contamination=0.05,
                    type='isolation_forest'):
    if type == 'elliptic':
        clf = EllipticEnvelope(contamination=contamination)
    elif type == 'isolation_forest':
        clf = IsolationForest(n_estimators=n_estimators,
                              contamination=contamination)
    else:
        raise ValueError("Unknown outlier filter type")
    y = clf.fit_predict(points.numpy())
    y = torch.tensor(y)
    num_valid = (y > 0).sum()
    num_filtered = (y <= 0).sum()
    logger.info('filtered points',
                num_filtered=num_filtered.item(),
                num_valid=num_valid.item())
    return y > 0
Example #12
def get_outlyingness(data, contamination=0.1):
    """ Outlier detection from covariance estimation in a Gaussian distributed dataset.
    
    :param data: Data in which to detect outliers. Take care that n_samples > n_features ** 2.
    :type data: pandas.DataFrame
    :param contamination: The amount of contamination of the data set, i.e. the proportion of outliers in the data set.
    Range is (0, 0.5).
    :type contamination: float
    :returns: Decision on each row if it's an outlier. And contour array for drawing ellipse in graph.
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    """
    robust_cov = EllipticEnvelope(support_fraction=1., contamination=contamination)
    outlyingness = robust_cov.fit_predict(data)
    decision = (outlyingness-1).astype(bool)
    
    # Visualisation.
    xx, yy = np.meshgrid(np.linspace(0, 100, 101),
                         np.linspace(0, 100, 101))
    z = robust_cov.predict(np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)
    
    return decision, z
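A sketch of how the returned arrays can be plotted; it assumes, as the hard-coded meshgrid does, that both features live roughly in the 0-100 range:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.DataFrame(np.random.uniform(20, 80, size=(200, 2)), columns=['u', 'v'])
decision, z = get_outlyingness(data, contamination=0.1)

xx, yy = np.meshgrid(np.linspace(0, 100, 101), np.linspace(0, 100, 101))
plt.contour(xx, yy, z, levels=[0], linewidths=2, colors='black')  # envelope boundary
plt.scatter(data['u'], data['v'], c=decision, cmap='coolwarm', s=10)
plt.show()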
Example #13
LOF_G1_Bar = bar_cluster(df=df_X_train_std,
                         x='Cluster_LOF_Labels',
                         code='Cluster_LOF_Labels',
                         title='Cluster Distribution')

LOF_G2_Bar = bar_cluster(df=df_X_train_std,
                         x='Cluster_LOF_Labels',
                         code='Cluster_LOF_Labels',
                         title='Cluster Distribution')

LOF_G3_Bar = bar_cluster(df=df_X_train_std,
                         x='Cluster_LOF_Labels',
                         code='Cluster_LOF_Labels',
                         title='Cluster Distribution')

## EE
EE = EllipticEnvelope(random_state=10, contamination=0.0058)
EE_labels = EE.fit_predict(X_train_std)  # fit once; 1 = inlier, -1 = outlier

## 3D Plot of Training Data
# Create and modify dataframe for the cluster column
df_X_train_std = pd.DataFrame(X_train_std)
df_X_train_std['Cluster_EE'] = pd.Series(EE_labels, index=df_X_train_std.index)

# Rename Cluster label names from EE
cluster_label_names = {1: "Human", -1: "Hacker"}
df_X_train_std['Cluster_EE_Labels'] = df_X_train_std['Cluster_EE'].map(
    cluster_label_names)

df_X_train_std.columns = [
    'Kill Death Ratio', "Headshot Kill Ratio", 'Win Ratio', "Top 10 Ratio",
    'Cluster_EE', 'Cluster_EE_Labels'
]
Example #14
    def establish_model(self):
        ## Update the button state
        self.new_model.setEnabled(False)
        self.new_model.setText('训练中')  # "Training..."

        conn = pymysql.connect(host='localhost',
                               port=3306,
                               user='******',
                               password='',
                               database='resistance')
        cs1 = conn.cursor()
        all_character = pd.DataFrame()
        cs1.execute('use resistance')
        print(self.model_data_num.currentText().split(':'))
        cs1.execute("select * from {}_{} order by id desc limit {}".format(
            datetime.datetime.now().year,
            datetime.datetime.now().month,
            int(self.model_data_num.currentText().split(':')[1])))

        columns_name = cs1.fetchall()
        cs1.close()
        character_data = pd.DataFrame(np.array(columns_name)).iloc[:, -12:]
        print(character_data.shape)
        now_time = datetime.datetime.now()
        # Build the timestamped model directory name once and reuse it below.
        self.new_model_dir = '{}-{}-{} {}_{}_{}'.format(
            now_time.year, now_time.month, now_time.day,
            now_time.hour, now_time.minute, now_time.second)
        os.makedirs(os.getcwd() + '\\model\\{}'.format(self.new_model_dir))
        ## Predict with LOF
        lof = LocalOutlierFactor(
            novelty=True, n_neighbors=10,
            contamination=0.028)  # 0.028 = assumed fraction of anomalous points
        # error_lof_index = data_ana[clf.fit_predict(data_ana) == -1].index  # indexes of points LOF flags as anomalous
        lof_result = lof.fit(character_data)
        with open(os.getcwd() + '\\model\\{}\\lof.model'.format(
                self.new_model_dir), 'wb') as f:
            pickle.dump(lof, f)

        ## Predict with Isolation Forest

        ilf = IsolationForest(max_samples=100,
                              random_state=42,
                              n_estimators=200,
                              contamination=0.028)  # instantiate the model

        ilf_result = ilf.fit_predict(character_data)
        with open(os.getcwd() + '\\model\\{}\\ilf.model'.format(
                self.new_model_dir), 'wb') as f:
            pickle.dump(ilf, f)

        ## Predict with EllipticEnvelope
        elf = EllipticEnvelope(support_fraction=1, contamination=0.028)
        elf_result = elf.fit_predict(character_data)
        with open(os.getcwd() + '\\model\\{}\\elf.model'.format(
                self.new_model_dir), 'wb') as f:
            pickle.dump(elf, f)
        self.new_model_msg.emit('ok')
        ## Restore the button state
        self.new_model.setEnabled(True)
        self.new_model.setText('开始训练')  # "Start training"
        self.st.stop_thread(self.sever_establish_model_th)
Example #15
def plot_raw_overview(filename):
    event_type = 'all'

    if filename.name.startswith('sub-drouwen'):
        CHANS = [f'IH0{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-itens'):
        CHANS = [f'C0{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-lemmer'):
        CHANS = [f'IH{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-som705'):
        CHANS = [f'GA0{x + 1}' for x in range(8)]  # a bit random
    elif filename.name.startswith('sub-ommen'):
        CHANS = ['chan1',
                 'chan2']  # I don't understand why I cannot use 'chan64'
    elif filename.name.startswith('sub-vledder') or filename.name.startswith(
            'sub-ommen'):
        CHANS = ['chan1', 'chan64']
    elif '_acq-blackrock_' in filename.name:
        CHANS = ['chan1', 'chan128']
    else:
        print('you need to specify reference channel for this test')
        return None, None

    d = Dataset(filename, bids=True)
    event_names, event_onsets = select_events(d, event_type)

    is_ecog = d.dataset.task.channels.tsv['type'] == 'ECOG'
    is_seeg = d.dataset.task.channels.tsv['type'] == 'SEEG'
    chans = array(d.header['chan_name'])[is_ecog | is_seeg]
    data = d.read_data(begtime=event_onsets[0],
                       endtime=event_onsets[-1],
                       chan=list(chans))
    data.data[0][isnan(data.data[0])] = 0  # ignore nan

    data = montage(data, ref_chan=CHANS)
    freq = frequency(data, taper='hann', duration=2, overlap=0.5)

    hist = make_histogram(data, max=250, step=10)
    divs = []
    fig = plot_hist(hist)
    divs.append(to_div(fig))

    bad_chans = None

    if AUTOMATIC:
        from sklearn.covariance import EllipticEnvelope

        algorithm = EllipticEnvelope(
            contamination=P['data_quality']['histogram']['contamination'])
        prediction = algorithm.fit(hist.data[0]).predict(hist.data[0])
        new_bad_chans = data.chan[0][prediction == -1]
        print('bad channels with histogram / elliptic envelope: ' +
              ', '.join(new_bad_chans))
        bad_chans = set(new_bad_chans)

        fig = plot_outliers(hist.chan[0],
                            algorithm.dist_,
                            prediction,
                            yaxis_title='distance',
                            yaxis_type='log')
        divs.append(to_div(fig))

    fig = plot_freq(freq)
    divs.append(to_div(fig))

    if AUTOMATIC:
        from sklearn.neighbors import LocalOutlierFactor

        algorithm = LocalOutlierFactor(
            n_neighbors=P['data_quality']['spectrum']['n_neighbors'])
        prediction = algorithm.fit_predict(freq.data[0])

        new_bad_chans = data.chan[0][prediction == -1]
        print('bad channels with spectrum / local outlier factor: ' +
              ', '.join(new_bad_chans))
        bad_chans |= set(new_bad_chans)
        fig = plot_outliers(freq.chan[0],
                            algorithm.negative_outlier_factor_,
                            prediction,
                            yaxis_title='distance',
                            yaxis_type='linear')
        divs.append(to_div(fig))

        # we use again the reference channel. Ref channel was handpicked but it might have a weird spectrum
        bad_chans -= set(CHANS)

    return bad_chans, divs
Example #16
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.decomposition import PCA

data = loadmat('ex8data2.mat')

X = data['X']

e1 = EllipticEnvelope()
labels1 = e1.fit_predict(X)

e2 = LocalOutlierFactor()
labels2 = e2.fit_predict(X)

n_components = 3

pca1 = PCA(n_components=n_components)
Xproj = pca1.fit_transform(X)

plt.figure()
plt.clf()
ax = plt.axes(projection='3d')

# ax.scatter(image_array[:, 0], image_array[:, 1], image_array[:, 2], c=labels, cmap='coolwarm', marker=',')

ax.scatter(Xproj[:, 0], Xproj[:, 1], Xproj[:, 2], marker='o', c=labels1)
Example #17
SVM = svm.OneClassSVM(gamma='auto')

detected_results_SVM = SVM.fit_predict(featureData)

outliers_SVM = []
for i in range(len(detected_results_SVM)):
    if detected_results_SVM[i] < 0:
        outliers_SVM.append(i)

SVM_data = np.delete(featureData, (outliers_SVM), axis=0)
output_SVM = np.delete(outputLabels, (outliers_SVM), axis=0)

# Elliptic envelope
EE = EllipticEnvelope()

detected_results_EE = EE.fit_predict(featureData)

outliers_EE = []
for i in range(len(detected_results_EE)):
    if detected_results_EE[i] < 0:
        outliers_EE.append(i)

EE_data = np.delete(featureData, (outliers_EE), axis=0)
output_EE = np.delete(outputLabels, (outliers_EE), axis=0)

# Compare the outlier results
accuracy_scores = {}

model0 = svm.SVC()
# IS_data / output_IS come from an IsolationForest filtering step earlier in
# the original script, analogous to SVM_data / EE_data above.
model0.fit(IS_data[0:int(len(IS_data) / 2)],
           output_IS[0:int(len(IS_data) / 2)])
Example #18
def elliptEnvMethod(data, uniqueTrains, **kwargs):

    elp = EllipticEnvelope(support_fraction=1)
    elp.fit_predict(data)

    # Squared Mahalanobis distances of the points of data
    # Note this is the same as using the "elp.dist_" parameter
    m_d = elp.mahalanobis(data)

    # Get the regular Mahalanobis distances
    elp_d = np.sqrt(m_d)

    # IMPLEMENT THE AUTOMATED CUT-OFF
    SCORE_INCREASE_RATIO = 1.3

    sortD = np.sort(elp_d)
    sortD = sortD[math.floor(
        len(sortD) / 2):]  # Get the end half of the sorted list of scores

    ratioD = np.array([sortD[i] / sortD[i - 1] for i in range(1, len(sortD))])

    # print(f'\nSorted distances: {sortD}\n\n Ratios: {ratioD}')

    ind = np.where(ratioD > SCORE_INCREASE_RATIO)

    if len(ind[0]) >= 1:
        ind = ind[0][0] + 1
        SIGMA = sortD[
            ind]  # Get the score which increases by the score_ratio compared to the previous score
    else:
        SIGMA = 100.0  # use an arbitrary high score as there are no big score jumps

    SIGMA = max(SIGMA, 4.0)  # Prevent SIGMA from being too low

    # Segment
    labels = (elp_d >= SIGMA).astype(int)

    # labelOLD = (elp_d > 4.0).astype(int)

    if False:  # debug output
        print('.dist_ = {}\t .mahalanobis(data) = {}'.format(elp.dist_, m_d))
        print("Sigma labels: {}".format(labels))
        print("\nCovariance = {}".format(elp.covariance_))

    if False:
        labelColours = 'white'

        fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
        colours = np.array(['blue', 'red'])

        ax[0].set_title('Elliptical envelope - Mahalanobis distance',
                        color='white')
        ax[0].plot(elp_d, 'bo', alpha=0.4)  #, color=colours[labels])

        ax[1].scatter(data[:, 0],
                      data[:, 1],
                      s=20,
                      color=colours[labels],
                      alpha=0.4)
        ax[1].set_title(
            'Elliptical envelope (adjusted cutoff={})'.format(SIGMA),
            color='white')

        # ax[2].scatter(data[:, 0], data[:, 1], s=20, color=colours[labelOLD], alpha=0.4)
        # ax[2].set_title('Elliptical envelope (adjusted cutoff={})'.format(4.0), color='white')

        for i, a in enumerate(ax.flat):
            ax[i].set_xlabel('Mean normalised')  #, fontsize = 15.0)
            ax[i].set_ylabel('Standard deviation normalised')

        # fig1 = plt.figure(figsize=(5,5))

        plt.show()

    anomalies = [
        train for ind, train in enumerate(uniqueTrains) if labels[ind] == 1
    ]

    anomalyDates = []
    if kwargs.get('dates', None) is not None:
        dates = kwargs.get('dates')

        anomalyDates = [dates[i] for i, val in enumerate(labels) if val == 1]

    return anomalies, labels, anomalyDates
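A quick standalone check of the comment above that `elp.dist_` matches `elp.mahalanobis(data)` after fitting (a sketch on synthetic data):

import numpy as np
from sklearn.covariance import EllipticEnvelope

data = np.random.randn(200, 2)
elp = EllipticEnvelope(support_fraction=1).fit(data)
# dist_ holds the squared Mahalanobis distances of the training points,
# computed with the same robust location/covariance that mahalanobis() uses.
assert np.allclose(elp.dist_, elp.mahalanobis(data))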
Example #19
    def findAnomalies(self, saveChart=False, saveEvaluation=False):
        outliers_fraction = 0.15
        clf = EllipticEnvelope(contamination=outliers_fraction)
        predicted_outlier = []
        list_of_df = self.dataCollector.getWithAnomaly()
        for df in list_of_df:
            if df.shape[0] > 0:
                data = df.drop(['anomaly', 'changepoint'], axis=1)
                self.st_tr_time.append(datetime.datetime.now().timestamp())
                prediction = pd.Series(clf.fit_predict(data) * -1, index=df.index) \
                    .rolling(5) \
                    .median() \
                    .fillna(0).replace(-1, 0)
                self.en_tr_time.append(datetime.datetime.now().timestamp())
                # predicted outliers saving
                predicted_outlier.append(prediction)
                df['rocov_anomaly'] = prediction
        true_outlier = [df.anomaly for df in list_of_df]
        if saveChart:
            for i in range(len(predicted_outlier)):
                plt.figure()

                plt.rcParams["font.family"] = "Times New Roman"
                csfont = {'fontname': 'Times New Roman'}
                plt.xlabel('Time', **csfont)
                plt.ylabel('Value', **csfont)
                plt.title('Robust covariance On File [{}]'.format(i + 1),
                          **csfont)

                predicted_outlier[i].plot(figsize=(12, 6),
                                          label='predictions',
                                          marker='o',
                                          markersize=5)
                true_outlier[i].plot(marker='o', markersize=2)

                # data = list_of_df[i]
                # plt.scatter(x=data[data['rocov_anomaly'] == data['anomaly']].index,
                #             y=data[data['rocov_anomaly'] == data['anomaly']]['anomaly'], label='True Prediction'
                #             , c='g', zorder=4)
                # plt.scatter(x=data[data['rocov_anomaly'] != data['anomaly']].index,
                #             y=data[data['rocov_anomaly'] != data['anomaly']]['anomaly'], label='False Prediction'
                #             , c='r', zorder=5)
                plt.legend(loc='upper right')
                plt.savefig(self.path_to_plt +
                            'anom/rocov-pre-{}.png'.format(i + 1),
                            format='png')
                print('Chart {} is Generated'.format(i + 1))
                plt.clf()
                plt.close('all')
        if saveChart:
            ts = 1
            for df in list_of_df:
                data = df.drop(['anomaly', 'changepoint'], axis=1)
                pc = PCA(n_components=2).fit_transform(data)
                df[['X', 'Y']] = pc
                plt.figure()
                sb.set(font='Times New Roman')
                sns = sb.scatterplot(data=df,
                                     x='X',
                                     y='Y',
                                     hue='rocov_anomaly',
                                     palette='bright')
                sns.set_title(
                    'Anomalies Detected by Robust Covariance, File {}'.format(ts))
                sns.figure.savefig(self.path_to_plt +
                                   'chart/chart-{}.png'.format(ts))
                plt.close('all')
                print('The Chart of  File {} is Generated.'.format(ts))
                ts += 1
        if saveEvaluation:
            evaluator = Evaluator(true_outlier,
                                  predicted_outlier,
                                  metric='binary',
                                  numenta_time='30 sec')
            metrics = evaluator.getConfusionMetrics()
            TP = metrics['TP']
            TN = metrics['TN']
            FP = metrics['FP']
            FN = metrics['FN']
            print('\n-----------------------------------------------------')
            print('Robust covariance Outputs: ')
            print(f'\t False Alarm Rate: {round(FP / (FP + TN) * 100, 2)} %')
            print(f'\t Missing Alarm Rate: {round(FN / (FN + TP) * 100, 2)} %')
            print(
                f'\t Accuracy Rate: {round((TP + TN) / (TP + TN + FN + FP) * 100, 2)} %'
            )

            trainTime = np.array(self.en_tr_time).sum() - np.array(
                self.st_tr_time).sum()
            print(f'\t Train Time: {round(trainTime, 2)}s')

            data = {
                'far': round(FP / (FP + TN) * 100, 2),
                'mar': round(FN / (FN + TP) * 100, 2),
                'acc': round((TP + TN) / (TP + TN + FN + FP) * 100, 2),
                'tr': trainTime,
                'te': 0,
                'tp': TP,
                'tn': TN,
                'fp': FP,
                'fn': FN
            }
            output = OutputWriter(self.path_to_plt, 'RobustCov', data)
            output.write()
Example #20
File: anomaly.py, Project: pranatalif/chat
model4 = LocalOutlierFactor(n_neighbors=200, algorithm="brute",
                            leaf_size=200, contamination=0.1)  # fix
# model5 = DBSCAN()
model6 = EllipticEnvelope(
    contamination=0.10, random_state=100, support_fraction=0.1)  # fix

# model fitting outlier detection
# model2 / model3 and the df_X_train / df_X_test frames are defined earlier
# in the original file.
# print("====== OUTLIER DETECTION =======")
X_train_pred2, X_test_pred2 = model2.fit_predict(
    df_X_train), model2.fit_predict(df_X_test)
X_train_pred3, X_test_pred3 = model3.fit_predict(
    df_X_train), model3.fit_predict(df_X_test)
X_train_pred4, X_test_pred4 = model4.fit_predict(
    df_X_train), model4.fit_predict(df_X_test)
# y_pred5 = model5.fit_predict(df)
X_train_pred6, X_test_pred6 = model6.fit_predict(
    df_X_train), model6.fit_predict(df_X_test)


# print("====== NOVELTY DETECTION =======")
# model2.fit(df_X_train), model2.fit(df_X_test)
# novelty_X_train_pred2, novelty_X_test_pred2 = model2.predict(
#     df_X_train), model2.predict(df_X_test)
model2.fit(df_X_train)
novelty_X_train_pred2 = model2.predict(df_X_test)


# print("X_train_pred2 bbbbb: ", X_train_pred2)
# print("novelty_X_train_pred2 bbbbb: ", novelty_X_train_pred2)

# with np.printoptions(threshold=np.inf):
# print("X_train_pred2 shape: ", X_train_pred2.size)
Example #21
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import mean_absolute_error
# load the dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = read_csv(url, header=None)
# retrieve the array
data = df.values
# split into input and output elements
X, y = data[:, :-1], data[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

Example #22
def getRobustCovairance(_df):
  clf = EllipticEnvelope(contamination=OUTLIER_FRACTION)
  return clf.fit_predict(_df)
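OUTLIER_FRACTION is a module-level constant in the original project; a self-contained usage sketch with an assumed value:

import numpy as np
import pandas as pd

OUTLIER_FRACTION = 0.05  # assumed here; the original constant is defined elsewhere

df = pd.DataFrame(np.random.randn(400, 3))
labels = getRobustCovairance(df)  # 1 = inlier, -1 = outlier
df_clean = df[labels == 1]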
Example #23
# `data`, `data_processed`, and `IQR` are defined earlier in the original notebook.
_temp = np.ceil(data['Fare'].quantile(0.75) + (IQR * 3))
data_processed.loc[data_processed.Fare > _temp, 'Fare'] = _temp

X_train_processed, X_test_processed, y_train_processed, y_test_processed = train_test_split(
    data_processed[['Age', 'Fare']].fillna(0),
    data_processed['Survived'],
    test_size=0.2)

from sklearn.covariance import EllipticEnvelope

df_outliers = data.copy()
df_outliers = df_outliers.fillna(0)

column_name = 'Fare'
obj = EllipticEnvelope()
_temp = obj.fit_predict(df_outliers[[column_name]])
print(np.unique(_temp, return_counts=True))
central = df_outliers[_temp==1][column_name].mean()
max_val = df_outliers[_temp==1][column_name].max()
min_val = df_outliers[_temp==1][column_name].min()
df_outliers.loc[_temp==-1,[column_name]] = df_outliers.loc[_temp==-1,[column_name]].apply(lambda x: [max_val if y > central else y for y in x])
df_outliers.loc[_temp==-1,[column_name]] = df_outliers.loc[_temp==-1,[column_name]].apply(lambda x: [min_val if y < central else y for y in x])
print(data.shape)
print(df_outliers.shape)

column_name = 'Age'
obj = EllipticEnvelope()
_temp = obj.fit_predict(df_outliers[[column_name]])
print(np.unique(_temp, return_counts=True))
central = df_outliers[_temp==1][column_name].mean()
max_val = df_outliers[_temp==1][column_name].max()
Example #24
def minimum_covariance_determinant(X_train):
    # identify outliers in the training dataset
    ee = EllipticEnvelope(contamination=0.01)
    yhat = ee.fit_predict(X_train)
    return yhat
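And a brief usage sketch: the returned labels are 1/-1, so inliers can be kept with a mask (synthetic data, illustrative only):

import numpy as np

X_train = np.random.randn(500, 4)
yhat = minimum_covariance_determinant(X_train)
X_train_clean = X_train[yhat != -1]  # keep only the rows judged inliers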
Example #25
def get_gauss(db: pd.DataFrame) -> np.ndarray:
    ee = EllipticEnvelope(contamination=0.01)
    yhat_gaus = ee.fit_predict(db)
    return yhat_gaus == -1
Example #26
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

data = loadmat('ex8data1.mat')

X = data['X']

plt.figure()

estimator = EllipticEnvelope(contamination=.015)

labels = estimator.fit_predict(X)

e1 = IsolationForest()
labels1 = e1.fit_predict(X)

xx, yy = np.meshgrid(np.linspace(min(X[:, 0]), max(X[:, 0]), 150),
                     np.linspace(min(X[:, 1]), max(X[:, 1]), 150))

# Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
# Z = Z.reshape(xx.shape)
# plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black', alpha=0.5)
#
# Z1 = e1.predict(np.c_[xx.ravel(), yy.ravel()])
# Z1 = Z1.reshape(xx.shape)
# plt.contour(xx, yy, Z1, levels=[0], linewidths=2, colors='red', alpha=0.2)