def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = EllipticEnvelope(contamination=0.2).fit(X_train)
    clf2 = EllipticEnvelope().fit(X_train)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf1.decision_function([[2., 2.]]) + clf1.offset_)
    assert_array_equal(clf2.score_samples([[2., 2.]]),
                       clf2.decision_function([[2., 2.]]) + clf2.offset_)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf2.score_samples([[2., 2.]]))
def test_outlier_detection():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    decision = clf.decision_function(X, raw_values=True)
    decision_transformed = clf.decision_function(X, raw_values=False)

    assert_array_almost_equal(decision, clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.0)
    assert sum(y_pred == -1) == sum(decision_transformed < 0)
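Most of the snippets collected below share one recipe: fit EllipticEnvelope, score every point with decision_function, and flag the lowest-scoring fraction via a percentile cut. A minimal self-contained sketch of that recipe, on made-up synthetic data:

import numpy as np
from scipy import stats
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(42)
# 95 Gaussian inliers plus 5 scattered outliers (illustrative data)
X = np.r_[rng.randn(95, 2), rng.uniform(low=-6, high=6, size=(5, 2))]

outliers_fraction = 0.1
clf = EllipticEnvelope(contamination=outliers_fraction)
clf.fit(X)

scores = clf.decision_function(X).ravel()
threshold = stats.scoreatpercentile(scores, 100 * outliers_fraction)
inlier_mask = scores > threshold  # False marks the flagged outliers
print(inlier_mask.sum(), "inliers of", len(X))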
Example #3
    def filter_remove_outlayers(self, flat, minimum_value=0):
        """
        Remove outliers using the elliptic envelope from scikit-learn
        :param flat: 2D array, NaNs mark invalid cells
        :param minimum_value: decision-function threshold; cells scoring below it are dropped
        :return: flat with outlier cells set to NaN
        """
        from sklearn.covariance import EllipticEnvelope
        flat0 = flat.copy()
        flat0[np.isnan(flat)] = 0
        x,y = np.nonzero(flat0)
        # print np.prod(flat.shape)
        # print len(y)

        z = flat[(x,y)]

        data = np.asarray([x,y,z]).T

        clf = EllipticEnvelope(contamination=.1)
        clf.fit(data)
        y_pred = clf.decision_function(data)


        out_inds = y_pred < minimum_value
        flat[(x[out_inds], y[out_inds])] = np.nan
        return flat
def outlier_removal2(features, samples, cv_predict):

    outliers_fraction = 0.1

    print(cv_predict.shape)
    print(samples.shape)
    test = np.column_stack((cv_predict, samples))
    clf = EllipticEnvelope(contamination=.1)
    #clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
    #                                 kernel="rbf", gamma=0.1)
    clf.fit(test)
    y_pred = clf.decision_function(test).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)

    y_pred_new = y_pred > threshold
    print(y_pred_new)
    #print(samples[y_pred_new])
    print(samples.shape)
    print(samples[y_pred_new].shape)
    print(features.shape)
    print(features[y_pred_new].shape)

    return features[y_pred_new], samples[y_pred_new]
    def clean_series(self, token, discard=5):

        """
        Remove outliers from the ratio series for a token.

        Args:
            discard (int): Drop the most outlying X% of the data.

        Returns: OrderedDict{year: wpm}
        """

        series = self.ratios[token]

        X = np.array(list(series.values()))[:, np.newaxis]

        env = EllipticEnvelope()
        env.fit(X)

        # Score each data point.
        y_pred = env.decision_function(X).ravel()

        # Get the discard threshold.
        threshold = stats.scoreatpercentile(y_pred, discard)

        return OrderedDict([
            (year, ratio)
            for (year, ratio), pred in zip(series.items(), y_pred)
            if pred > threshold
        ])
Example #6
def find_outlier_test_homes(df,all_homes,  appliance, outlier_features, outliers_fraction=0.1):
    from scipy import stats

    from sklearn import svm
    from sklearn.covariance import EllipticEnvelope
    clf = EllipticEnvelope(contamination=.1)
    # drop trailing features one at a time if the fit fails (e.g. singular covariance)
    try:
        X = df.loc[all_homes[appliance]][outlier_features].values
        clf.fit(X)
    except Exception:
        try:
            X = df.loc[all_homes[appliance]][outlier_features[:-1]].values
            clf.fit(X)
        except Exception:
            try:
                X = df.loc[all_homes[appliance]][outlier_features[:-2]].values
                clf.fit(X)
            except Exception:
                print("outlier cannot be found")
                return df.loc[all_homes[appliance]].index.tolist()

    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    return df.loc[all_homes[appliance]][~y_pred].index.tolist()
Example #7
def filterOut(x):
    x = np.array(x)
    outliers_fraction=0.05
    #clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,  kernel="rbf", gamma=0.1) 
    clf = EllipticEnvelope(contamination=outliers_fraction)
    clf.fit(x)
    y_pred = clf.decision_function(x).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    return y_pred
def test_outlier_detection():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(X)
    y_pred = clf.predict(X)

    assert_array_almost_equal(clf.decision_function(X, raw_mahalanobis=True),
                              clf.mahalanobis(X - clf.location_))
    assert_almost_equal(clf.score(X, np.ones(100)), (100 - y_pred[y_pred == -1].size) / 100.0)
Example #9
    def module4(self):
        '''
            Detect outliers in the input one-dimensional array using anomaly detection
        '''

        # get data
        img = cv2.imread('../saliency_detection/image/pearl.png')
        b,g,r = cv2.split(img) 
        B,G,R = map(lambda x,y,z: x*1. - (y*1. + z*1.)/2., [b,g,r],[r,r,g],[g,b,b])

        Y = (r*1. + g*1.)/2. - np.abs(r*1. - g*1.)/2. - b*1.
        # clamp negative parts to zero
        R[R<0] = 0
        G[G<0] = 0
        B[B<0] = 0
        Y[Y<0] = 0
        rg = cv2.absdiff(R,G)
        by = cv2.absdiff(B,Y)
        img1 = rg
        img2 = by

        rg, by = map(lambda x:x.reshape((len(b[0])*len(b[:,0]),1)),[rg,by])
        data = np.hstack((rg,by))
        data = data.astype(np.float64)
        data = np.delete(data, range( 0,len(data[:,0]),2),0)

        # grid
        xx1, yy1 = np.meshgrid(np.linspace(-10, 300, 500), np.linspace(-10, 300, 500))
        
        # fit to find the decision boundary (larger contamination shrinks the ellipse)
        clf = EllipticEnvelope(support_fraction=1, contamination=0.01)
        print('data.shape =>', data.shape)
        print('learning...')
        clf.fit(data)  # fit; zeros in the data may cause problems
        print('complete learning!')

        # score the grid with the fitted classifier and draw the decision ellipse
        z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
        z1 = z1.reshape(xx1.shape)
        plt.contour(xx1,yy1,z1,levels=[0],linewidths=2,colors='r')

        # plot
        plt.scatter(data[:,0],data[:,1],color= 'black')
        plt.title("Outlier detection")
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.pause(.001)
        # plt.show()
        
        cv2.imshow('rg',img1/np.amax(img1))
        cv2.imshow('by',img2/np.amax(img2))
Example #10
def test_outlier_detection():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(X)
    y_pred = clf.predict(X)

    assert_array_almost_equal(clf.decision_function(X, raw_mahalanobis=True),
                              clf.mahalanobis(X - clf.location_))
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
def test_elliptic_envelope():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    assert_array_almost_equal(scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
    assert (sum(y_pred == -1) == sum(decisions < 0))
Example #12
def labelValidSkeletons(skel_file, valid_index, trajectories_data, fit_contamination = 0.05):
    #calculate valid widths if they were not used
    calculate_widths(skel_file)
    
    #calculate classifier for the outliers    
    X4fit = nodes2Array(skel_file, valid_index)        
    clf = EllipticEnvelope(contamination = fit_contamination)
    clf.fit(X4fit)
    
    #calculate outliers using the fitted classifier
    X = nodes2Array(skel_file) #use all the indexes
    y_pred = clf.decision_function(X).ravel() #less than zero would be an outlier

    #label rows of valid individual skeletons as GOOD_SKE
    trajectories_data['auto_label'] = ((y_pred > 0).astype(int)) * wlab['GOOD_SKE'] #+ wlab['BAD']*np.isnan(y_prev)
    saveLabelData(skel_file, trajectories_data)
Example #13
    def __init__(self, M, samples, filtermode, threshold, projdir, seed):

        # outlier detection
        clf = LocalOutlierFactor(n_neighbors=20, contamination=threshold)
        y_pred = clf.fit_predict(M)

        cee = EllipticEnvelope(contamination=threshold, random_state=seed)
        cee.fit(M)
        scores_pred = cee.decision_function(M)
        y_pred2 = cee.predict(M)

        cif = IsolationForest(contamination=threshold, random_state=seed)
        cif.fit(M)
        scores_pred = cif.decision_function(M)
        y_pred3 = cif.predict(M)

        outlier_methods = ["lof", "ee", "if"]
        ol_df = DataFrame(np.column_stack((y_pred, y_pred2, y_pred3)),
                          index=samples[0].tolist(),
                          columns=outlier_methods)

        keep_samples, drop_samples, drop_indices = ([] for i in range(3))

        omnibus_methods = ["any", "any2", "all"]
        if filtermode in omnibus_methods:
            dft = ol_df.sum(axis=1)
            dft = DataFrame(dft)
            if filtermode == "any":
                drop_samples = dft[dft[0] != 3].index.values.tolist()
                keep_samples = dft[dft[0] == 3].index.values.tolist()
            elif filtermode == "any2":
                drop_samples = dft[dft[0] <= -1].index.values.tolist()
                keep_samples = dft[dft[0] > -1].index.values.tolist()
            elif filtermode == "all":
                drop_samples = dft[dft[0] == -3].index.values.tolist()
                keep_samples = dft[dft[0] != -3].index.values.tolist()

        elif filtermode in outlier_methods:
            drop_samples = ol_df[ol_df[filtermode] == -1].index.values.tolist()
            keep_samples = ol_df[ol_df[filtermode] == 1].index.values.tolist()

        drop_bool = np.isin(samples[0], drop_samples)
        drop_indices = np.where(drop_bool)[0].tolist()

        self.keep = keep_samples
        self.drop = drop_samples
        self.drop_indices = drop_indices
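For reference, the three-detector vote above can be reduced to a standalone sketch; the array and the unanimous "keep" rule mirror the class's "any" filter mode, while the sample/DataFrame bookkeeping is dropped. The data and threshold are invented:

import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(0)
M = np.r_[rng.randn(60, 3), rng.uniform(-8, 8, size=(4, 3))]

threshold = 0.1
lof_pred = LocalOutlierFactor(n_neighbors=20, contamination=threshold).fit_predict(M)
ee_pred = EllipticEnvelope(contamination=threshold, random_state=0).fit(M).predict(M)
if_pred = IsolationForest(contamination=threshold, random_state=0).fit(M).predict(M)

votes = lof_pred + ee_pred + if_pred  # each vote is +1 (inlier) or -1 (outlier)
keep = np.where(votes == 3)[0]        # "any" mode: drop rows any detector flags
print("kept", len(keep), "of", len(M))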
def test_elliptic_envelope():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    assert_array_almost_equal(
        scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
    assert(sum(y_pred == -1) == sum(decisions < 0))
Example #15
def outlierDetector(clf_name, rng, X_train):

    outliers_fraction = 0.04
    if clf_name == 'RobustCovariance':
        clf_ell = EllipticEnvelope(contamination=outliers_fraction)
        clf_ell.fit(X_train)
        anomaly_score = clf_ell.decision_function(X_train)
        outliers = clf_ell.predict(X_train)

    elif clf_name == 'IsolationForest':
        clf_iforest = IsolationForest(n_estimators=100,
                                      random_state=rng,
                                      contamination=outliers_fraction)
        clf_iforest.fit(X_train)
        anomaly_score = clf_iforest.decision_function(X_train)
        outliers = clf_iforest.predict(X_train)

    else:
        # avoid returning an undefined name when clf_name is unrecognized
        raise ValueError('unknown clf_name: %s' % clf_name)

    return outliers
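A hypothetical call of outlierDetector: 'RobustCovariance' selects the elliptic-envelope branch, and the returned array holds +1 for inliers and -1 for outliers. The data here is made up:

import numpy as np

rng = np.random.RandomState(42)
X_train = rng.randn(200, 2)
outliers = outlierDetector('RobustCovariance', rng, X_train)
print((outliers == -1).sum(), "points flagged")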
Example #16
def labelValidSkeletons(skel_file):
    calculate_widths(skel_file)
    
    #get valid rows using the trajectory displacement and the skeletonization success
    valid_index, trajectories_data = getValidIndexes(skel_file)
    
    #calculate classifier for the outliers    
    X4fit = nodes2Array(skel_file, valid_index)        
    clf = EllipticEnvelope(contamination=.1)
    clf.fit(X4fit)
    
    #calculate outliers using the fitted classifier
    X = nodes2Array(skel_file)
    y_pred = clf.decision_function(X).ravel() #less than zero would be an outlier

    #label rows of valid individual skeletons as GOOD_SKE
    trajectories_data['auto_label'] = ((y_pred > 0).astype(int)) * wlab['GOOD_SKE'] #+ wlab['BAD']*np.isnan(y_prev)
    saveLabelData(skel_file, trajectories_data)
def res_ee(features, contamination=0.1, score=False):
    '''
    use loess curve residuals and elliptic envelope to identify outliers
    
    Parameters
    ----------
    features: dataframe
        dataframe of features
    contamination: decimal 
        proportion of outliers expected in data
    score: boolean
        return binary prediction and outlier scores
        
    Returns
    -------
    list
        Binary series with same length as input TS. A value of -1 indicates the
        corresponding value in TS is an outlier     
    list
        outlier score. series with same length as input TS. only returned if 
        score = True
    '''

    #res = np.asarray(res).reshape(-1,1)

    # instantiate and fit the elliptic envelope on the features
    model = EllipticEnvelope(assume_centered=True,
                             store_precision=False,
                             contamination=contamination,
                             random_state=888)

    model.fit(features)
    y_pred = model.predict(features)

    if not score:
        return (y_pred.tolist())

    # outlier scores
    scores = model.decision_function(features)

    return (y_pred.tolist(), scores.tolist())
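Hypothetical usage of res_ee, assuming features is a small numeric DataFrame of loess residual features; the column names and values are invented:

import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
features = pd.DataFrame({'resid': rng.randn(50), 'slope': rng.randn(50)})

labels = res_ee(features, contamination=0.1)   # list of +1 (inlier) / -1 (outlier)
labels, scores = res_ee(features, score=True)  # also return the raw scores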
Example #18
def labelValidSkeletons_old(skeletons_file, good_skel_row, fit_contamination = 0.05):
    base_name = getBaseName(skeletons_file)
    progress_timer = timeCounterStr('')
    
    print_flush(base_name + ' Filter Skeletons: Starting...')
    with pd.HDFStore(skeletons_file, 'r') as table_fid:
        trajectories_data = table_fid['/trajectories_data']

    trajectories_data['is_good_skel'] = trajectories_data['has_skeleton']
    
    if good_skel_row.size > 0:
        #nothing to do if no valid skeletons are left
        
        print_flush(base_name + ' Filter Skeletons: Reading features for outlier identification.')
        #calculate classifier for the outliers    
        
        nodes4fit = ['/skeleton_length', '/contour_area'] + \
        ['/' + name_width_fun(part) for part in worm_partitions]
        
        X4fit = nodes2Array(skeletons_file, nodes4fit, good_skel_row)
        assert not np.any(np.isnan(X4fit))
        
        #%%
        print_flush(base_name + ' Filter Skeletons: Fitting elliptic envelope. Total time:' + progress_timer.getTimeStr())
        #TODO: there is a problem with singular covariance matrices that I need to figure out how to solve
        clf = EllipticEnvelope(contamination = fit_contamination)
        clf.fit(X4fit)
        
        print_flush(base_name + ' Filter Skeletons: Calculating outliers. Total time:' + progress_timer.getTimeStr())
        #calculate outliers using the fitted classifier
        X = nodes2Array(skeletons_file, nodes4fit) #use all the indexes
        y_pred = clf.decision_function(X).ravel() #less than zero would be an outlier

        print_flush(base_name + ' Filter Skeletons: Labeling valid skeletons. Total time:' + progress_timer.getTimeStr())
        #labeled rows of valid individual skeletons as GOOD_SKE
        trajectories_data['is_good_skel'] = (y_pred>0).astype(np.int)
    
    #Save the new is_good_skel column
    saveModifiedTrajData(skeletons_file, trajectories_data)

    print_flush(base_name + ' Filter Skeletons: Finished. Total time:' + progress_timer.getTimeStr())
def detect_outliers(X, station):
    if station == 'hoerning':
        outlierfraction = 0.0015
        classifier = svm.OneClassSVM(nu=0.95 * outlierfraction + 0.05,
                                     kernel='rbf', gamma=0.1)
        Xscaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(X)
        X_scaled = Xscaler.transform(X)
        classifier.fit(X_scaled)
        svcpred = classifier.decision_function(X_scaled).ravel()
        threshold = stats.scoreatpercentile(svcpred, 100 * outlierfraction)
        inlierpred = svcpred > threshold

    else:
        outlierfraction = 0.0015
        classifier = EllipticEnvelope(contamination=outlierfraction)
        classifier.fit(X)
        gausspred = classifier.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(gausspred, 100 * outlierfraction)
        inlierpred = gausspred > threshold

    return inlierpred
Example #20
def outlier_detection(datframe, vis=0):
    """
    identify and remove outliers by EllipticalEnvelope
    visualize with PCA if desired
    """

    dat = datframe[datframe.columns[:14]]

    clf = EllipticEnvelope(contamination=.1)
    clf.fit(dat)
    y_pred = clf.decision_function(dat).ravel()

    outliers_fraction = 0.25
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)

    datframe['detect'] = y_pred
    datframe = datframe[datframe.detect > threshold]

    if vis == 1:
        pca_visualize(datframe[datframe.columns[:14]])

    return datframe
Example #21
def detect_outliers(X, station):
    if station == 'hoerning':
        outlierfraction = 0.0015
        classifier = svm.OneClassSVM(nu=0.95 * outlierfraction + 0.05,
                                     kernel='rbf',
                                     gamma=0.1)
        Xscaler = StandardScaler(copy=True, with_mean=True,
                                 with_std=True).fit(X)
        X_scaled = Xscaler.transform(X)
        classifier.fit(X_scaled)
        svcpred = classifier.decision_function(X_scaled).ravel()
        threshold = stats.scoreatpercentile(svcpred, 100 * outlierfraction)
        inlierpred = svcpred > threshold

    else:
        outlierfraction = 0.0015
        classifier = EllipticEnvelope(contamination=outlierfraction)
        classifier.fit(X)
        gausspred = classifier.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(gausspred, 100 * outlierfraction)
        inlierpred = gausspred > threshold

    return inlierpred
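A hypothetical call of detect_outliers: any station name other than 'hoerning' takes the elliptic-envelope branch, and the boolean result marks inliers. The data is made up:

import numpy as np

X = np.random.RandomState(0).randn(2000, 2)
inlier_mask = detect_outliers(X, station='other')
print(inlier_mask.sum(), "of", len(X), "kept")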
Example #22
def outlier_removal(features, samples):

    outliers_fraction = 0.1


    clf = EllipticEnvelope(contamination=.1)
    #clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
    #                                 kernel="rbf", gamma=0.1)
    clf.fit(features, samples)
    y_pred = clf.decision_function(features).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)

    y_pred_new = y_pred > threshold
    print(y_pred_new)
    #print(samples[y_pred_new])
    #print(samples.shape)
    print(samples[y_pred_new].shape)
    print(features.shape)
    print(features[y_pred_new].shape)

    return features[y_pred_new], samples[y_pred_new]
Example #23
def find_outlier_train(ser, outliers_fraction=0.1, min_units=0.2):
    # Returns outlier, inliers

    X = np.asarray(ser[ser > min_units]).reshape(-1, 1)  # np.asarray also handles pandas Series input
    #is_normal_data = is_normal(ser)
    # FOR NOW only using Robust estimator of Covariance
    is_normal_data = True
    if is_normal_data:
        # Use robust estimator of covariance
        from sklearn.covariance import EllipticEnvelope
        clf = EllipticEnvelope(contamination=.1)
    else:
        #Data is not normally distributed, use OneClassSVM based outlier detection
        from sklearn import svm
        clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1)
    from scipy import stats

    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    return ser[ser>min_units][~y_pred], ser[ser>min_units][y_pred]
Example #24
def find_outlier_train(ser, outliers_fraction=0.1, min_units=0.2):
    # Returns outlier, inliers

    X = np.asarray(ser[ser > min_units]).reshape(-1, 1)  # np.asarray also handles pandas Series input
    #is_normal_data = is_normal(ser)
    # FOR NOW only using Robust estimator of Covariance
    is_normal_data = True
    if is_normal_data:
        # Use robust estimator of covariance
        from sklearn.covariance import EllipticEnvelope
        clf = EllipticEnvelope(contamination=.1)
    else:
        #Data is not normally distributed, use OneClassSVM based outlier detection
        from sklearn import svm
        clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                              kernel="rbf",
                              gamma=0.1)
    from scipy import stats

    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    return ser[ser > min_units][~y_pred], ser[ser > min_units][y_pred]
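Hypothetical usage of find_outlier_train with a pandas Series; values at or below min_units are excluded before fitting, and the function returns the outliers first, then the inliers. The series here is invented:

import numpy as np
import pandas as pd

rng = np.random.RandomState(2)
ser = pd.Series(np.r_[rng.rand(99) + 1.0, [25.0]])  # one obvious outlier
outliers, inliers = find_outlier_train(ser)
print(len(outliers), "outliers,", len(inliers), "inliers")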
Example #25
## Local Outlier Factor

lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
lof.fit(data_scaled_means)
answerLOF_proba = lof.decision_function(data_scaled_means)
answerLOF_proba = 1 - ((answerLOF_proba - answerLOF_proba.min()) /
                       (answerLOF_proba.max() - answerLOF_proba.min()))
answerLOF_proba = pd.DataFrame({'target': answerLOF_proba})
pickle.dump(lof, open("../../data/model/LocalOutlierFactor", "wb"))

## Elliptic Envelope

ee = EllipticEnvelope()
ee.fit(data_scaled_means)
answerEE_proba = ee.decision_function(data_scaled_means)
answerEE_proba = 1 - (answerEE_proba - 3 * answerEE_proba.min()) * 10**12
answerEE_proba = pd.DataFrame({'target': answerEE_proba})
pickle.dump(ee, open("../../data/model/EllipticEnvelope", "wb"))

##############

### Soft voting

voting_answer = pd.DataFrame({
    'target':
    ((answerIF_proba * 2 + answerLOF_proba * 1 + answerEE_proba * 2) /
     5).T.apply(lambda x: -1 if x.values[0] > 0.4 else 1)
})

##############
Example #26
def find_outliers(datestart,dateend,plot=False,cut=-0.05):
    numtopics=84

    di=datetime2str2(datestart)
    dfin=datetime2str2(dateend)

    #print di,dfin
    if dfin<di:
        temp=dfin
        dfin=di
        di=temp
    #print di,dfin
    
    afile="/home/ubuntu/mysql_insightwiki_auth.txt"
    a=open(afile)
    passwd=a.readline().rstrip()
    a.close()
    host='localhost'; user='******';db='wikidata'
    con = mdb.connect(host, user, passwd, db)#,port=3307)
     
    with con:
        curt= con.cursor()
        #sql="SELECT COUNT(*) FROM `topics` "
        
        sql="SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        topics=[[0,'nothing','Filler to match index']]
        for topic in curt:
            topics.append(topic)

    data={}
        
    df=range(numtopics+1)
    with con:
        curt= con.cursor()
        sql="SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        for row in curt:
            cur = con.cursor()
            sql='''SELECT `page_views`.`dateonly` AS `vd`, AVG(`page_views`.`count`) AS `vc`, 
                `topics`.`topic_label`,`topics`.`topic_string` 
                FROM `topics` INNER JOIN `page_views` ON `topics`.`ID` = `page_views`.`topic_id` 
                WHERE `topic_id`=%s GROUP BY `page_views`.`dateonly`   '''
            data[row[1]]=read_sql(sql, con,params=[row[0]])
            df[row[0]]=data[row[1]]
    
    topicdata=df
    
    d=topicdata[topics[3][0]]
    p=d[ (d['vd']>di) & (d['vd']<dfin )]['vc'].values    
    topicdata=df
    
    #initializing array to hold the rows to cluster
    #the 0th position is fake so that my index matches the sql index
    clusinp=[]
    clusinp.append(gen_feat([0,0,0,0,0]))
    
    chinaoff=6000
    #populating my array to go into my Kmean
    for index,topic in enumerate(topics):
        #topic=list(topics[index])
        if topic[0]!=0:
            d=topicdata[topic[0]]
            ppre=d[ (d['vd']>di) & (d['vd']<dfin )]['vc'].values
            p=gen_feat(ppre)
            if topic[0]==52:
                p=gen_feat([x-chinaoff if x-chinaoff>=0 else 0 for x in ppre  ])
            clusinp.append(p)
    
    #cleaning up my array making it numpy to go into my kmean
    clusinp=np.array(clusinp)
    clusinp[0]=clusinp[5] #making sure my throwaway first row matches in size
    #contam=0.325
    contamfix=0.1
    
    colors = ['m', 'g', 'b']
    X1=clusinp
    xx1, yy1 = np.meshgrid(np.linspace(0, 10000, 500), np.linspace(-1.5, 1.5, 500))
    ee=EllipticEnvelope(support_fraction=1., contamination=contamfix)
    #ee=OneClassSVM(nu=contam2, gamma=0.05,kernel='rbf')
    ee.fit(clusinp)
    outliers=ee.decision_function(X1, raw_values=False)
    
    if plot==True:
        print "here"
        get_ipython().magic(u'matplotlib inline')
        Z1 = ee.decision_function(np.c_[xx1.ravel(), yy1.ravel()])    
        Z1 = Z1.reshape(xx1.shape)
        legend1 = plt.contour(xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[1])
        plt.scatter(X1[:, 0], X1[:, 1], color='black')
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.show()

    out=[]
    for index,outlier in enumerate(outliers):
        row=[index,outlier,topics[index][1],int(np.round(clusinp[index][0])),int(np.round(100*clusinp[index][1]))]
        #row=[index,outlier,topics[index][1],int(np.round(clusinp[index][0])),clusinp[index][1]]
        if outlier<cut and index!=0 and row[3]>8:
            out.append(row)
            #print index,outlier,topics[index][2],clusinp[index][0],clusinp[index][1]
    #out=sorted(out,operator.itemgetter(4))
    #out.sort()
    out=sorted(out,key =lambda x:-x[4])
    return out
    xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000))
    n_inliers = int((1. - outliers_fraction) * n_samples)
    n_outliers = int(outliers_fraction * n_samples)

    # Fit the problem with varying cluster separation
    np.random.seed(42)
    # Data generation


    # Fit the model with the One-Class SVM
    #plt.figure(figsize=(10, 5))

    clf = EllipticEnvelope(contamination=.1)
    # fit the data and tag outliers
    clf.fit(XY)
    y_pred = clf.decision_function(XY).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    # plot the levels lines and the points
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    subplot = ax[i]
    subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                     cmap=plt.cm.Blues_r)
    a = subplot.contour(xx, yy, Z, levels=[threshold],
                        linewidths=2, colors='red')
    subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                     colors='orange')
    b = subplot.scatter(XY[:-n_outliers, 0], XY[:-n_outliers, 1], c='white')
    c = subplot.scatter(XY[-n_outliers:, 0], XY[-n_outliers:, 1], c='white')
Example #28
def run_model(train_data: np.ndarray, predict_data: np.ndarray):
    clf = EllipticEnvelope()
    clf.fit(train_data.reshape(-1, 1))
    outlier = clf.decision_function(predict_data.reshape(-1, 1))
    return outlier
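A hypothetical call of run_model on one-dimensional arrays (the numbers are made up); lower decision-function values indicate more anomalous points:

import numpy as np

train = np.random.RandomState(7).randn(500)
new = np.array([0.1, -0.3, 9.5])  # 9.5 should score far below the rest
print(run_model(train, new))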
Example #29
        x = np.concatenate((no_mod_train, mod_train))

        ## extract good features
        fitter = umap.UMAP().fit(x.reshape(len(x), 60))

        test_data = fitter.transform(
            np.concatenate((no_mod[number:], mod[number:])))

        model_EllipticEnvelope = EllipticEnvelope(contamination=0.05,
                                                  support_fraction=1)

        model_EllipticEnvelope.fit(fitter.embedding_[:number])

        # selected extra-outliers
        decision = model_EllipticEnvelope.decision_function(test_data)
        index = []
        for i in range(len(decision)):
            if decision[i] < 0.00 and decision[i] > threshold:
                index.append(i)

        #get rid of non-confident ones
        mod_training_filtered = np.delete(test_data, index, axis=0)
        #f.write('Number of signals left '+str(len(mod_training_filtered))+'\n')
        median_number.append(len(mod_training_filtered))
        labels = np.concatenate((np.ones(100), np.repeat(-1, 100)))
        labels = np.delete(labels, index, axis=0)
        prediction = model_EllipticEnvelope.predict(mod_training_filtered)
        median_false_posit.append(len(
            prediction[:100][prediction[:100] == -1]))
        #f.write('testing accuracy with threshold '+str(accuracy(labels, prediction))+'\n')
def anomaly_detection_ex8_ng():
    """Run anomaly detection.
        Example from Andrew Ng's coursera course
    """

    # =====================
    # load data

    dataset = loadmat('data/ex8data1.mat')
    # dataset = loadmat('data/ex8data2.mat')
    print(dataset.keys())

    X = dataset['X']
    print('X:', X.shape, X[0, :])  # 307x2

    Xval = dataset['Xval']
    print('X_val:', Xval.shape, Xval[0, :])  # 307x2
    yval = dataset['yval']
    print('y_val:', yval.shape, yval[0, :])  # 307x1

    # =====================
    # display
    fig = plt.figure(facecolor='white')
    fig1 = fig.add_subplot(2, 2, 1)
    plt.scatter(X[:, 0], X[:, 1], c='k')
    plt.title("Outlier detection")
    plt.xlabel('Latency (ms)')
    plt.ylabel('Throughput (mb/s)')

    # =====================
    # detecting outliers in a Gaussian distributed dataset.
    clf = EllipticEnvelope()
    clf.fit(X)

    # Calculate the decision function and use threshold to determine outliers
    y_pred = clf.decision_function(X).ravel()
    # print('y pred', y_pred)

    # =====================
    # find best threshold for outlier detection
    if False:
        samples = np.linspace(0.1, 10.0, num=100)
        best_f1 = 0.0
        best_perc = 0.0
        for sample in samples:
            Xval_pred = clf.decision_function(Xval)
            perc = sample
            th = np.percentile(Xval_pred, perc)
            outl = Xval_pred < th
            f1score = f1_score(yval, outl)
            print('f1 score (', sample, '):', f1score)

            if best_f1 < f1score:
                best_f1 = f1score
                best_perc = perc
        print('best f1:', best_f1, ', best perc:', best_perc)

    # set threshold for outlier detection
    percentile = 1.9  # 5.1 # 1.9 #best_perc # 1.9607843
    threshold = np.percentile(y_pred, percentile)
    outliers = y_pred < threshold
    # print('outliers:', X[outliers])

    # =====================
    # plot contours

    fig.add_subplot(2, 2, 2)

    # create the grid for plotting
    if False:
        xx, yy = np.meshgrid(np.linspace(0, 25, 200), np.linspace(0, 30, 200))
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        plt.contour(xx,
                    yy,
                    Z,
                    levels=[threshold],
                    linewidths=2,
                    colors='blue',
                    linestyles='dotted')

        threshold = np.percentile(y_pred, 1.0)
        plt.contour(xx,
                    yy,
                    Z,
                    levels=[threshold],
                    linewidths=2,
                    colors='blue',
                    linestyles='dotted')
        threshold = np.percentile(y_pred, 0.5)
        plt.contour(xx,
                    yy,
                    Z,
                    levels=[threshold],
                    linewidths=2,
                    colors='blue',
                    linestyles='dotted')

    # plot outliers
    plt.scatter(X[:, 0], X[:, 1], c='k')
    plt.scatter(X[outliers, 0], X[outliers, 1], c='r')
    print('num outliers:', sum(outliers))

    # samples_idx = yval == 1
    # print(yval[samples_idx])
    # print('X_val:', Xval.shape, Xval[0, :])  # 307x2
    # print(Xval[samples_idx])

    plt.show()
    #xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000))
    n_inliers = int((1. - outliers_fraction) * n_samples)
    n_outliers = int(outliers_fraction * n_samples)

    # Fit the problem with varying cluster separation
    np.random.seed(42)
    # Data generation


    # Fit the model with the One-Class SVM
    #plt.figure(figsize=(10, 5))

    clf = EllipticEnvelope(contamination=.1)
    # fit the data and tag outliers
    clf.fit(XY)
    y_pred = clf.decision_function(XY).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    # plot the levels lines and the points
    #Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    #Z = Z.reshape(xx.shape)


    df_outlier = df[~y_pred]
    df_feedback = df_outlier[(df_outlier["usage proportion"]>df["usage proportion"].median())
                & (df_outlier["usage_percentage"]>df["usage_percentage"].median())]

    feedback_homes = df_feedback["home"].values

    extra_pred = np.setdiff1d(feedback_homes, submetered_homes_feedback)
Example #32
def make_subplot_again(X,
                       c,
                       ax,
                       pcX=0,
                       pcY=1,
                       fontSize=24,
                       fontName='sans serif',
                       ms=20,
                       leg=True,
                       title=None):
    outliers_fraction = 0.30
    clf = EllipticEnvelope(contamination=outliers_fraction)

    x = X['DK salary'].values
    y = X['points_per_dollar'].values.reshape(-1, 1)
    Xn = X
    X = X.values

    buff = 0.02
    bufferX = buff * (X[:, pcX].max() - X[:, pcX].min())
    bufferY = buff * (X[:, pcY].max() - X[:, pcY].min())
    mm = [(X[:, pcX].min() - bufferX, X[:, pcX].max() + bufferX),
          (X[:, pcY].min() - bufferY, X[:, pcY].max() + bufferY)]
    xx, yy = np.meshgrid(np.linspace(mm[0][0], mm[0][1], 500),
                         np.linspace(mm[1][0], mm[1][1], 500))

    # fit the data and tag outliers

    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = scoreatpercentile(y_pred, 100 * outliers_fraction)

    y_pred = y_pred > threshold

    print(y_pred)
    Xn['pred'] = y_pred

    # plot the levels lines and the points
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    ax.contourf(xx,
                yy,
                Z,
                levels=np.linspace(Z.min(), threshold, 7),
                cmap=plt.cm.Blues_r)
    a = ax.contour(xx,
                   yy,
                   Z,
                   levels=[threshold],
                   linewidths=2,
                   colors='burlywood')
    ax.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
    ax.axis('tight')

    care_about = Xn[Xn['points_per_dollar'] > 3.5]
    care_about_false = care_about[care_about['pred'] == False]

    x_c_f = care_about_false['DK salary']
    y_c_f = care_about_false['points_per_dollar']
    ax.scatter(x_c_f,
               y_c_f,
               alpha=0.5,
               lw=2,
               edgecolor='k',
               s=50,
               marker='d',
               c='#5DC541',
               label='Great Value')

    dont_care_about = Xn[Xn['points_per_dollar'] <= 3.5]
    dont_care_about_false = dont_care_about[dont_care_about['pred'] == False]

    x_d_f = dont_care_about_false['DK salary']
    y_d_f = dont_care_about_false['points_per_dollar']
    ax.scatter(x_d_f,
               y_d_f,
               alpha=0.5,
               lw=2,
               s=70,
               marker='+',
               c='#6F0D73',
               label='Bad Value')

    Xn_true = Xn[Xn['pred'] == True]
    x_true = Xn_true['DK salary']
    y_true = Xn_true['points_per_dollar']
    ax.scatter(x_true,
               y_true,
               alpha=0.5,
               marker='o',
               c='#BD4864',
               label='Normal Value')

    ax.annotate('Ben\nRoethlisberger\nWeek 8 2014',
                fontsize=20,
                xy=(5800, 8.237931),
                xytext=(7300, 7),
                arrowprops=dict(facecolor='black', shrink=0.05))
    ax.annotate('Tom Brady\nWeek 11 2014',
                fontsize=20,
                xy=(9800, 1.640816),
                xytext=(7000, -0.5),
                arrowprops=dict(facecolor='black', shrink=0.05))

    ## axes
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(fontSize - 2)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(fontSize - 2)

    ax.set_xlabel('Salary', fontsize=fontSize, fontname=fontName)
    ax.set_ylabel('Points per $1000', fontsize=fontSize, fontname=fontName)
    plt.locator_params(axis='x', nbins=5)
    ax.set_aspect(1. / ax.get_data_ratio())
    ax.set_xlim(3000, 10000)
    ax.set_ylim(mm[1])
    ax.axhline(3.5, c='r', label='Threshold')
    box = ax.get_position()
    ax.set_position([box.x0 + box.width * 0.2, box.y0, box.width, box.height])
    ax.legend(loc='center right',
              bbox_to_anchor=(-0.2, 0.4),
              fontsize=20,
              scatterpoints=3,
              frameon=True)

    if title:
        ax.set_title(title, fontsize=fontSize + 2, fontname=fontName)
Example #33
import pickle
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import csv
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from scipy import stats

data = []
with open('newdata.csv', 'r', newline='') as f:
    rdr = csv.reader(f)
    for row in rdr:
        data.append([int(row[1]), int(row[2])])
data = np.array(data)
# print(data)
outliers_fraction = 0.05
# est=svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,kernel="rbf", gamma=0.1)
est = EllipticEnvelope(contamination=.1)
# est=KMeans(n_clusters=3)
est.fit(data)
# labels=est.labels_
y_pred = est.decision_function(data).ravel()
threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)

labels = [(2 if y > threshold else 1) for y in y_pred]
# labels=est.labels_
print(labels)
plt.scatter(data[:, 0], data[:, 1], c=labels, lw=0)
plt.show()
def outliers_from_ellipticEnvelope():
    from sklearn.covariance import EllipticEnvelope
    env = EllipticEnvelope()
    env.fit(features_pca)
    outlier_pred = env.decision_function(features_pca).ravel()
    return outlier_pred
Example #35
# plot the temperature repartition by categories
fig, axs = plt.subplots(2, 2)
df_class0.hist(ax=axs[0, 0], bins=32)
df_class1.hist(ax=axs[0, 1], bins=32)
df_class2.hist(ax=axs[1, 0], bins=32)
df_class3.hist(ax=axs[1, 1], bins=32)

# In[ ]:

# apply ellipticEnvelope(gaussian distribution) at each categories
envelope = EllipticEnvelope(contamination=outliers_fraction)
X_train = df_class0.values.reshape(-1, 1)
envelope.fit(X_train)
df_class0 = pd.DataFrame(df_class0)
df_class0['deviation'] = envelope.decision_function(X_train)
df_class0['anomaly'] = envelope.predict(X_train)

envelope = EllipticEnvelope(contamination=outliers_fraction)
X_train = df_class1.values.reshape(-1, 1)
envelope.fit(X_train)
df_class1 = pd.DataFrame(df_class1)
df_class1['deviation'] = envelope.decision_function(X_train)
df_class1['anomaly'] = envelope.predict(X_train)

envelope = EllipticEnvelope(contamination=outliers_fraction)
X_train = df_class2.values.reshape(-1, 1)
envelope.fit(X_train)
df_class2 = pd.DataFrame(df_class2)
df_class2['deviation'] = envelope.decision_function(X_train)
df_class2['anomaly'] = envelope.predict(X_train)
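The blocks above repeat the same steps per class; a sketch of the same logic as a loop, assuming each df_class* starts out as a one-column Series and df_class3 follows the same pattern, keeps the behaviour identical and reuses the outliers_fraction defined earlier:

classes = [df_class0, df_class1, df_class2, df_class3]
for i, df_c in enumerate(classes):
    envelope = EllipticEnvelope(contamination=outliers_fraction)
    X_train = df_c.values.reshape(-1, 1)
    envelope.fit(X_train)
    df_c = pd.DataFrame(df_c)
    df_c['deviation'] = envelope.decision_function(X_train)
    df_c['anomaly'] = envelope.predict(X_train)
    classes[i] = df_c
df_class0, df_class1, df_class2, df_class3 = classes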
Example #36
# Fit the model
clf = EllipticEnvelope(support_fraction=1., contamination=contamination)
clf.fit(data)

# Perform outlier detection
predicted_data = clf.predict(data)
inlier_predicted_data = data[predicted_data == 1]
outlier_predicted_data = data[predicted_data == -1]
num_inliers_predicted = inlier_predicted_data.shape[0]
num_outliers_predicted = outlier_predicted_data.shape[0]

# Plot decision function values
xr = np.linspace(-2, 2, 500)
yr = np.linspace(-2, 2, 500)
xx, yy = np.meshgrid(xr, yr)
zz = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
zz = zz.reshape(xx.shape)
scores = clf.decision_function(data)
threshold = stats.scoreatpercentile(scores, 100 * contamination)
plt.contourf(xx,
             yy,
             zz,
             levels=np.linspace(zz.min(), threshold, 7),
             cmap=plt.cm.Blues_r)  # Outlier
plt.contour(xx,
            yy,
            zz,
            levels=np.array([threshold]),
            linewidths=2,
            colors="red")  # The frontier
plt.contourf(xx,
Example #37
#plt.figure(15)
#for l in set(L):
#p=(L==l)
#if l==-1:
#color='r'
#else:
#color=colors[l]
#plt.plot(rcp[p,0],rcp[p,1],'o',c=color,markersize=10)
#plt.show()

# -17- #
from sklearn.covariance import EllipticEnvelope
anom_perc = 20
clf = EllipticEnvelope(contamination=.1)
clf.fit(rcp)
pred = clf.decision_function(rcp).ravel()
threshold = stats.scoreatpercentile(pred, anom_perc)
Anom = pred > threshold
print(Anom)
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
waitforEnter()
#plt.figure(17)
#plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),cmap=plt.cm.Blues_r)
#plt.contour(xx, yy, Z, levels=[threshold],linewidths=2, colors='red')
#plt.plot(rcp[:, 0], rcp[:, 1], 'ko')
#plt.show()
#waitforEnter()

# -18- #
Example #38
def find_outliers(datestart, dateend, plot=False, cut=-0.05):
    numtopics = 84

    di = datetime2str2(datestart)
    dfin = datetime2str2(dateend)

    #print di,dfin
    if dfin < di:
        temp = dfin
        dfin = di
        di = temp
    #print di,dfin

    afile = "/home/ubuntu/mysql_insightwiki_auth.txt"
    a = open(afile)
    passwd = a.readline().rstrip()
    a.close()
    host = 'localhost'
    user = '******'
    db = 'wikidata'
    con = mdb.connect(host, user, passwd, db)  #,port=3307)

    with con:
        curt = con.cursor()
        #sql="SELECT COUNT(*) FROM `topics` "

        sql = "SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        topics = [[0, 'nothing', 'Filler to match index']]
        for topic in curt:
            topics.append(topic)

    data = {}

    df = range(numtopics + 1)
    with con:
        curt = con.cursor()
        sql = "SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        for row in curt:
            cur = con.cursor()
            sql = '''SELECT `page_views`.`dateonly` AS `vd`, AVG(`page_views`.`count`) AS `vc`, 
                `topics`.`topic_label`,`topics`.`topic_string` 
                FROM `topics` INNER JOIN `page_views` ON `topics`.`ID` = `page_views`.`topic_id` 
                WHERE `topic_id`=%s GROUP BY `page_views`.`dateonly`   '''
            data[row[1]] = read_sql(sql, con, params=[row[0]])
            df[row[0]] = data[row[1]]

    topicdata = df

    d = topicdata[topics[3][0]]
    p = d[(d['vd'] > di) & (d['vd'] < dfin)]['vc'].values
    topicdata = df

    #initializing array to hold the rows to cluster
    #the 0th position is fake so that my index matches the sql index
    clusinp = []
    clusinp.append(gen_feat([0, 0, 0, 0, 0]))

    chinaoff = 6000
    #populating my array to go into my Kmean
    for index, topic in enumerate(topics):
        #topic=list(topics[index])
        if topic[0] != 0:
            d = topicdata[topic[0]]
            ppre = d[(d['vd'] > di) & (d['vd'] < dfin)]['vc'].values
            p = gen_feat(ppre)
            if topic[0] == 52:
                p = gen_feat(
                    [x - chinaoff if x - chinaoff >= 0 else 0 for x in ppre])
            clusinp.append(p)

    #cleaning up my array making it numpy to go into my kmean
    clusinp = np.array(clusinp)
    clusinp[0] = clusinp[5]  #making sure my throwaway first row matches in size
    #contam=0.325
    contamfix = 0.1

    colors = ['m', 'g', 'b']
    X1 = clusinp
    xx1, yy1 = np.meshgrid(np.linspace(0, 10000, 500),
                           np.linspace(-1.5, 1.5, 500))
    ee = EllipticEnvelope(support_fraction=1., contamination=contamfix)
    #ee=OneClassSVM(nu=contam2, gamma=0.05,kernel='rbf')
    ee.fit(clusinp)
    outliers = ee.decision_function(X1, raw_values=False)

    if plot == True:
        print "here"
        get_ipython().magic(u'matplotlib inline')
        Z1 = ee.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
        Z1 = Z1.reshape(xx1.shape)
        legend1 = plt.contour(xx1,
                              yy1,
                              Z1,
                              levels=[0],
                              linewidths=2,
                              colors=colors[1])
        plt.scatter(X1[:, 0], X1[:, 1], color='black')
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.show()

    out = []
    for index, outlier in enumerate(outliers):
        row = [
            index, outlier, topics[index][1],
            int(np.round(clusinp[index][0])),
            int(np.round(100 * clusinp[index][1]))
        ]
        #row=[index,outlier,topics[index][1],int(np.round(clusinp[index][0])),clusinp[index][1]]
        if outlier < cut and index != 0 and row[3] > 8:
            out.append(row)
            #print index,outlier,topics[index][2],clusinp[index][0],clusinp[index][1]
    #out=sorted(out,operator.itemgetter(4))
    #out.sort()
    out = sorted(out, key=lambda x: -x[4])
    return out
def outliers_from_ellipticEnvelope():
    from sklearn.covariance import EllipticEnvelope
    env = EllipticEnvelope()
    env.fit(features_pca)
    outlier_pred = env.decision_function(features_pca).ravel()
    return outlier_pred
def perform_robust_covariance_novelty_detection(data):
    ''' With the five patterns' counts, this method performs Robust Covariance that can help concentrate on a relevant cluster when outlying points exist.
    The experimentation is performed with different time chunks and number of sequences. '''

    # Importing necessary libraries
    from sklearn.covariance import EllipticEnvelope
    from sklearn.model_selection import train_test_split

    X = data.iloc[:, 0:5].values
    pca = PCA(n_components=2)
    X = pca.fit(StandardScaler().fit_transform(X)).transform(
        StandardScaler().fit_transform(X))

    # Spliting the observations into 75% training and 25% testing
    X_train, X_test = train_test_split(X, test_size=0.25, random_state=42)

    # Robust Covariance classifier intialization and generate results
    classifier = EllipticEnvelope(contamination=0.25)
    classifier.fit(X_train)
    Y_pred_train = classifier.predict(X_train)
    Y_pred_test = classifier.predict(X_test)
    n_error_train = Y_pred_train[Y_pred_train == -1].size
    n_error_test = Y_pred_test[Y_pred_test == -1].size
    error_train = n_error_train / Y_pred_train.shape[0] * 100
    error_novel = n_error_test / Y_pred_test.shape[0] * 100

    # Visualization
    plt.clf()
    myFig = plt.figure(figsize=[10, 8])
    xx, yy = np.meshgrid(np.linspace(-4.5, 8.5, 500),
                         np.linspace(-4.5, 4.5, 500))
    Z = classifier.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx,
                 yy,
                 Z,
                 levels=np.linspace(Z.min(), 0, 7),
                 cmap=plt.cm.PuBu)
    a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
    plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')
    s = 60
    b1 = plt.scatter(X_train[:, 0],
                     X_train[:, 1],
                     c='white',
                     s=s,
                     edgecolors='k')
    b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='gold', s=s, edgecolors='k')
    plt.axis('tight')
    plt.legend([a.collections[0], b1, b2], [
        "Learned Frontier", "Training Observations", "New Regular Observations"
    ],
               loc="best",
               prop=matplotlib.font_manager.FontProperties(size=14))
    plt.xlabel("Error Train: %.2f%% and Error Novel Regular: %.2f%%" %
               (error_train, error_novel),
               fontsize=13,
               weight="bold")
    plt.yticks(fontsize=14)
    plt.xticks(fontsize=14)
    plt.title(
        'Novelty Detection using Robust Covariance of Ransomware Families\'\nAll Sequence Counts from 15 minutes of IRP Logs',
        fontsize=14,
        weight='bold')
    plt.show()

    # Save figure
    myFig.savefig(
        'sequence_mining_analysis/Results/novelty_detection/Robust_Covariance/15_mins_sequences_all.png',
        format='png',
        dpi=150)
    myFig.savefig(
        'sequence_mining_analysis/Results/novelty_detection/Robust_Covariance/15_mins_sequences_all.eps',
        format='eps',
        dpi=1200)
    'timestamp', 'Sample Number', 'Seconds', 'Minutes', 'Hours', 'Date',
    'Month'
],
                 axis=1)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
#num2 = scaler.fit_transform(data.drop(['timestamp'],axis=1))
num2 = scaler.fit_transform(data)
num2 = pd.DataFrame(num2, columns=data.columns)

# %% [code] {"scrolled":true}
from sklearn.covariance import EllipticEnvelope
clf = EllipticEnvelope(contamination=.1, random_state=0)
clf.fit(num2)
ee_scores = pd.Series(clf.decision_function(num2))
ee_predict = clf.predict(num2)
ee_predict = pd.Series(ee_predict).replace([-1, 1], [1, 0])

# %% [code] {"scrolled":true}
print(ee_scores)
print(ee_predict)

# %% [markdown]
# * ee_scores contains decision-function scores (lower means more anomalous).<br>
# * ee_predict contains labels; after the replace above, 1 marks an outlier and 0 does not.<br>
# * Labels are calculated based on clf.threshold_ and ee_scores.

# %% [code] {"scrolled":true}
anomaly_ind = ee_predict[ee_predict == 1].index
anomaly_ind
Example #42
plt.savefig("svm_oneclass.pdf")
plt.show()


##################################
# Robust covariance
##################################

## Train robust covariance classifier
outliers_fraction = 0.05
robust_classifier = EllipticEnvelope(contamination=outliers_fraction)
robust_classifier.fit(X)

## Create a grid to draw the classifier
xx, yy = np.meshgrid(np.linspace(-20, 25, 500), np.linspace(-10, 35, 500))
Z = robust_classifier.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

## Draw the boundary
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 10), cmap=plt.cm.PuBu)
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred")
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred")

## Draw the data points
b1 = plt.scatter(X_good[:, 0], X_good[:, 1], c="blueviolet", edgecolors="k")
b2 = plt.scatter(X_bad[:, 0], X_bad[:, 1], c="gold", edgecolors="k")
plt.title("Robust Covariance on PCA")
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")

plt.legend(
import ms.version

ms.version.addpkg('numpy', '1.14.2')
ms.version.addpkg('scipy', '1.0.0')
ms.version.addpkg('sklearn', '0.19.1')

import sys
import time
import numpy as np

from sklearn.covariance import EllipticEnvelope

import util

outlier_frac = 0.05
ell = EllipticEnvelope(contamination=outlier_frac)
while True:
    X_train = util.receive_point_list_from_stdin()
    X_predict = util.receive_point_list_from_stdin()
    X_train, X_predict = util.nomalize_train_evaluate_data(X_train, X_predict)

    ell.fit(X_train)
    pred = ell.predict(X_predict)

    bools = pred == -1
    decisions = ell.decision_function(X_predict)

    util.send_bool_list_to_stdout(bools)
    util.send_double_list_to_stdout(decisions)
Example #44
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import csv
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from scipy import stats

data = []
with open('newdata.csv', 'r', newline='') as f:
    rdr = csv.reader(f)
    for row in rdr:
        data.append([int(row[1]), int(row[2])])
data = np.array(data)
# print(data)
outliers_fraction = 0.05
# est=svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,kernel="rbf", gamma=0.1)
est = EllipticEnvelope(contamination=.1)
# est=KMeans(n_clusters=3)
est.fit(data)
# labels=est.labels_
y_pred = est.decision_function(data).ravel()
threshold = stats.scoreatpercentile(y_pred,
                                    100 * outliers_fraction)

labels = [(2 if y > threshold else 1) for y in y_pred]
# labels=est.labels_
print(labels)
plt.scatter(data[:, 0], data[:, 1], c=labels, lw=0)
plt.show()
Example #45
##y_pred_outliers = clf.predict(X_outliers)
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
error_train = n_error_train/X_train1.shape[0]
error_test = n_error_test/X_test1.shape[0]
print("train: {:.3f}, test{:.3f}".format(error_train,error_test))



rc_clf = EllipticEnvelope(contamination=0.05)

rc_clf.fit(X_train1)
y_pred_train_rc = rc_clf.predict(X_train1)
y_pred_test_rc = rc_clf.predict(X_test1)

scores_pred_train_rc = rc_clf.decision_function(X_train1)
scores_pred_test_rc = rc_clf.decision_function(X_test1)

##y_pred_outliers = clf.predict(X_outliers)
n_error_train_rc = y_pred_train_rc[y_pred_train_rc == -1].size
n_error_test_rc = y_pred_test_rc[y_pred_test_rc == -1].size
error_train_rc = n_error_train_rc/X_train1.shape[0]
error_test_rc = n_error_test_rc/X_test1.shape[0]
print("train: {:.3f}, test{:.3f}".format(error_train_rc,error_test_rc))


"""

if_clf = IsolationForest(max_samples='auto', contamination=0.05,
                                        random_state=rng)   
if_clf.fit(X_train1)
Example #46
    def fuse_to_get_results(self, weights, num_comp):
        if weights[0] != 0:
            self.apply_pca(num_comp)
            # Make sure you apply pca before using Envelop -- it is very sensitive to the feature dimensions
            clf_een = EllipticEnvelope(store_precision=True,
                                       assume_centered=False,
                                       support_fraction=0.25,
                                       contamination=0.1,
                                       random_state=True)
            # Fitting the model on reduced dimensionality
            clf_een.fit(self.gen_tr_data)
            # The anomaly score of the input samples. The lower, the more abnormal.
            pred_gen_scores_ee = clf_een.decision_function(self.gen_ts_data)
            pred_imp_scores_ee = clf_een.decision_function(self.imp_ts_data)
            pred_scores_ts_ee = np.concatenate(
                (pred_gen_scores_ee, pred_imp_scores_ee))
            norm_scores_ee = self.mymm_scaler(pred_scores_ts_ee)
        else:
            norm_scores_ee = self.fill_sc_with_zero(
                np.concatenate(
                    (self.get_gen_ts_labels(), self.get_imp_ts_labels())))
        if weights[1] != 0:
            # Make sure you apply pca before using envelop -- it is very sensitive to the feature dimensions
            clf_if = IsolationForest(max_samples="auto",
                                     contamination=0.2,
                                     random_state=True)
            # Fitting the model on reduced dimensionality
            clf_if.fit(self.gen_tr_data)
            # The anomaly score of the input samples. The lower, the more abnormal.
            pred_gen_scores_if = clf_if.decision_function(self.gen_ts_data)
            pred_imp_scores_if = clf_if.decision_function(self.imp_ts_data)
            # print('pred_gen_scores_if',self.mymm_scaler(pred_gen_scores_if))
            # print(clf_if.predict(self.gen_ts_data))
            # print('pred_imp_scores_if', self.mymm_scaler(pred_imp_scores_if))
            # print(clf_if.predict(self.imp_ts_data))

            pred_scores_ts_if = np.concatenate(
                (pred_gen_scores_if, pred_imp_scores_if))
            norm_scores_if = self.mymm_scaler(pred_scores_ts_if)
            # print('norm_scores_if',norm_scores_if)
            # print('plabel',np.concatenate((clf_if.predict(self.gen_ts_data),clf_if.predict(self.imp_ts_data))))
        else:
            norm_scores_if = self.fill_sc_with_zero(
                np.concatenate(
                    (self.get_gen_ts_labels(), self.get_imp_ts_labels())))
        if weights[2] != 0:
            num_neighbors = 35
            clf_lof = LocalOutlierFactor(n_neighbors=num_neighbors,
                                         metric='l2',
                                         contamination=0.25)
            X = np.concatenate((self.gen_tr_data, self.gen_ts_data))
            X_all = np.concatenate((X, self.imp_ts_data))
            pred_all_score = clf_lof.fit_predict(X_all)
            #print('pred_all_score')
            #print(pred_all_score)
            pred_scores_ts_lof = pred_all_score[len(self.gen_tr_data):]
            norm_scores_lof = self.mymm_scaler(pred_scores_ts_lof)
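            # Note: fit_predict yields only -1/+1 labels, so after min-max
            # scaling LOF contributes a binary 0/1 score to the fusion,
            # unlike the three score-based detectors. A continuous alternative
            # (a sketch, assuming the same X_all) would be the fitted model's
            # negative_outlier_factor_, where lower means more abnormal:
            #   clf_lof.fit(X_all)
            #   lof_scores = clf_lof.negative_outlier_factor_[len(self.gen_tr_data):]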
        else:
            norm_scores_lof = self.fill_sc_with_zero(
                np.concatenate(
                    (self.get_gen_ts_labels(), self.get_imp_ts_labels())))

        if weights[3] != 0:
            # As in the envelope branch, fit on the PCA-reduced features
            clf_svm1c = svm.OneClassSVM(kernel='rbf',
                                        degree=3,
                                        gamma=0.001,
                                        coef0=0.0,
                                        tol=0.00001,
                                        nu=0.001,
                                        shrinking=True,
                                        cache_size=200,
                                        verbose=False,
                                        max_iter=-1,
                                        random_state=1)  # the source passed True, interpreted as seed 1
            # Fitting the model on reduced dimensionality
            clf_svm1c.fit(self.gen_tr_data)
            # The anomaly score of the input samples. The lower the more abnormal.
            pred_gen_scores_svm = clf_svm1c.decision_function(self.gen_ts_data)
            pred_imp_scores_svm = clf_svm1c.decision_function(self.imp_ts_data)
            pred_scores_ts_svm = np.concatenate(
                (pred_gen_scores_svm, pred_imp_scores_svm))
            norm_scores_svm = self.mymm_scaler(pred_scores_ts_svm)
        else:
            norm_scores_svm = self.fill_sc_with_zero(
                np.concatenate(
                    (self.get_gen_ts_labels(), self.get_imp_ts_labels())))

        # Score level fusion
        pred_ts_labels = []
        fused_scores = []
        for ees, ifs, lofs, svms in zip(norm_scores_ee, norm_scores_if,
                                        norm_scores_lof, norm_scores_svm):
            cfscore = (weights[0] * ees + weights[1] * ifs +
                       weights[2] * lofs + weights[3] * svms) / sum(weights)
            fused_scores.append(cfscore)
            if cfscore < self.threshold:
                pred_ts_labels.append(-1)
            else:
                pred_ts_labels.append(1)

        act_ts_labels = np.concatenate(
            (self.get_gen_ts_labels(), self.get_imp_ts_labels()))
        tn, fp, fn, tp = confusion_matrix(act_ts_labels,
                                          pred_ts_labels).ravel()
        far = fp / (fp + tn)
        frr = fn / (fn + tp)
        pr = tp / (tp + fp)
        final_score_table = [
            norm_scores_ee, norm_scores_if, norm_scores_lof, norm_scores_svm,
            fused_scores, act_ts_labels
        ]
        # EllipticEnvelope scores
        print(norm_scores_ee)
        # IsolationForest scores
        print(norm_scores_if)
        # LOF contributes 0/1 labels (min-max scaled from -1/+1)
        print(norm_scores_lof)
        # One-class SVM scores
        print(norm_scores_svm)
        # Fused (weighted-average) scores
        print(fused_scores)
        # Ground-truth labels
        print(act_ts_labels)
        return far, frr, pr, final_score_table
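
The fusion above hinges on mymm_scaler bringing all four detectors onto a common [0, 1] scale; a minimal sketch of such a min-max scaler (an assumption -- the original implementation is not shown):

import numpy as np

def minmax_scale_scores(scores):
    # Hypothetical stand-in for mymm_scaler: map raw scores onto [0, 1].
    scores = np.asarray(scores, dtype=float)
    span = scores.max() - scores.min()
    if span == 0:
        return np.zeros_like(scores)
    return (scores - scores.min()) / span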
Example #47
plt.figure(15)
for l in set(L):
    p = (L == l)
    if l == -1:
        color = 'r'
    else:
        color = colors[l]
    plt.plot(rcp_concat[p, 0], rcp_concat[p, 1], 'o', c=color, markersize=10)
plt.show()

# -17- #

anom_perc = 20  # percentile cut for anomalies (20 in the original run)
clf = EllipticEnvelope(contamination=.1)
clf.fit(rcp_concat)
pred = clf.decision_function(rcp_concat).ravel()
threshold = stats.scoreatpercentile(pred, anom_perc)
Anom = pred > threshold  # despite the name, True marks scores above the cut, i.e. points kept as normal
print(Anom)
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure(16)
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
plt.plot(rcp_concat[:, 0], rcp_concat[:, 1], 'ko')
plt.savefig("../imagens/anomaly/ex17_20.png")  # save before show(), or the written file is blank
plt.show()

# End
wait_for_enter("END!")
Example #48
    # Compare given classifiers under given settings
    #xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000))
    n_inliers = int((1. - outliers_fraction) * n_samples)
    n_outliers = int(outliers_fraction * n_samples)

    # Fit the problem with varying cluster separation
    np.random.seed(42)
    # Data generation

    # Fit the model with the One-Class SVM
    #plt.figure(figsize=(10, 5))

    clf = EllipticEnvelope(contamination=.1)
    # fit the data and tag outliers
    clf.fit(XY)
    y_pred = clf.decision_function(XY).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    # plot the levels lines and the points
    #Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    #Z = Z.reshape(xx.shape)

    df_outlier = df[~y_pred]
    df_feedback = df_outlier[
        (df_outlier["usage proportion"] > df["usage proportion"].median())
        & (df_outlier["usage_percentage"] > df["usage_percentage"].median())]

    feedback_homes = df_feedback["home"].values

    extra_pred = np.setdiff1d(feedback_homes, submetered_homes_feedback)
    missed = np.setdiff1d(submetered_homes_feedback, feedback_homes)
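
extra_pred and missed above are home-level false positives and false negatives; a tiny illustration with made-up home IDs:

import numpy as np

feedback_homes = np.array([1, 2, 3, 5])            # hypothetical flagged homes
submetered_homes_feedback = np.array([2, 3, 4])    # hypothetical ground truth

print(np.setdiff1d(feedback_homes, submetered_homes_feedback))  # [1 5]: flagged but not needed
print(np.setdiff1d(submetered_homes_feedback, feedback_homes))  # [4]: needed but missed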
Example #49
#!/usr/bin/env python
#-*- coding:utf-8 -*-

import numpy as np
from sklearn.covariance import EllipticEnvelope
import matplotlib.pyplot as plt

X1 = np.loadtxt('slocbool.txt')
ee = EllipticEnvelope(support_fraction=1., contamination=0.02)
xx, yy = np.meshgrid(np.linspace(0, 1500000, 542), np.linspace(0, 15000, 542))
ee.fit(X1)
Z = ee.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(1)
plt.title("Outlier detection: SLOC vs BOOL")
plt.scatter(X1[:, 0], X1[:, 1], color='black')
plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='m')
plt.ylabel("count of boolean expressions")
plt.xlabel("count of source lines of code")
plt.show()
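
The levels=[0] contour is the envelope's decision boundary: in recent scikit-learn versions decision_function is positive for inliers and negative for outliers, so predict and the sign of the score agree. A quick check, reusing ee and X1 from above:

scores = ee.decision_function(X1)
labels = ee.predict(X1)   # +1 inlier, -1 outlier
assert ((scores > 0) == (labels == 1)).all()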
Example #50

    # visualize the cluster
    df_class.hist(bins=32)
    # collect the per-category frame
    df_classes.append(df_class)

#%%
# apply EllipticEnvelope (Gaussian model) to each category

df_classesAnom=[]
fig, ax = plt.subplots()
for c in df_classes:
    envelope = EllipticEnvelope(contamination=outliers_fraction)
    X_train = c.values.reshape(-1,1)
    envelope.fit(X_train)
    c = pd.DataFrame(c)
    c['deviation'] = envelope.decision_function(X_train)
    c['anomaly'] = envelope.predict(X_train)
    a0 = c.loc[c['anomaly'] == 1, dataValues]
    b0 = c.loc[c['anomaly'] == -1, dataValues]
    ax.hist([a0,b0], bins=32, stacked=True, color=['blue', 'red'])
    df_classesAnom.append(c)
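
EllipticEnvelope expects a 2-D array, hence the reshape(-1, 1) when fitting a single column; a standalone illustration on synthetic univariate data:

import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
values = np.append(rng.normal(10, 0.5, 50), 25.0)   # one clear outlier
env = EllipticEnvelope(contamination=0.02).fit(values.reshape(-1, 1))
print(env.predict(values.reshape(-1, 1))[-1])       # -1: the 25.0 reading is flagged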


#%%
# merge the per-category anomaly flags back into the main frame
df_class=pd.concat(df_classesAnom)
df_temp['anomaly22'] = df_class['anomaly']
df_temp['anomaly22'] = np.array(df_temp['anomaly22'] == -1).astype(int) 

#%% [markdown]
# Let's visualize the tagged anomaly points over time
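
A minimal sketch of that visualization, assuming df_temp carries a datetime index and the numeric column named by dataValues:

import matplotlib.pyplot as plt

anom = df_temp[df_temp['anomaly22'] == 1]
fig, ax = plt.subplots()
ax.plot(df_temp.index, df_temp[dataValues], color='blue', label='value')
ax.scatter(anom.index, anom[dataValues], color='red', label='anomaly')
ax.legend()
plt.show()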
Example #51
File: testing.py  Project: Semen52/FSA2
            x, y = find_boundary(X_transformed[kclusters == i, 0],
                                 X_transformed[kclusters == i, 1], 5)
            plt.plot(x, y, '-', lw=2., color=cluster_color)

            # create a mesh to plot in
            h = .02  # step size in the mesh
            x_min, x_max = X_transformed[kclusters == i, 0].min() - 1, X_transformed[kclusters == i, 0].max() + 1
            y_min, y_max = X_transformed[kclusters == i, 1].min() - 1, X_transformed[kclusters == i, 1].max() + 1
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                                 np.arange(y_min, y_max, h))

            clf = EllipticEnvelope(contamination=.1)
            clf.fit(X_transformed[kclusters == i])

            pred = clf.decision_function(X_transformed[kclusters == i]).ravel()
            threshold = stats.scoreatpercentile(pred,
                                                100 * outliers_fraction)
            print("INFO: Cluster: ", i, " Threshold: ", threshold)

            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])

            Z = Z.reshape(xx.shape)
            # plt.contour(xx, yy, Z,
            #             levels=[threshold],
            #             linewidths=2,
            #             linestyles='solid',
            #             colors=(cluster_color,))

            # plt.contourf(xx, yy, Z, levels=[threshold, Z.max()],
            #              colors='orange')