Example #1
def predict_EllipticEnvelope(X, fraction_outlier):
    xx, yy = get_meshgrid(X)
    x1, x2 = xx.min(), xx.max()
    y1, y2 = yy.min(), yy.max()
    d = (x2 - x1) * 0.1

    A = EllipticEnvelope(contamination=fraction_outlier)
    A.fit(X)
    Y = A.predict(X)

    # predict the whole grid at once instead of looping over points
    confidence_mat = A.predict(numpy.c_[xx.flatten(), yy.flatten()]).astype(int)
    grid_confidence = confidence_mat.reshape(xx.shape)
    P.plot_contourf(X[Y > 0],
                    X[Y <= 0],
                    xx,
                    yy,
                    grid_confidence,
                    x_range=[x1 - d, x2 + d],
                    y_range=[y1 - d, y2 + d],
                    filename_out='4_pred_EllipticEnvelope_density.png')
    P.plot_2D_features_multi_Y(X,
                               -Y,
                               x_range=[x1 - d, x2 + d],
                               y_range=[y1 - d, y2 + d],
                               filename_out='4_pred_EllipticEnvelope.png')
    return
Example #2
class EllipticEnvelope_Classifier:
  """docstring for EllipticEnvelope"""
  def __init__(self, save_path):

    # Save path (defaults to a subdirectory named 'EllipticEnvelope')
    self.save_path = os.path.join(save_path,'EllipticEnvelope')
    if not os.path.exists(self.save_path):
      os.makedirs(self.save_path)
    self.contamination = 0.1

    self.classifier = EllipticEnvelope(contamination=self.contamination)
    

  def fit_model(self, train_data_matrix, test_data_matrix, test_true_label):
    """训练模型"""
    train_data_matrix = train_data_matrix.toarray()
    test_data_matrix = test_data_matrix.toarray()
    self.classifier.fit(train_data_matrix)
    y_pred_label = self.classifier.predict(test_data_matrix)
    n_errors_test = (y_pred_label!=test_true_label).sum()
    accuracy, classification_report, confusion_matrix = sklearn_evaluation(test_true_label, y_pred_label)
    print('Accuracy: {} \nClassification Report:\n{}\n'.format(accuracy, classification_report))
    sys.stdout.flush()


  def test_model(self, test_data):
    """Test the model.
       Predicted labels look like [1, 1, -1, ...]
    """
    scores_pred = self.classifier.decision_function(test_data)
    y_pred_test = self.classifier.predict(test_data)
    n_error_test = y_pred_test[y_pred_test == -1].size
    print('Number of detected outliers: {}'.format(n_error_test))
    sys.stdout.flush()
    return y_pred_test, scores_pred
Example #3
def compare_drift(X_src, y_src, X_new, y_new):
    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)

    clf_X.fit(X_src)
    clf_y.fit(y_src.reshape(y_src.size, 1))

    test_X = clf_X.predict(X_new)

    test_y = clf_y.predict(y_new.reshape(-1, 1))

    X_distance = wasserstein_distance(X_src.values.flatten(),
                                      X_new.values.flatten())

    y_distance = wasserstein_distance(y_src.flatten(), y_new.flatten())

    X_outlier = len(test_X[test_X == -1]) / len(test_X)

    y_outlier = len(test_y[test_y == -1]) / len(test_y)

    results = {
        'X_wasserstein_distance': X_distance,
        'y_wasserstein_distance': y_distance,
        'X_outlier_percentage': X_outlier,
        'y_outlier_percentage': y_outlier
    }

    return results
Example #4
def view_anomalies(df):
    data = reindex_data(df)
    df.index = data.index

    df_class0 = df.loc[df['srch_saturday_night_bool'] == 0, 'price_usd']
    df_class1 = df.loc[df['srch_saturday_night_bool'] == 1, 'price_usd']

    fig, axs = plt.subplots(1,2)
    df_class0.hist(ax=axs[0], bins=30)
    df_class1.hist(ax=axs[1], bins=30);

    outliers_fraction = 0.01
    envelope =  EllipticEnvelope(contamination = outliers_fraction) 
    X_train = df_class0.values.reshape(-1,1)
    envelope.fit(X_train)
    df_class0 = pd.DataFrame(df_class0)
    df_class0['deviation'] = envelope.decision_function(X_train)
    df_class0['anomaly'] = envelope.predict(X_train)

    envelope =  EllipticEnvelope(contamination = outliers_fraction) 
    X_train = df_class1.values.reshape(-1,1)
    envelope.fit(X_train)
    df_class1 = pd.DataFrame(df_class1)
    df_class1['deviation'] = envelope.decision_function(X_train)
    df_class1['anomaly'] = envelope.predict(X_train)

    # plot the price repartition by categories with anomalies
    a0 = df_class0.loc[df_class0['anomaly'] == 1, 'price_usd']
    b0 = df_class0.loc[df_class0['anomaly'] == -1, 'price_usd']

    a2 = df_class1.loc[df_class1['anomaly'] == 1, 'price_usd']
    b2 = df_class1.loc[df_class1['anomaly'] == -1, 'price_usd']

    fig, axs = plt.subplots(1,2)
    axs[0].hist([a0,b0], bins=32, stacked=True, color=['blue', 'red'])
    axs[1].hist([a2,b2], bins=32, stacked=True, color=['blue', 'red'])
    axs[0].set_title("Search Non Saturday Night")
    axs[1].set_title("Search Saturday Night")

    df_class = pd.concat([df_class0, df_class1])
    df['anomaly5'] = df_class['anomaly']
    # df['anomaly5'] = np.array(df['anomaly22'] == -1).astype(int)
    fig, ax = plt.subplots(figsize=(10, 6))
    
    df = df.sort_values('date_time')
    df['date_time_int'] = pd.to_datetime(df['date_time']).astype('int64')
    a = df.loc[df['anomaly5'] == -1, ('date_time_int', 'price_usd')] #anomaly
    ax.plot(df['date_time_int'], df['price_usd'], color='blue', label='Normal')
    ax.scatter(a['date_time_int'],a['price_usd'], color='red', label='Anomaly')
    plt.legend()

    a = df.loc[df['anomaly5'] == 1, 'price_usd']
    b = df.loc[df['anomaly5'] == -1, 'price_usd']

    fig, axs = plt.subplots(figsize=(10, 6))
    axs.hist([a,b], bins=32, stacked=True, color=['blue', 'red'])
    plt.show();
Example #5
    def oneClassSVM(self, encoded_imgs_test):

        encoded_imgs_list = encoded_imgs_test.tolist()
        print(encoded_imgs_list)

        # clf = OneClassSVM(gamma='auto', nu=self.nu).fit(encoded_imgs_list)
        clf = EllipticEnvelope(contamination=self.nu).fit(
            np.array(encoded_imgs_list))
        print('test: ', clf.predict(encoded_imgs_list))

        return clf.predict(encoded_imgs_list)
Example #6
def model_monitor(country="all", dev=DEV, training=True):
    """
    performance monitoring
    """
    print("Monitor Model")
    
    ## import data
    #datasets = engineer_features(training=training, dev=dev)
    datasets = engineer_features(training=training)
    X, y, dates, labels = datasets[country]
    dates = pd.to_datetime(dates)
    print(X.shape)
    
    ## train the model
    if training:
        _model_train(X, y, labels, tag=country, dev=dev)
    
    ## monitor RMSE
    samples = [10, 20, 30, 50, 60]

    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        queries = [(str(d.year), str(d.month), str(d.day), country) for d in dates_new]
        y_pred = [model_predict(year=query[0], month=query[1], day=query[2], country=query[3],verbose=False, dev=dev)["y_pred"][0].round(2) for query in queries]
        rmse = np.sqrt(mean_squared_error(y_new.tolist(),y_pred))
        print("sample size: {}, RSME: {}".format(n, rmse.round(2)))
        
    ## monitor performance
    ## scaling
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    samples = [25, 50, 75, 90]

    clf_y = EllipticEnvelope(random_state=0,contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0,contamination=0.01)

    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size,1))

    results = defaultdict(list)
    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n,X,y, dates)
        results["sample_size"].append(n)
        results['wasserstein_X'].append(np.round(wasserstein_distance(X.flatten(),X_new.flatten()),2))
        results['wasserstein_y'].append(np.round(wasserstein_distance(y,y_new),2))
        test1 = clf_X.predict(X_new)
        test2 = clf_y.predict(y_new.reshape(y_new.size,1))
        results["outlier_percent_X"].append(np.round(1.0 - (test1[test1==1].size / test1.size),2))
        results["outlier_percent_y"].append(np.round(1.0 - (test2[test2==1].size / test2.size),2))
    
    return pd.DataFrame(results)
Example #7
def ellepticEnvelopeAnomaly(df, outliersFraction):

    # create 4 different data sets based on the categories defined before
    df_class0 = df.loc[df['categories'] == 0, 'value']
    df_class1 = df.loc[df['categories'] == 1, 'value']
    df_class2 = df.loc[df['categories'] == 2, 'value']
    df_class3 = df.loc[df['categories'] == 3, 'value']

    # apply EllipticEnvelope (Gaussian distribution) to each category
    envelope = EllipticEnvelope(contamination=outliersFraction)
    X_train = df_class0.values.reshape(-1, 1)
    envelope.fit(X_train)
    df_class0 = pd.DataFrame(df_class0)
    df_class0['deviation'] = envelope.decision_function(X_train)
    df_class0['anomaly'] = envelope.predict(X_train)

    envelope = EllipticEnvelope(contamination=outliersFraction)
    X_train = df_class1.values.reshape(-1, 1)
    envelope.fit(X_train)
    df_class1 = pd.DataFrame(df_class1)
    df_class1['deviation'] = envelope.decision_function(X_train)
    df_class1['anomaly'] = envelope.predict(X_train)

    envelope = EllipticEnvelope(contamination=outliersFraction)
    X_train = df_class2.values.reshape(-1, 1)
    envelope.fit(X_train)
    df_class2 = pd.DataFrame(df_class2)
    df_class2['deviation'] = envelope.decision_function(X_train)
    df_class2['anomaly'] = envelope.predict(X_train)

    envelope = EllipticEnvelope(contamination=outliersFraction)
    X_train = df_class3.values.reshape(-1, 1)
    envelope.fit(X_train)
    df_class3 = pd.DataFrame(df_class3)
    df_class3['deviation'] = envelope.decision_function(X_train)
    df_class3['anomaly'] = envelope.predict(X_train)

    # add the data to the main
    df_class = pd.concat([df_class0, df_class1, df_class2, df_class3])
    df['anomaly22'] = df_class['anomaly']
    df['anomaly22'] = np.array(df['anomaly22'] == -1).astype(int)
    # visualisation of anomaly throughout time (viz 1)
    fig, ax = plt.subplots()
    a = df.loc[df['anomaly22'] == 1, ['time_epoch', 'value']]  #anomaly
    ax.plot(df['time_epoch'], df['value'], color='blue')
    ax.scatter(a['time_epoch'], a['value'], color='red')
    ax.set_title('Elliptic Envelope Multi Clustering')
    plt.show()
    return df
Example #8
def EllipticEnvelop(X):
    Outlier_fraction = 0.0001
    from sklearn.covariance import EllipticEnvelope
    # (n+k+1)/2 points whose empirical covariance has the smallest determinant
    ell = EllipticEnvelope(contamination=Outlier_fraction).fit(X)
    Outlier_pred = ell.predict(X)
    return Outlier_pred
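The comment above refers to the minimum covariance determinant (MCD) estimator, which by default fits the covariance on roughly (n + k + 1) / 2 of the samples. A minimal, hedged sketch (the toy data and parameter values are assumptions, not part of the example above) showing how support_fraction controls that subset:

import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
X = rng.randn(200, 2)          # toy inlier cloud (assumed data)
X[:5] += 6                     # a few injected outliers

# support_fraction=None would use the MCD default of ~(n + k + 1) / 2 points;
# here it is set explicitly so 80% of the samples support the covariance fit.
ell = EllipticEnvelope(support_fraction=0.8, contamination=0.025, random_state=0)
ell.fit(X)
print(ell.predict(X)[:10])     # -1 flags outliers, 1 flags inliers
print(ell.covariance_)         # robust covariance estimate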
Example #9
File: oneclass.py, Project: zfxu/tests
def show(samplepath):
    paths = []
    sname = os.path.splitext(samplepath)[0]
    print(sname)
    with open(sname+"_path.txt", 'r') as f:
        for line in f:
            paths.append(line.strip())
    X = load_one_class_feature(samplepath)
    X = norm_data(X)
    #clf = OneClassSVM(kernel='rbf',gamma=0.01,nu=0.098)
    clf = EllipticEnvelope(contamination=0.05)
    clf.fit(X)
    Y = clf.predict(X)
    DY = clf.decision_function(X)
    for k in range(len(Y)):
        if Y[k] < 0:  # abnormality is positive
            print(k + 1, ',', DY[k], ',', paths[k])
    err = np.sum([y < 0 for y in Y])
    print('%d/%d' % (err, len(Y)))

    x1,y1 = np.meshgrid(np.linspace(-20,20,400), np.linspace(-20,20,400))
    z1 = clf.decision_function(np.c_[x1.ravel(), y1.ravel()])
    z1 = z1.reshape(x1.shape)
    legend = {}
    legend['test'] = plt.contour(x1, y1, z1, levels=[0], linewidths=2, colors='r')
    plt.scatter(X[:,0], X[:,1], color='black')

    values_list = list(legend.values())
    keys_list = list(legend.keys())
    plt.legend([values_list[0].collections[0]],[keys_list[0]])
    plt.show()
Example #10
File: helpers.py, Project: ChengF-Lab/scIVA
def outliers_detection(expr):
    x = PCA(n_components=2).fit_transform(expr)
    ee = EllipticEnvelope()
    ee.fit(x)
    oo = ee.predict(x)

    return oo
Example #11
def EllipticEnvelopeDetection(clm_select, all_tss, df_data, plot=False):
    rng = np.random.RandomState(42)
    outliers_fraction = 0.6
    if plot:
        plt.figure()
    ee_pred = {}
    for i in range(len(clm_select)):
        col = clm_select[i]
        j = 1
        ee_pred[col] = []
        for kind in all_tss[col].keys():
            j += 1
            X = np.array(all_tss[col][kind])
            # Elliptic envelope (robust covariance) model
            clf = EllipticEnvelope(contamination=outliers_fraction)
            clf.fit(X)
            y_pred = clf.predict(X)
            ee_pred[col].extend(y_pred)
        if plot:
            subplot = plt.subplot(len(clm_select), 1, i + 1)
            subplot.scatter(df_data['val'], df_data[col], c=ee_pred[col])
            subplot.set_title('Dimension ' + clm_select[i])
    if plot:
        plt.suptitle('Outlier detection with one class EllipticEnvelope')
        plt.show()
    return ee_pred
Example #12
class EllipticEnvelopeOutlierStream(OutlierStream):

    def __init__(self, data, data_stream):
        OutlierStream.__init__(self, data, data_stream)
        self.model = EllipticEnvelope(contamination=0.045)
        self.DEBUG = False
        self.pca_plot = StreamPCA()

    def train_model(self, data):
        self.model.fit(data)

    def update_model(self, data):
        return None

    def predict_model(self, data):
        return self.model.predict(data)

    def summary(self, predictions, data_stream):

        print("Non outliers: {}".format(len(list(filter(lambda x: x > 0, predictions)))))
        print("Outliers: {}".format(len(list(filter(lambda x: x < 0, predictions)))))

        import numpy as np
        y_axes = np.linspace(0, len(predictions), len(predictions))
        plt.scatter(y_axes,predictions)
        plt.show()
Example #13
def ellipticCurve(dataset):
    classifier = EllipticEnvelope(contamination=outlierFraction)
    classifier.fit(dataset)
    predScore = classifier.decision_function(dataset)
    pred = classifier.predict(dataset)
    outlierRows = [i for i in range(len(pred)) if pred[i] == -1]
    return predScore, outlierRows
Example #14
    def envelop(self):
        # Make sure you apply PCA before using EllipticEnvelope -- it is very sensitive to the feature dimensionality
        clf_een = EllipticEnvelope(store_precision=True,
                                   assume_centered=False,
                                   support_fraction=0.25,
                                   contamination=0.1,
                                   random_state=True)

        # Fitting the model on reduced dimensionality
        clf_een.fit(self.gen_tr_data)

        # Prediction labels
        pred_gen_ts_labels = clf_een.predict(self.gen_ts_data)
        pred_imp_ts_labels = clf_een.predict(self.imp_ts_data)

        act_ts_labels = np.concatenate(
            (self.get_gen_ts_labels(), self.get_imp_ts_labels()))
        pred_ts_labels = np.concatenate(
            (pred_gen_ts_labels, pred_imp_ts_labels))

        tn, fp, fn, tp = confusion_matrix(act_ts_labels,
                                          pred_ts_labels).ravel()
        far = fp / (fp + tn)
        frr = fn / (fn + tp)
        pr = tp / (tp + fp)
        return far, frr, pr
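As the comment above recommends, the envelope behaves much better after dimensionality reduction. A minimal sketch of that idea on synthetic arrays (the data shapes and parameter values are assumptions, and the class's gen_tr_data / gen_ts_data attributes are replaced by plain arrays):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
X_train = rng.randn(300, 50)   # high-dimensional training data (assumed)
X_test = rng.randn(50, 50)

# Project onto a few principal components before fitting the envelope.
pca = PCA(n_components=5).fit(X_train)
clf = EllipticEnvelope(contamination=0.1, random_state=0)
clf.fit(pca.transform(X_train))

labels = clf.predict(pca.transform(X_test))   # 1 = inlier, -1 = outlier
print(labels)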
Example #15
def outlier(TRAIN, contam):

    for i in range(TRAIN.shape[1]):
        v = TRAIN[:, i]
        v_hat = (v - np.median(v))
        TRAIN[:, i] = v_hat

    # model creation
    clf = EllipticEnvelope(support_fraction=1.,
                           contamination=contam,
                           assume_centered=True)
    clf.fit(TRAIN)
    C = clf.correct_covariance(TRAIN)
    pred = clf.predict(TRAIN)

    # eigen decomposition
    E, U = LA.eig(C)
    P = U[0:2, :]
    X_hat = np.dot(TRAIN, np.transpose(P))

    # plotting
    # map predictions from {-1, 1} to {0, 1}
    pred = (pred + 1) // 2
    plotting(X_hat, pred)

    return pred
Example #16
    def cov(self, X_train, contamination=None, random_state=None):
        """
        Train Elliptic Envelope model from scikit-learn

        Parameters
        ----------
        X_train: scaled training data
        contamination: percentage of anomalies in the data
        random_state: random number seed

        Returns
        -------
        Anomaly scores and binary outlier labels
        """
        model = EllipticEnvelope(contamination=contamination,
                                 random_state=random_state)
        model.fit(X_train)

        # Predict raw anomaly score
        labels = model.predict(X_train)  # -1 for outliers and 1 for inliers
        labels = (labels.max() -
                  labels) // 2  # rescaled labels (1: outliers, 0: inliers)
        cov_anomaly_scores = model.decision_function(
            X_train) * -1  # anomaly score
        cov_anomaly_scores = self.min_max_scaler(cov_anomaly_scores)
        return cov_anomaly_scores, labels
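A standalone sketch of the same conventions used by the method above, i.e. turning the ±1 predictions into 0/1 outlier labels and negating decision_function so that larger scores mean more anomalous. The data and contamination value are assumptions, and the class's min_max_scaler is replaced by sklearn's MinMaxScaler:

import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import MinMaxScaler

rng = np.random.RandomState(42)
X_train = rng.randn(150, 3)              # assumed, already-scaled training data

model = EllipticEnvelope(contamination=0.05, random_state=42)
model.fit(X_train)

raw = model.predict(X_train)             # -1 for outliers, 1 for inliers
labels = (raw.max() - raw) // 2          # rescaled labels: 1 = outlier, 0 = inlier

scores = -model.decision_function(X_train)    # higher = more anomalous
scores = MinMaxScaler().fit_transform(scores.reshape(-1, 1)).ravel()
print(int(labels.sum()), "outliers; max score", scores.max())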
Example #17
def predict_AB(train,test,result,num,sshop):

    filter_feature_train = ['user_id', 'time_stamp', 'mall_id', 'wifi_infos','wifi_id_signal','shop_id']
    filter_feature_test  = ['user_id', 'time_stamp', 'mall_id', 'wifi_infos','wifi_id_signal']
    train = train.drop(filter_feature_train,axis=1)
    test  = test.drop(filter_feature_test,axis=1)
    train = train.fillna(-999)
    test  = test.fillna(-999)
    test = test[list(train.columns)].join(test['row_id'])

    # # Save the train/test matrices to disk
    # train.to_csv(r'D:\刘帅专用\XGBoost天池\mall_data_train&test\train_%d.csv'% num,index=None)
    # test.to_csv(r'D:\刘帅专用\XGBoost天池\mall_data_train&test\test_%d.csv' % num, index=None)

    model = EllipticEnvelope()
    model.fit(train)
    test['label'] = model.predict(test.drop(['row_id'],axis=1))

    # Map the labels back to shop ids
    test['shop_id'] = None
    print('***************************',len(test))
    print(len(test[test['label']==1]))

    print('***************************')
    test = test[test['label']==1]

    test.loc[test['label'] == 1, 'shop_id'] = sshop  # todo
    r = test[['row_id', 'shop_id']]
    result = pd.concat([result, r])
    result['row_id'] = result['row_id'].astype('int')

    return result
Example #18
class EllipticDetection(BaseEstimator, TransformerMixin):
    def __init__(self, contamination=0):
        self.contamination = contamination

    def fit(self, X, y=None):
        if self.contamination == 0:
            return self
        self.ell = EllipticEnvelope(contamination=self.contamination)
        if y is None:
            self.ell.fit(X)
        else:
            self.ell.fit(X, y)

        return self

    def transform(self, X_):
        X = deepcopy(X_)
        if self.contamination == 0:
            return X
        idx_outlier = self.ell.predict(X) == -1
        X[idx_outlier, :] = np.nan

        simple_imputer = SimpleImputer()
        X = simple_imputer.fit_transform(X)

        return X
Example #19
class Baseline(ModelBase):
    def __init__(self, model_name, packet_length=1500, seq_length=1, epochs=1):
        super().__init__(packet_length, seq_length, epochs)
        self.model_name = model_name
        if model_name == 'svm':
            self.model = OneClassSVM(kernel='rbf', nu=0.05)
        elif model_name == 'if':
            self.model = IsolationForest(contamination=0.05,
                                         max_features=15,
                                         random_state=0)
        elif model_name == 'lof':
            self.model = LocalOutlierFactor(contamination=0.05, novelty=True)
        elif model_name == 'gm':
            self.model = GaussianMixture(random_state=0)
        elif model_name == 'ee':
            self.model = EllipticEnvelope(contamination=0.05, random_state=0)

    def fit(self, X):
        self.model.fit(X)

    def predict(self, X):
        labels = self.model.predict(X)
        scores = self.model.score_samples(X)
        return scores, labels

    def save(self, name):
        joblib.dump(self.model, name + '_{}.pkl'.format(self.model_name))

    def load(self, name):
        self.model = joblib.load(name + '_{}.pkl'.format(self.model_name))

    def exist(self, name):
        return os.path.exists(name + '_{}.pkl'.format(self.model_name))
Example #20
def robustcovariance(nparray, contamination):
    """
    scikit-learn provides an object, covariance.EllipticEnvelope, that fits a
    robust covariance estimate to the data and thus fits an ellipse to the
    central data points, ignoring points outside the central mode.

    References:
    Rousseeuw, P.J., Van Driessen, K. “A fast algorithm for the minimum covariance determinant estimator”.
    Technometrics 41(3), 212 (1999)
    """

    df = pd.DataFrame(nparray)

    # Fit the model
    clf = EllipticEnvelope(contamination=contamination)
    clf.fit(df)
    y_pred = clf.predict(df)

    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1

    # df['RC'] = y_pred
    # ax = df[df['RC']==1][0].plot(style='.')
    # df[df['RC']==-1][0].plot(style='.',ax=ax)

    return y_pred
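A hedged usage sketch for the helper above; the synthetic array and the contamination value are assumptions, and the function's own imports (pandas, EllipticEnvelope) are expected to already be in scope:

import numpy as np

rng = np.random.RandomState(1)
data = rng.randn(500, 2)
data[:10] += 8                 # inject a handful of obvious outliers

flags = robustcovariance(data, contamination=0.02)   # 1 = outlier, 0 = inlier
print(int(flags.sum()), "points flagged out of", len(flags))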
Example #21
def ellipses_indices_of_outliers(X, contamination=0.1):
    '''
    Detects outliers using the elliptical envelope method
    
    Input: An array of all variables to detect outliers for
    Output: An array with indices of detected outliers
    '''
    from sklearn.covariance import EllipticEnvelope
    
    # Copying to prevent changes to the input array
    X = X.copy()
    
    # Dropping categorical columns
    non_categorical = []
    for feature in range(X.shape[1]):
        num_unique_values = len(np.unique(X[:, feature]))
        if num_unique_values > 30:
            non_categorical.append(feature)
    X = X[:, non_categorical]  # Subsetting to columns without categorical indexes

    # Testing if there are an adequate number of features
    if X.shape[0] < X.shape[1] ** 2.:
        print('Will not perform well. Reduce the dimensionality and try again.')
        return
    
    # Creating and fitting the detector
    outlier_detector = EllipticEnvelope(contamination=contamination)
    outlier_detector.fit(X)
    
    # Predict outliers (-1) and return the indices of the flagged rows
    outliers = outlier_detector.predict(X)
    outlier_indices = np.where(outliers == -1)
    return outlier_indices
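A short usage sketch following the docstring's contract (synthetic data assumed; numpy is expected to be imported as np in the defining module, as the function itself relies on it). Note that np.where returns a tuple, so the row indices sit in its first element:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(400, 3) * 100    # continuous columns, so none are treated as categorical
X[:4] += 1500                  # a few extreme rows

outlier_idx = ellipses_indices_of_outliers(X, contamination=0.01)
X_clean = np.delete(X, outlier_idx[0], axis=0)   # drop the flagged rows
print(len(X), "->", len(X_clean))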
Example #22
def filter_outliers_in_features(X):
    # clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
    clf = EllipticEnvelope(support_fraction=1, contamination=0.2)
    clf.fit(X)
    # r = clf.predict(X)
    X = X[clf.predict(X) == 1]
    return X
Example #23
def oneClassSVM():
    for percent in percents:
        model = EllipticEnvelope(contamination=percent).fit(clean_matrix)
        results = orig_df[model.predict(mixed_matrix) == -1]
        name = 'covariance_' + str(percent)
        results.to_csv('output/' + name + '_outliers.csv')
        parseResults(results, name)
Example #24
class EllipticEnvelopeFilter(BaseEstimator):
    def __init__(self,
                 assume_centered=False,
                 support_fraction=None,
                 contamination=0.1,
                 random_state=None):
        self.assume_centered = assume_centered
        self.support_fraction = support_fraction
        self.contamination = contamination
        self.random_state = random_state

    def fit_pipe(self, X, y=None):
        self.elliptic_envelope_ = EllipticEnvelope(**self.get_params())
        self.elliptic_envelope_.fit(X)
        return self.transform_pipe(X, y)

    def transform_pipe(self, X, y):
        # XXX: sample_props not taken care of
        is_inlier = self.elliptic_envelope_.predict(X) == 1
        X_out = X[is_inlier]
        if y is None:
            y_out = None
        else:
            y_out = y[is_inlier]
        return X_out, y_out

    def transform(self, X, y=None):
        return X
Example #25
def treate_outliers(df, action="parallel", debug=True, remove=True):
    if action == "colective":
        columns = df.columns
        # Get all the categories of Y
        categories = df[df.columns[-1]].unique()
        # Replace the categories with numbers
        for i in range(len(categories)):
            df[df.columns[-1]].replace(categories[i], i, inplace=True)
        elip_env = EllipticEnvelope().fit(df)
        detection = elip_env.predict(df)
        # Outliers using the Mahalanobis distance.
        outlier_positions_mah = [
            x for x in range(df.shape[0]) if detection[x] == -1
        ]
        if remove:
            df.drop(df.index[outlier_positions_mah], inplace=True)
        return outlier_positions_mah

    elif action == "individual":
        all_outliers_positions_box = []
        columns = df.columns
        _, bp = pd.DataFrame.boxplot(df, return_type='both')
        outliers = [flier.get_ydata() for flier in bp["fliers"]]
        for i in range(len(outliers)):
            prop_outliers = outliers[i]
            if prop_outliers.size > 0:

                IQR = df.describe()[columns[i]]["75%"] - df.describe()[
                    columns[i]]["25%"]
                whiskers = [
                    df.describe()[columns[i]]["25%"] - (1.5 * IQR),
                    df.describe()[columns[i]]["75%"] + (1.5 * IQR)
                ]
                outlier_positions_box = [
                    x for x in range(df.shape[0])
                    if df[columns[i]].values[x] < whiskers[0]
                    or df[columns[i]].values[x] > whiskers[1]
                ]
                all_outliers_positions_box += outlier_positions_box
                if debug:
                    print("outliers for variable ['" + str(columns[i]) +
                          "'] = " + str(outlier_positions_box))

        if remove:
            df.drop(df.index[all_outliers_positions_box], inplace=True)
        return all_outliers_positions_box

    elif action == "parallel":
        outlier_positions_mah = treate_outliers(df,
                                                action="colective",
                                                remove=False)
        outlier_positions_box = treate_outliers(df,
                                                action="individual",
                                                remove=False)
        outliers_position = list(
            np.sort(outlier_positions_mah + outlier_positions_box))
        if remove:
            df.drop(df.index[outliers_position], inplace=True)
        return outliers_position
Example #26
def calcu2(mppt):
    clf = EllipticEnvelope(contamination=0.01)
    my_mppt1 = mppt.iloc[:, 0:106]
    clf.fit(my_mppt1)
    y_pred = clf.predict(my_mppt1)
    # y_pred = clf.predict(my_mppt1)
    output = mppt[y_pred == -1].iloc[:, 108]
    return output
Example #27
def detectAnomalies(X, model_params):
    """
    Detects the anomalies using Mahalonobis Distance
    
    Arguments:
        X {2d numpy.array} -- features of the windowed sequences
        model_params {dictionary} -- SSG-LUGIA model configuration    
    
    Returns:
        yp {numpy.array} -- binary prediction of the anomalies
        ys {numpy.array} -- mahalonobis distance of the anomalies
    """

    # we use the EllipticEnvelope model from Scikit-Learn library
    # to detect anomalies using the Mahalanobis distance
    elenv = EllipticEnvelope(
        contamination=model_params['contamination_model1'],
        support_fraction=model_params['support_fraction_model1'],
        random_state=3)

    elenv.fit(X)

    yp = elenv.predict(X)  # binary prediction
    ys = elenv.decision_function(X)  # Mahalanobis distance computation

    X2 = X[np.where(yp == 1)]  # selecting only the windows predicted native

    # performing anomaly detection again
    elenv2 = EllipticEnvelope(
        contamination=model_params['contamination_model2'],
        support_fraction=model_params['support_fraction_model2'],
        random_state=3)
    elenv2.fit(X2)

    yp2 = elenv2.predict(X2)  # binary prediction
    ys2 = elenv2.decision_function(X2)  # Mahalanobis distance computation

    # merge the level-2 results back into the full-length arrays
    ys[np.where(yp == 1)] = ys2  # updating the Mahalanobis distances based on level 2 detection
    yp[np.where(yp == 1)] = yp2  # updating the binary prediction based on level 2 detection

    return (yp, ys)
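A hedged usage sketch for the two-stage detector above. The configuration keys come from the function itself, but the feature matrix and the concrete values are assumptions made for illustration:

import numpy as np

rng = np.random.RandomState(3)
X = rng.randn(1000, 4)         # windowed sequence features (assumed shape)

model_params = {
    'contamination_model1': 0.05,        # assumed value
    'support_fraction_model1': 0.9,      # assumed value
    'contamination_model2': 0.05,        # assumed value
    'support_fraction_model2': 0.9,      # assumed value
}

yp, ys = detectAnomalies(X, model_params)
print("anomalous windows:", int((yp == -1).sum()))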
Example #28
def test_elliptic_envelope():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    with pytest.raises(NotFittedError):
        clf.predict(X)
    with pytest.raises(NotFittedError):
        clf.decision_function(X)
    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    assert_array_almost_equal(scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
    assert (sum(y_pred == -1) == sum(decisions < 0))
Example #29
    def elliptic_envelope_out(self, contamination):
        self.report.append('elliptic_envelope_out')
        ds = self.training[self.numerical_var]
        elliptic = EllipticEnvelope(contamination=contamination)
        elliptic.fit(ds)
        results = elliptic.predict(ds)
        outlier_elliptic = pd.Series(results)
        outlier_elliptic.index = ds.index
        return outlier_elliptic[outlier_elliptic == -1].index
Example #30
def plot(X, y):
    proj = TSNE().fit_transform(X)
    e = EllipticEnvelope(assume_centered=True, contamination=.25) # Outlier detection
    e.fit(X)

    good = np.where(e.predict(X) == 1)
    proj = proj[good]
    X = X[good]
    y = y[good]

    scatter(proj, y)
Example #31
def transform(features, labels):

    #    for ff, ll in zip(features, labels):
    #        print ll, ff
    #    for rr in range(0, len(features) ):
    #        features[rr] = scaler.fit_transform( features[rr] )

    print "transforming features via pca"
    pca = PCA(n_components=30)
    features = pca.fit_transform(features)

    envelope = EllipticEnvelope()
    envelope.fit(features)
    print(envelope.predict(features))

    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)

    return features, labels
Example #32
def test_outlier_detection():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(X)
    y_pred = clf.predict(X)

    assert_array_almost_equal(clf.decision_function(X, raw_mahalanobis=True), clf.mahalanobis(X - clf.location_))
    assert_almost_equal(clf.score(X, np.ones(100)), (100 - y_pred[y_pred == -1].size) / 100.0)
Example #33
def transform( features, labels ):

#    for ff, ll in zip(features, labels):
#        print ll, ff
#    for rr in range(0, len(features) ):
#        features[rr] = scaler.fit_transform( features[rr] )

    print "transforming features via pca"
    pca = PCA(n_components = 30)
    features = pca.fit_transform( features )

    envelope = EllipticEnvelope()
    envelope.fit( features )
    print envelope.predict( features )

    scaler = MinMaxScaler()
    features = scaler.fit_transform( features )



    return features, labels
Example #34
def test_outlier_detection():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    decision = clf.decision_function(X, raw_values=True)
    decision_transformed = clf.decision_function(X, raw_values=False)

    assert_array_almost_equal(decision, clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)), (100 - y_pred[y_pred == -1].size) / 100.0)
    assert sum(y_pred == -1) == sum(decision_transformed < 0)
Example #35
def ellipticenvelope(data, fraction = 0.02):
    elenv = EllipticEnvelope(contamination=fraction)
    elenv.fit(data)
    score = elenv.predict(data)

    # 1-based row numbers, one per sample
    numeration = np.arange(1, len(data) + 1).reshape(-1, 1)

    # keep only the rows whose prediction is -1 (i.e. flagged as anomalies)
    anomalies = numeration[score == -1]

    return anomalies
Example #36
def test_elliptic_envelope():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    assert_array_almost_equal(
        scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
    assert(sum(y_pred == -1) == sum(decisions < 0))
Example #37
def anomaly_detection(features, labels):
	# In this function, I try to use an anomaly detection method (multivariate Gaussian distribution) to identify POIs
	non_pois = features[labels==0]
	pois = features[labels==1]
	print "non poi size", non_pois.shape, pois.shape, features.shape

	## Spliting data to train, test and cross validation set for anomaly detection

	split1 = produce_spliting_array(non_pois.shape[0], .75 )
	X_train = non_pois[split1==1]

	X_intermediate = non_pois[split1==0]

	print "size intermediate", X_intermediate.shape

	split2 = produce_spliting_array(X_intermediate.shape[0], .5 )

	X_test = X_intermediate[split2==1]
	label_test = np.zeros((X_test.shape[0],), dtype=np.int) - 1

	X_cv = X_intermediate[split2==0]
	label_cv = np.zeros((X_cv.shape[0],), dtype=np.int) - 1

	split3 = produce_spliting_array(pois.shape[0], .5 )
	X_test = np.vstack((X_test, pois[split3==1]))
	label_test = np.hstack((label_test, np.ones(sum(split3), dtype=np.int)))

	X_cv = np.vstack((X_cv, pois[split3==0]))
	label_cv = np.hstack((label_cv, np.ones(sum(split3==0), dtype=np.int)))



	print "size X_train", X_train.shape
	print "size test data", X_test.shape, label_test.shape
	print "size cv data", X_cv.shape, label_cv.shape
	print "size splits", len(split1), len(split2), len(split3)

	from sklearn.covariance import EllipticEnvelope
	detector = EllipticEnvelope(contamination=.85)
	detector.fit(X_train)
	pred_cv = detector.predict(X_cv)
	print(pred_cv)
	print(label_cv)
	print(detector.score(X_cv, label_cv))
Example #38
File: outliers.py, Project: apodemus/tsa
def CovEstOD(data, classifier=None, N=1, **kw):
    if classifier is None:
        from sklearn.covariance import EllipticEnvelope
        contamination = N / data.shape[0]
        classifier = EllipticEnvelope(support_fraction=1., contamination=contamination)

    classifier.fit(data)
    clipix, = np.where( classifier.predict(data) == -1)
    
    wdb = kw.pop( 'with_decision_boundary', False )
    #TODO:  A better way of finding the decision boundary
    if wdb:
        # T (eigenvectors of the precision matrix) is the transformation matrix
        # between the principal axes and the data coordinates
        w, T = np.linalg.eigh(classifier.precision_)
        Ti = np.linalg.inv(T)
        # Diagonalising the precision matrix gives the quadratic representation of the
        # decision boundary (ellipse): z^T M z = threshold, where x - <x> = Tz
        # transforms to the principal axes
        M = np.dot(Ti, classifier.precision_) * T
        # semi-major & semi-minor axes
        a, b = np.sqrt(classifier.threshold / np.diag(M))
        # T is an (im)proper rotation matrix; if det(T) = -1 it is an improper
        # rotation (rotoinversion - one of the axes is inverted)
        theta = np.degrees(np.arccos(T[0, 0]))
        theta = np.linalg.det(T) * theta
        decision_boundary = Ellipse(classifier.location_, 2 * a, 2 * b, angle=theta, color='m')
        return clipix, decision_boundary
    else:
        return clipix
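A minimal usage sketch for CovEstOD on synthetic data (the array and N are assumptions); with the default arguments it returns the indices of the points flagged as outliers for a contamination of N / n_samples:

import numpy as np

rng = np.random.RandomState(0)
data = rng.randn(300, 2)
data[:3] += 10                 # three points that should be clipped

clipix = CovEstOD(data, N=3)   # contamination becomes N / data.shape[0]
print("clipped indices:", clipix)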
Example #39
# print(Y)

# Find outliers in the interaction rate data

# Step 1 - Convert the dataset into pandas series
util = Utility.SeriesUtility()
datasetFileName = "fans_change_taylor_swift.csv"
series = util.convertDatasetsToSeries(datasetFileName)

series = util.resampleSeriesSum(series, "D")

numberOfPoints = series.data.shape[0]
X = series.values.flatten().reshape(numberOfPoints,1)

# The detector construction is missing from the original snippet; an
# EllipticEnvelope with default settings is assumed here.
from sklearn.covariance import EllipticEnvelope
det = EllipticEnvelope()
det.fit(X)

predicted = det.predict(X)

for i in range(numberOfPoints):
    # predict expects a 2-D array, so reshape the single sample
    outputClass = det.predict(X[i].reshape(1, -1))[0]

    if outputClass == -1:
        print("Outlier detected...")







Example #40
def safe_float(val):
    try:
        return float(val)
    except ValueError:
        return np.nan

cytos = ['VEGF','IL-1beta','G-CSF','EGF','IL-10','HGF','FGF-basic',
'IFN-alpha','IL-6','IL-12','Rantes','Eotaxin','IL-13','IL-15',
'IL-17','MIP-1alpha','GM-CSF','MIP-1beta','MCP-1','IL-5',
'IFN-gamma','TNF-alpha','IL-RA','IL-2','IL-7',
'IP-10','IL-2R','MIG','IL-4','IL-8']

for col in cytos:
    data[col] = data[col].map(safe_float)
    try:
        env = EllipticEnvelope().fit(data[col].dropna().values.reshape(-1,1))
        mask = env.predict(data[col].values.reshape(-1,1))
        data.loc[mask == -1, col] = np.nan
    except Exception:
        # columns where the fit or prediction fails (e.g. NaNs present) are skipped
        pass
        
    
    #print mask
    #break
    

# <codecell>

pos = dict(zip('ABCDEFGH', range(8)))
def xpos(val):
    _, p = val.split('(')
    return pos[p.split(',')[1][0]]
Example #41
for i in range(0,len(SectionData)):
    if SectionData['newAngle'][i]==0:
        SectionData['angle'][i]=180
    else:
        SectionData['angle'][i]=SectionData['newAngle'][i]
    x=SectionData['newX'][i]
    y=SectionData['newY'][i]
    SectionData['Distance'][i]=math.sqrt((x*x)+(y*y))
        



#fit the outlier detector to the data and predict
X=SectionData[['angle','newX','newY']]
outlier_detector = EllipticEnvelope(contamination=0.14).fit(X.values)
outliers = outlier_detector.predict(X.values)

#finds outliers
for i in range(0,len(outliers)):
    SectionData['OUTLIER'][i]=outliers[i]
    if outliers[i]==-1:
        print('outlier at: ', SectionData['center'][i])
        
        
fig = plt.figure(figsize=(20,20))
#plotting the section map 
#outliers indicated on map with larger circles
for i in range(0,len(SectionData)):
    if SectionData['OUTLIER'][i]==-1:
        plt.scatter(SectionData['X'][i],SectionData['Y'][i],s=40)
        plt.annotate(str(int(round(SectionData['gradient_angle'][i],0))),(SectionData['X'][i],SectionData['Y'][i]+5))
Example #42
def search_outliers_EllipticEnvelope(X):
    clf = EllipticEnvelope(contamination=0.2)
    clf.fit(X)
    is_outliers = clf.predict(X)
    return is_outliers
                'Age',
                'HAART-Naive',
                'HAART-Non-Adherent',
                'HAART-Off',
                'HAART-On',
                'Hepatitis C status (HCV)']
for col in tranfer_cols:
    _, cyto_data[col] = cyto_data.align(pat_data[col], join='left', axis = 0)
cyto_data['HCV'] = cyto_data['Hepatitis C status (HCV)']

# <codecell>

for col in cytos:
    env = EllipticEnvelope(contamination=0.05)
    env.fit(cyto_data[col].dropna().values.reshape(-1, 1))
    mask = env.predict(cyto_data[col].values.reshape(-1,1))
    cyto_data.loc[mask == -1, col] = np.nan

# <codecell>


fig, axs = plt.subplots(11,3, figsize = (10,20))

for ax, col in zip(axs.flatten(), cytos):
    
    boxes = []
    mus = []
    stds = []
    for trop in trops:
        mask = cyto_data['Tropism'] == trop
        #mask &= cyto_data['Keep']