예제 #1
0
    def filter_remove_outlayers(self, flat, minimum_value=0):
        """
        Mask outliers in a 2-D array using scikit-learn's elliptic envelope.

        Every non-NaN, non-zero cell of ``flat`` becomes one sample
        ``(row, column, value)``. An ``EllipticEnvelope`` with
        ``contamination=0.1`` is fitted on those samples, and each cell whose
        decision-function score is below ``minimum_value`` is overwritten
        with NaN.

        :param flat: 2-D array able to hold NaN; it is modified in place.
        :param minimum_value: decision-function score below which a cell is
            treated as an outlier.
        :return: the same ``flat`` array, with outliers replaced by NaN.
        """
        from sklearn.covariance import EllipticEnvelope

        # Work on a NaN-free copy so that np.nonzero() skips NaN cells as
        # well as zeros (NaN itself counts as non-zero).
        candidates = flat.copy()
        candidates[np.isnan(flat)] = 0
        x, y = np.nonzero(candidates)

        z = flat[(x, y)]
        data = np.asarray([x, y, z]).T

        clf = EllipticEnvelope(contamination=.1)
        clf.fit(data)
        y_pred = clf.decision_function(data)

        out_inds = y_pred < minimum_value
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        flat[(x[out_inds], y[out_inds])] = np.nan
        return flat
예제 #2
0
    def envelop(self):
        """
        Fit an elliptic envelope on the genuine training data and score it.

        The model is fitted on ``self.gen_tr_data`` and used to label
        ``self.gen_ts_data`` and ``self.imp_ts_data``. The predictions are
        compared with the labels from ``get_gen_ts_labels()`` and
        ``get_imp_ts_labels()`` through a confusion matrix.

        :return: tuple ``(far, frr, pr)`` computed from the confusion matrix
            as fp / (fp + tn), fn / (fn + tp) and tp / (tp + fp).
        """
        # Apply PCA before using the envelope -- it is very sensitive to the
        # feature dimensions.
        detector = EllipticEnvelope(store_precision=True,
                                    assume_centered=False,
                                    support_fraction=0.25,
                                    contamination=0.1,
                                    random_state=True)

        # The model only ever sees the (reduced) genuine training samples.
        detector.fit(self.gen_tr_data)

        genuine_pred = detector.predict(self.gen_ts_data)
        impostor_pred = detector.predict(self.imp_ts_data)

        actual = np.concatenate(
            (self.get_gen_ts_labels(), self.get_imp_ts_labels()))
        predicted = np.concatenate((genuine_pred, impostor_pred))

        # ravel() flattens the 2x2 matrix in the order tn, fp, fn, tp.
        counts = confusion_matrix(actual, predicted).ravel()
        tn, fp, fn, tp = counts
        return fp / (fp + tn), fn / (fn + tp), tp / (tp + fp)
예제 #3
0
def predict_AB(train,test,result,num,sshop):
    """
    Assign ``sshop`` to the test rows that an elliptic envelope accepts.

    An ``EllipticEnvelope`` is fitted on the feature columns of ``train``.
    Test rows predicted as inliers (label 1) get ``shop_id = sshop`` and are
    appended to ``result``.

    :param train: DataFrame of training rows; the columns listed in
        ``filter_feature_train`` are dropped before fitting.
    :param test: DataFrame of test rows; must contain ``row_id`` and the
        columns listed in ``filter_feature_test``.
    :param result: DataFrame of earlier ``row_id`` / ``shop_id`` predictions.
    :param num: unused; kept for call compatibility.
    :param sshop: shop id written to every accepted test row.
    :return: ``result`` with the accepted rows appended and ``row_id`` cast
        to int.
    """
    filter_feature_train = ['user_id', 'time_stamp', 'mall_id', 'wifi_infos','wifi_id_signal','shop_id']
    filter_feature_test  = ['user_id', 'time_stamp', 'mall_id', 'wifi_infos','wifi_id_signal']
    train = train.drop(filter_feature_train,axis=1).fillna(-999)
    test = test.drop(filter_feature_test,axis=1).fillna(-999)
    # Align the test columns with the training columns, keeping row_id.
    test = test[list(train.columns)].join(test['row_id'])

    model = EllipticEnvelope()
    model.fit(train)
    test['label'] = model.predict(test.drop(['row_id'],axis=1))

    accepted = test['label'] == 1
    print('***************************',len(test))
    print(len(test[accepted]))

    print('***************************')
    # Take an explicit copy of the accepted rows and assign the column
    # directly. The chained form test['shop_id'][mask] = ... writes through
    # a temporary object and is not guaranteed to update ``test``.
    test = test[accepted].copy()
    test['shop_id'] = sshop
    r = test[['row_id', 'shop_id']]
    result = pd.concat([result, r])
    result['row_id'] = result['row_id'].astype('int')

    return result
예제 #4
0
    def calc(self,outliers_fraction):
        """
        Rank rows by Mahalanobis distance and pick the "bad" outliers.

        An ``EllipticEnvelope`` is fitted on the ``Tbandwidth``, ``Tlatency``
        and ``Tframerate`` columns of the data returned by
        ``self.get_data()``; each row's Mahalanobis distance is stored in a
        new ``MDist`` column. ``chi2_outliers`` then yields candidate outlier
        sets, which are narrowed to rows where ``Tbandwidth < Tlatency`` and
        whose frame rate is below mean + one standard deviation.

        :param outliers_fraction: contamination passed to the envelope.
        :return: tuple ``(outliers, dqs, ranked)`` where ``outliers`` is a
            list of frames (one per set returned by ``chi2_outliers``),
            ``dqs`` is passed through from ``get_data()`` and ``ranked`` is
            the full data; frames are sorted by ``MDist`` descending,
            de-duplicated, and stripped of the three metric columns.
        """
        metric_cols = ['Tbandwidth','Tlatency','Tframerate']

        def ranked(frame):
            # Most distant rows first, duplicates and raw metrics removed.
            return (frame.sort_values(by='MDist', ascending=False)
                         .drop_duplicates()
                         .drop(metric_cols, axis=1))

        data, dqs, _raw = self.get_data()
        clf = EllipticEnvelope(contamination=outliers_fraction)
        # Materialise the rows: on Python 3 zip() is a one-shot iterator,
        # which cannot be fitted and would be exhausted before mahalanobis().
        X = list(zip(data['Tbandwidth'],data['Tlatency'],data['Tframerate']))
        clf.fit(X)
        data['MDist']=clf.mahalanobis(X)

        # Picking "bad" outliers, not good ones.
        outliers = chi2_outliers(data, [.8,.9,.95], 3)
        outliers = [i[i['Tbandwidth']<i['Tlatency']] for i in outliers]
        # Making sure we don't remove aberrantly good frame rates.
        outliers = [k[k['Tframerate']<(k['Tframerate'].mean()+k['Tframerate'].std())] for k in outliers]
        outliers = [ranked(t) for t in outliers]

        return outliers, dqs, ranked(data)
    def clean_series(self, token, discard=5):

        """
        Drop the most outlying points of a token's ratio series.

        Args:
            token: Key into ``self.ratios``.
            discard (int): Percentile of the decision-function scores used
                as the cut-off; points scoring at or below it are dropped.

        Returns: OrderedDict{year: ratio} holding the retained points.
        """

        series = self.ratios[token]

        # The estimator expects 2-D input: one row per observation.
        X = np.array(list(series.values()))[:, np.newaxis]

        env = EllipticEnvelope()
        env.fit(X)
        scores = env.decision_function(X).ravel()

        cutoff = stats.scoreatpercentile(scores, discard)

        kept = OrderedDict()
        for (year, ratio), score in zip(series.items(), scores):
            if score > cutoff:
                kept[year] = ratio
        return kept
예제 #6
0
def ellipticCurve(dataset):
    """
    Fit an elliptic envelope on ``dataset`` and report its outliers.

    The contamination comes from the module-level ``outlierFraction``.

    Returns a tuple ``(scores, rows)``: the decision-function values for
    every sample and the list of row positions predicted as outliers (-1).
    """
    detector = EllipticEnvelope(contamination=outlierFraction)
    detector.fit(dataset)
    scores = detector.decision_function(dataset)
    labels = detector.predict(dataset)
    flagged = [row for row in range(len(labels)) if labels[row] == -1]
    return scores, flagged
예제 #7
0
def predict_EllipticEnvelope(X, fraction_outlier):
    """
    Fit an elliptic envelope on ``X`` and write two diagnostic plots.

    The envelope's inlier/outlier prediction is evaluated on the grid
    returned by ``get_meshgrid(X)`` and drawn with ``P.plot_contourf``; the
    per-sample predictions are drawn with ``P.plot_2D_features_multi_Y``.

    :param X: 2-column sample array (the grid points are built as pairs).
    :param fraction_outlier: contamination passed to ``EllipticEnvelope``.
    :return: None; the plots are written to
        ``4_pred_EllipticEnvelope_density.png`` and
        ``4_pred_EllipticEnvelope.png``.
    """
    xx, yy = get_meshgrid(X)
    x1, x2 = xx.min(), xx.max()
    y1, y2 = yy.min(), yy.max()
    # Plot margin: 10% of the x extent. (The original computed x2 - x2,
    # which is always zero, so the margin was never applied.)
    d = (x2 - x1) * 0.1

    A = EllipticEnvelope(contamination=fraction_outlier)
    A.fit(X)
    Y = A.predict(X)

    # Predict the whole grid in one call instead of one point at a time.
    grid_points = numpy.c_[xx.flatten(), yy.flatten()]
    grid_confidence = A.predict(grid_points).astype(int).reshape(xx.shape)

    P.plot_contourf(X[Y > 0],
                    X[Y <= 0],
                    xx,
                    yy,
                    grid_confidence,
                    x_range=[x1 - d, x2 + d],
                    y_range=[y1 - d, y2 + d],
                    filename_out='4_pred_EllipticEnvelope_density.png')
    P.plot_2D_features_multi_Y(X,
                               -Y,
                               x_range=[x1 - d, x2 + d],
                               y_range=[y1 - d, y2 + d],
                               filename_out='4_pred_EllipticEnvelope.png')
    return
예제 #8
0
def train_model():
    """
    Train and persist the scaler, the clustering model and the outlier model.

    Data from ``retrieve_data()`` is reduced to two features (vector distance
    and vibration max - min), min-max scaled, and used to fit a two-cluster
    KMeans and an EllipticEnvelope. The three fitted objects are stored with
    ``save_model``. The script exits when no data was retrieved.
    """
    data = json_normalize(retrieve_data())

    # Nothing to train on: stop the script early.
    if data.empty:
        print("No data retrieved, terminating script")
        sys.exit()

    vibration_span = data["vibration_max"] - data["vibration_min"]
    vector_dist = data["vector_distance"]

    # Two isolated features; rows with missing values are discarded.
    features = pd.DataFrame({
        'vector_distance': vector_dist,
        'vibration_distance': vibration_span,
    }).dropna()

    scaler = MinMaxScaler()
    training_data = scaler.fit_transform(features)

    # Seed the two centroids at the minimum and maximum corners of the
    # scaled feature space.
    corners = np.array([[0.0, 0.0], [1.0, 1.0]])
    clustering = KMeans(n_clusters=2, random_state=42, init=corners)
    clustering.fit(training_data)

    outlier = EllipticEnvelope(contamination=0.00075, random_state=42)
    outlier.fit(training_data)

    # Persist every fitted object, announcing each one first.
    to_save = (
        ("Saving scaler", scaler, "scaler.sav"),
        ("Saving clustering model", clustering, "kmeans_model.sav"),
        ("Saving outlier model", outlier, "outlier_model.sav"),
    )
    for message, model, filename in to_save:
        print(message)
        save_model(model, filename)
예제 #9
0
def robustcovariance(nparray, contamination):
    """
    Flag outliers with a robust covariance (elliptic envelope) fit.

    scikit-learn's covariance.EllipticEnvelope fits a robust covariance
    estimate to the data, i.e. an ellipse around the central data points,
    ignoring points outside the central mode.

    Returns an integer array with 1 for outliers and 0 for inliers.

    References:
    Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the minimum
    covariance determinant estimator". Technometrics 41(3), 212 (1999)
    """
    frame = pd.DataFrame(nparray)

    detector = EllipticEnvelope(contamination=contamination)
    detector.fit(frame)
    labels = detector.predict(frame)

    # predict() yields -1 for outliers and 1 for inliers; remap to 1 / 0
    # while keeping the integer dtype of the prediction array.
    return (labels == -1).astype(labels.dtype)
예제 #10
0
파일: oneclass.py 프로젝트: zfxu/tests
def show(samplepath):
    """
    Fit an elliptic envelope on one-class features and plot its boundary.

    Features are loaded with ``load_one_class_feature(samplepath)`` and
    normalised with ``norm_data``. The file ``<samplepath stem>_path.txt``
    supplies one path per sample, used when listing the samples predicted as
    abnormal. The zero level of the decision function is drawn over a
    scatter of the first two feature columns.

    :param samplepath: path of the feature file; its extension is stripped
        to locate the companion ``_path.txt`` file.
    """
    sname = os.path.splitext(samplepath)[0]
    print(sname)
    with open(sname+"_path.txt", 'r') as f:
        paths = [line.strip() for line in f]
    X = load_one_class_feature(samplepath)
    X = norm_data(X)
    clf = EllipticEnvelope(contamination=0.05)
    clf.fit(X)
    Y = clf.predict(X)
    DY = clf.decision_function(X)
    for k, label in enumerate(Y):
        if label < 0:  # abnormality is reported as a negative label
            print('{} , {} , {}'.format(k + 1, DY[k], paths[k]))
    err = np.sum(Y < 0)
    print('%d/%d'%(err, len(Y)))

    x1,y1 = np.meshgrid(np.linspace(-20,20,400), np.linspace(-20,20,400))
    z1 = clf.decision_function(np.c_[x1.ravel(), y1.ravel()])
    z1 = z1.reshape(x1.shape)
    # contour() takes ``colors``; the former ``color='r'`` keyword is not a
    # contour option, so the boundary was not drawn in red.
    boundary = plt.contour(x1, y1, z1, levels=[0], linewidths=2, colors='r')
    plt.scatter(X[:,0], X[:,1], color='black')

    # NOTE(review): ContourSet.collections looks deprecated in recent
    # Matplotlib releases -- confirm against the installed version.
    plt.legend([boundary.collections[0]], ['test'])
    plt.show()
예제 #11
0
class EllipticEnvelopeOutlierStream(OutlierStream):
    """Outlier stream whose model is an ``EllipticEnvelope`` (contamination 0.045)."""

    def __init__(self, data, data_stream):
        """Initialise the base stream, the envelope model and the PCA plot helper."""
        OutlierStream.__init__(self, data, data_stream)
        self.model = EllipticEnvelope(contamination=0.045)
        self.DEBUG = False
        self.pca_plot = StreamPCA()

    def train_model(self, data):
        """Fit the envelope on ``data``."""
        self.model.fit(data)

    def update_model(self, data):
        """Do nothing and return None; no update step is implemented."""
        return None

    def predict_model(self, data):
        """Return the envelope's predictions for ``data``."""
        return self.model.predict(data)

    def summary(self, predictions, data_stream):
        """Print inlier/outlier counts and scatter-plot the predictions."""
        inlier_count = sum(1 for label in predictions if label > 0)
        outlier_count = sum(1 for label in predictions if label < 0)
        print("Non outliers: {}".format(inlier_count))
        print("Outliers: {}".format(outlier_count))

        import numpy as np
        total = len(predictions)
        positions = np.linspace(0, total, total)
        plt.scatter(positions, predictions)
        plt.show()
예제 #12
0
파일: helpers.py 프로젝트: ChengF-Lab/scIVA
def outliers_detection(expr):
    """
    Predict outliers of ``expr`` in its first two principal components.

    ``expr`` is reduced to two dimensions with PCA; an ``EllipticEnvelope``
    with default settings is fitted on the reduced data and its predictions
    for that same data are returned.
    """
    reduced = PCA(n_components=2).fit_transform(expr)
    detector = EllipticEnvelope()
    detector.fit(reduced)
    return detector.predict(reduced)
def EllipticEnvelopeDetection(clm_select, all_tss, df_data, plot=False):
    """
    Run an elliptic envelope on every series of every selected column.

    :param clm_select: sequence of column names to process.
    :param all_tss: mapping ``column -> {kind: samples}``; each ``samples``
        entry is converted to an array and fitted separately.
    :param df_data: DataFrame used only for plotting (``val`` on the x axis,
        the column on the y axis).
    :param plot: when True, draw one scatter subplot per column coloured by
        the predictions and show the figure.
    :return: dict ``column -> list`` of predictions, concatenated over the
        column's kinds in iteration order.
    """
    # NOTE(review): scikit-learn documents the valid contamination range as
    # (0, 0.5]; 0.6 is kept because the intended value is not known from
    # here -- confirm and lower it.
    outliers_fraction = 0.6
    if plot:
        plt.figure()
    ee_pred = {}
    for i, col in enumerate(clm_select):
        ee_pred[col] = []
        for kind in all_tss[col].keys():
            X = np.array(all_tss[col][kind])
            # A fresh envelope per series, so fits do not share state.
            clf = EllipticEnvelope(contamination=outliers_fraction)
            clf.fit(X)
            ee_pred[col].extend(clf.predict(X))
        if plot:
            subplot = plt.subplot(len(clm_select), 1, i + 1)
            subplot.scatter(df_data['val'], df_data[col], c=ee_pred[col])
            subplot.set_title('Dimension ' + col)
    if plot:
        plt.suptitle('Outlier detection with one class EllipticEnvelope')
        plt.show()
    return ee_pred
예제 #14
0
def outlier_removal2(features, samples, cv_predict):
    """
    Drop the rows whose (prediction, sample) pair is most outlying.

    ``cv_predict`` and ``samples`` are stacked column-wise and scored with an
    ``EllipticEnvelope``; rows scoring at or below the 10th percentile of the
    decision function are removed. Shapes are printed along the way.

    :param features: array indexed by the same rows as ``samples``.
    :param samples: array stacked with ``cv_predict`` for the fit.
    :param cv_predict: array stacked with ``samples`` for the fit.
    :return: tuple ``(features, samples)`` restricted to the retained rows.
    """
    outliers_fraction = 0.1

    # Single-argument print() calls behave the same on Python 2 and 3.
    print(cv_predict.shape)
    print(samples.shape)
    test = np.column_stack((cv_predict, samples))
    clf = EllipticEnvelope(contamination=outliers_fraction)
    clf.fit(test)
    y_pred = clf.decision_function(test).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)

    y_pred_new = y_pred > threshold
    kept_features = features[y_pred_new]
    kept_samples = samples[y_pred_new]
    print(y_pred_new)
    print(samples.shape)
    print(kept_samples.shape)
    print(features.shape)
    print(kept_features.shape)

    return kept_features, kept_samples
예제 #15
0
    def cov(self, X_train, contamination=None, random_state=None):
        """
        Train Elliptic Envelope model from scikit-learn

        Parameters
        __________
        X_train: scaled training data
        contamination: proportion of anomalies in the data; when None,
            EllipticEnvelope's documented default of 0.1 is used
        random_state: random number seed

        Returns
        ________
        Tuple (anomaly scores, labels): the negated decision function passed
        through self.min_max_scaler, and labels with 1 for outliers and 0
        for inliers
        """
        # The estimator needs a numeric contamination; passing None through
        # makes fit() fail, so fall back to the estimator's default.
        if contamination is None:
            contamination = 0.1

        model = EllipticEnvelope(contamination=contamination,
                                 random_state=random_state)
        model.fit(X_train)

        # predict() gives -1 for outliers and 1 for inliers; map to 1 / 0.
        # Using the constant 1 (not labels.max()) keeps the mapping right
        # even if every sample were predicted as an outlier.
        labels = (1 - model.predict(X_train)) // 2

        # Negate so that larger values mean "more anomalous".
        cov_anomaly_scores = model.decision_function(X_train) * -1
        cov_anomaly_scores = self.min_max_scaler(cov_anomaly_scores)
        return cov_anomaly_scores, labels
예제 #16
0
    def DetectOutliersUsingEnvelope(self):
        """
        Detect outliers in the (x, y) columns with an elliptic envelope.

        When ``self.__drop_outliers`` is set, the rows predicted as outliers
        are removed from ``self.__df`` and the filtered frame is returned.
        Otherwise the decision function is evaluated on a 500 x 500 grid
        spanning the data range widened by ``self.__factor`` on every side.

        :return: the filtered DataFrame, or the tuple ``(xx, yy, Z)`` with
            the grid coordinates and the decision-function values.
        """
        data = self.__df[[self.__x, self.__y]].values
        clf = EllipticEnvelope()
        clf.fit(data)

        if self.__drop_outliers:  # Let's drop outliers from dataset!
            # Reuse the model fitted above (fit_predict would fit a second
            # time). The boolean mask is positional, so it is correct for
            # any DataFrame index, not only the default 0..n-1 one.
            pred = clf.predict(data)  # Outliers = -1, inliers = 1
            self.__df = self.__df[pred != -1]
            return self.__df

        x_values = self.__df[self.__x].values
        y_values = self.__df[self.__y].values
        x_min_value = min(x_values) - self.__factor
        x_max_value = max(x_values) + self.__factor
        y_min_value = min(y_values) - self.__factor
        y_max_value = max(y_values) + self.__factor

        xx, yy = np.meshgrid(np.linspace(x_min_value, x_max_value, 500),
                             np.linspace(y_min_value, y_max_value, 500))
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        return xx, yy, Z
예제 #17
0
def ellipses_indices_of_outliers(X, contamination=0.1):
    '''
    Locate outliers with the elliptic envelope method.

    Columns with 30 or fewer distinct values are treated as categorical and
    ignored. If fewer samples remain than the square of the number of kept
    columns, a warning is printed and None is returned.

    Input: a 2-D array of all variables to detect outliers for
    Output: the result of np.where() on the outlier mask, i.e. a tuple
            holding the array of outlier row indices (or None, see above)
    '''
    from sklearn.covariance import EllipticEnvelope

    # Work on a copy so the caller's array is never touched.
    X = X.copy()

    # Keep only the columns with enough distinct values to be numeric.
    non_categorical = [
        feature for feature in range(X.shape[1])
        if len(np.unique(X[:, feature])) > 30
    ]
    X = X[:, non_categorical]

    # Require at least (number of features) ** 2 samples.
    n_samples, n_features = X.shape[0], X.shape[1]
    if n_samples < n_features ** 2.:
        print('Will not perform well. Reduce the dimensionality and try again.')
        return

    detector = EllipticEnvelope(contamination=contamination)
    detector.fit(X)

    # predict() marks outliers with -1.
    labels = detector.predict(X)
    return np.where(labels == -1)
예제 #18
0
 def detect_anomalies(self, data, **params):
     """
     Fit an ``EllipticEnvelope`` on ``data`` and return its anomaly scores.

     ``params`` are applied to the estimator with ``set_params``. The scores
     are the decision-function values: the lower, the more abnormal.
     """
     detector = EllipticEnvelope()
     detector.set_params(**params)
     detector.fit(data)
     # TODO: decision function has other range than that of IsolationForest
     scores = detector.decision_function(data)
     return scores
예제 #19
0
class EllipticEnvelope_Classifier:
  """Wrapper around ``EllipticEnvelope`` with a fixed contamination of 0.1."""
  def __init__(self, save_path):
    """Create ``<save_path>/EllipticEnvelope`` if missing and build the model."""
    # Output directory for this classifier.
    self.save_path = os.path.join(save_path,'EllipticEnvelope')
    if not os.path.exists(self.save_path):
      os.makedirs(self.save_path)
    self.contamination = 0.1

    self.classifier = EllipticEnvelope(contamination=self.contamination)


  def fit_model(self, train_data_matrix, test_data_matrix, test_true_label):
    """Train the model and print its evaluation on the test data.

    Both matrices are converted with ``.toarray()`` before use. The
    predictions for the test data are passed to ``sklearn_evaluation``
    together with ``test_true_label``; the accuracy and classification
    report it returns are printed.
    """
    train_data_matrix = train_data_matrix.toarray()
    test_data_matrix = test_data_matrix.toarray()
    self.classifier.fit(train_data_matrix)
    y_pred_label = self.classifier.predict(test_data_matrix)
    accuracy, report, _matrix = sklearn_evaluation(test_true_label, y_pred_label)
    print('Accuracy: {} \nClassification Report:\n{}\n'.format(accuracy, report))
    sys.stdout.flush()


  def test_model(self, test_data):
    """Score ``test_data`` with the fitted model.

    Predicted labels look like [1, 1, -1, ...], with -1 marking outliers.

    Returns a tuple ``(scores, labels, n_outliers)``: the decision-function
    values, the predicted labels, and the number of -1 labels.
    """
    # The original body lacked ``self`` and read undefined names
    # (train_data, y_pred_test), so it could never run.
    scores_pred = self.classifier.decision_function(test_data)
    y_pred = self.classifier.predict(test_data)
    n_outliers = y_pred[y_pred == -1].size
    return scores_pred, y_pred, n_outliers
예제 #20
0
def outlier(TRAIN, contam):
    """
    Flag outliers in ``TRAIN`` and plot them on two principal axes.

    Every column of ``TRAIN`` is centred on its median (in place, so the
    caller's array changes), an ``EllipticEnvelope`` is fitted with
    ``assume_centered=True``, and the samples are projected onto the two
    eigenvectors of the corrected covariance with the largest eigenvalues
    for plotting.

    :param TRAIN: 2-D sample array; modified in place.
    :param contam: contamination passed to ``EllipticEnvelope``.
    :return: integer array with 1 for inliers and 0 for outliers.
    """
    # Centre each column on its median. Slice assignment keeps the update
    # in place, exactly like the former column-by-column loop.
    TRAIN[:] = TRAIN - np.median(TRAIN, axis=0)

    # model creation
    clf = EllipticEnvelope(support_fraction=1.,
                           contamination=contam,
                           assume_centered=True)
    clf.fit(TRAIN)
    C = clf.correct_covariance(TRAIN)
    pred = clf.predict(TRAIN)

    # Eigen decomposition. eigh() is meant for symmetric matrices such as a
    # covariance: eigenvalues come back in ascending order and eigenvectors
    # are the COLUMNS of U. The original took the first two ROWS of U, which
    # are not eigenvectors; take the two leading columns instead.
    _, U = LA.eigh(C)
    P = U[:, ::-1][:, :2]
    X_hat = np.dot(TRAIN, P)

    # Map predictions from {-1, 1} to {0, 1}.
    pred = (pred + 1) // 2
    plotting(X_hat, pred)

    return pred
예제 #21
0
class EllipticEnvelopeFilter(BaseEstimator):
    """Pipeline step that removes the samples an ``EllipticEnvelope`` rejects."""

    def __init__(self,
                 assume_centered=False,
                 support_fraction=None,
                 contamination=0.1,
                 random_state=None):
        """Store the parameters that are forwarded to ``EllipticEnvelope``."""
        self.random_state = random_state
        self.contamination = contamination
        self.support_fraction = support_fraction
        self.assume_centered = assume_centered

    def fit_pipe(self, X, y=None):
        """Fit a new envelope on ``X`` and return the filtered ``(X, y)``."""
        envelope_params = self.get_params()
        self.elliptic_envelope_ = EllipticEnvelope(**envelope_params)
        self.elliptic_envelope_.fit(X)
        return self.transform_pipe(X, y)

    def transform_pipe(self, X, y):
        """Return ``(X, y)`` restricted to predicted inliers; ``y`` may be None."""
        # XXX: sample_props not taken care of
        keep = self.elliptic_envelope_.predict(X) == 1
        if y is not None:
            return X[keep], y[keep]
        return X[keep], None

    def transform(self, X, y=None):
        """Return ``X`` unchanged."""
        return X
예제 #22
0
def compare_drift(X_src, y_src, X_new, y_new):
    """
    Compare a new data set with a source data set.

    Two elliptic envelopes (contamination 0.01, random_state 0) are fitted
    on the source features and the source targets; the share of new samples
    each one predicts as outliers is reported next to the Wasserstein
    distance between the flattened source and new values.

    :return: dict with the keys ``X_wasserstein_distance``,
        ``y_wasserstein_distance``, ``X_outlier_percentage`` and
        ``y_outlier_percentage``.
    """
    def outlier_share(labels):
        # Fraction of predictions equal to -1 (outlier).
        return len(labels[labels == -1]) / len(labels)

    target_detector = EllipticEnvelope(random_state=0, contamination=0.01)
    feature_detector = EllipticEnvelope(random_state=0, contamination=0.01)

    feature_detector.fit(X_src)
    # Targets are 1-D; the estimator needs a single-column 2-D array.
    target_detector.fit(y_src.reshape(y_src.size, 1))

    feature_labels = feature_detector.predict(X_new)
    target_labels = target_detector.predict(y_new.reshape(-1, 1))

    results = {}
    results['X_wasserstein_distance'] = wasserstein_distance(
        X_src.values.flatten(), X_new.values.flatten())
    results['y_wasserstein_distance'] = wasserstein_distance(
        y_src.flatten(), y_new.flatten())
    results['X_outlier_percentage'] = outlier_share(feature_labels)
    results['y_outlier_percentage'] = outlier_share(target_labels)
    return results
예제 #23
0
class EllipticDetection(BaseEstimator, TransformerMixin):
    """Transformer that blanks out envelope outliers and imputes them.

    With ``contamination == 0`` the transformer is a pass-through: ``fit``
    does nothing and ``transform`` returns a deep copy of its input.
    """

    def __init__(self, contamination=0):
        self.contamination = contamination

    def fit(self, X, y=None):
        """Fit the envelope on ``X`` (and ``y`` when given); return self."""
        if self.contamination != 0:
            self.ell = EllipticEnvelope(contamination=self.contamination)
            if y is not None:
                self.ell.fit(X, y)
            else:
                self.ell.fit(X)
        return self

    def transform(self, X_):
        """Return a copy of ``X_`` with outlier rows replaced by imputed values."""
        X = deepcopy(X_)
        if self.contamination == 0:
            return X

        # Blank the rows predicted as outliers, then let SimpleImputer
        # (default settings) fill them in.
        outlier_rows = self.ell.predict(X) == -1
        X[outlier_rows, :] = np.nan
        return SimpleImputer().fit_transform(X)
예제 #24
0
class Baseline(ModelBase):
    """Baseline detector selected by a short model name.

    Recognised names: ``'svm'``, ``'if'``, ``'lof'``, ``'gm'`` and ``'ee'``.
    For any other name no model is created in the constructor.
    """

    def __init__(self, model_name, packet_length=1500, seq_length=1, epochs=1):
        super().__init__(packet_length, seq_length, epochs)
        self.model_name = model_name

        # Factories are lazy so that only the requested model is built.
        factories = {
            'svm': lambda: OneClassSVM(kernel='rbf', nu=0.05),
            'if': lambda: IsolationForest(contamination=0.05,
                                          max_features=15,
                                          random_state=0),
            'lof': lambda: LocalOutlierFactor(contamination=0.05,
                                              novelty=True),
            'gm': lambda: GaussianMixture(random_state=0),
            'ee': lambda: EllipticEnvelope(contamination=0.05,
                                           random_state=0),
        }
        factory = factories.get(model_name)
        if factory is not None:
            self.model = factory()

    def _model_path(self, name):
        """Return the pickle path for ``name`` and this model's name."""
        return name + '_{}.pkl'.format(self.model_name)

    def fit(self, X):
        """Fit the underlying model on ``X``."""
        self.model.fit(X)

    def predict(self, X):
        """Return ``(scores, labels)`` from ``score_samples`` and ``predict``."""
        labels = self.model.predict(X)
        scores = self.model.score_samples(X)
        return scores, labels

    def save(self, name):
        """Dump the model with joblib to ``<name>_<model_name>.pkl``."""
        joblib.dump(self.model, self._model_path(name))

    def load(self, name):
        """Load the model with joblib from ``<name>_<model_name>.pkl``."""
        self.model = joblib.load(self._model_path(name))

    def exist(self, name):
        """Return whether ``<name>_<model_name>.pkl`` exists."""
        return os.path.exists(self._model_path(name))
def filter_outliers_in_features(X):
    """
    Return the rows of ``X`` that an elliptic envelope predicts as inliers.

    The envelope uses ``support_fraction=1`` and ``contamination=0.2`` and
    is fitted on ``X`` itself.
    """
    detector = EllipticEnvelope(support_fraction=1, contamination=0.2)
    detector.fit(X)
    inlier_mask = detector.predict(X) == 1
    return X[inlier_mask]
예제 #26
0
def anomaly_detection(X):
    """
    Fit an elliptic envelope on ``X`` and plot its outliers.

    Samples whose decision-function value lies below the 1.9th percentile
    are treated as outliers. The left panel shows the distribution of the
    decision values with the threshold; the right panel shows the samples
    (first two columns of ``X``), the outliers in red and the threshold
    contour. The threshold is also printed.
    """
    detector = EllipticEnvelope()
    detector.fit(X)
    scores = detector.decision_function(X).ravel()

    percentile = 1.9
    threshold = np.percentile(scores, percentile)
    print(threshold)
    is_outlier = scores < threshold

    # Decision function on a fixed grid, used for the threshold contour.
    grid_x, grid_y = np.meshgrid(np.linspace(0, 25, 200),
                                 np.linspace(0, 30, 200))
    grid_scores = detector.decision_function(
        np.c_[grid_x.ravel(), grid_y.ravel()])
    grid_scores = grid_scores.reshape(grid_x.shape)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: score distribution, outlier rug and threshold line.
    sns.distplot(scores, rug=True, ax=ax1)
    sns.distplot(scores[is_outlier], rug=True, hist=False, kde=False,
                 norm_hist=True, color='r', ax=ax1)
    threshold_label = 'Threshold for {} percentile = {}'.format(
        percentile, np.round(threshold, 2))
    ax1.vlines(threshold, 0, 0.9, colors='r', linestyles='dotted',
               label=threshold_label)
    ax1.set_title('Distribution of Elliptic Envelope decision function values')
    ax1.legend(loc='best')

    # Right panel: all samples, outliers on top, threshold contour.
    flagged = X[is_outlier]
    ax2.scatter(X[:, 0], X[:, 1], c='b', marker='x')
    ax2.scatter(flagged[:, 0], flagged[:, 1], c='r', marker='x',
                linewidths=2)
    ax2.contour(grid_x, grid_y, grid_scores, levels=[threshold],
                linewidths=2, colors='red', linestyles='dotted')
    ax2.set_title("Outlier detection")
    ax2.set_xlabel('Latency (ms)')
    ax2.set_ylabel('Throughput (mb/s)')

    plt.show()
def calcu2(mppt):
    """
    Return column 108 of the rows an elliptic envelope flags as outliers.

    The envelope (contamination 0.01) is fitted on the first 106 columns of
    ``mppt``; rows predicted as -1 are selected and their column at
    position 108 is returned.
    """
    detector = EllipticEnvelope(contamination=0.01)
    features = mppt.iloc[:, 0:106]
    detector.fit(features)
    is_outlier = detector.predict(features) == -1
    return mppt[is_outlier].iloc[:, 108]
예제 #28
0
def _flag_price_anomalies(prices, outliers_fraction):
    """Fit an envelope on one price series; return it as a frame with
    ``deviation`` (decision function) and ``anomaly`` (prediction) columns."""
    envelope = EllipticEnvelope(contamination=outliers_fraction)
    X_train = prices.values.reshape(-1, 1)
    envelope.fit(X_train)
    flagged = pd.DataFrame(prices)
    flagged['deviation'] = envelope.decision_function(X_train)
    flagged['anomaly'] = envelope.predict(X_train)
    return flagged


def _split_prices_by_flag(frame, flag_column):
    """Return ``[normal, anomalous]`` ``price_usd`` series (flag 1 / -1)."""
    normal = frame.loc[frame[flag_column] == 1, 'price_usd']
    anomalous = frame.loc[frame[flag_column] == -1, 'price_usd']
    return [normal, anomalous]


def view_anomalies(df):
    """
    Detect price anomalies per search class and plot them.

    ``price_usd`` is split by ``srch_saturday_night_bool`` (0 / 1); an
    elliptic envelope with contamination 0.01 is fitted on each class
    separately. Four figures are produced: the raw price histograms per
    class, the stacked normal/anomaly histograms per class, the price over
    time with anomalies highlighted, and the overall stacked histogram.

    :param df: DataFrame with ``srch_saturday_night_bool``, ``price_usd``
        and ``date_time`` columns. It is modified: its index is replaced by
        the index of ``reindex_data(df)`` and an ``anomaly5`` column is
        added.
    """
    data = reindex_data(df)
    df.index = data.index

    df_class0 = df.loc[df['srch_saturday_night_bool'] == 0, 'price_usd']
    df_class1 = df.loc[df['srch_saturday_night_bool'] == 1, 'price_usd']

    fig, axs = plt.subplots(1,2)
    df_class0.hist(ax=axs[0], bins=30)
    df_class1.hist(ax=axs[1], bins=30)

    # One envelope per class, so each class gets its own price model.
    outliers_fraction = 0.01
    df_class0 = _flag_price_anomalies(df_class0, outliers_fraction)
    df_class1 = _flag_price_anomalies(df_class1, outliers_fraction)

    # plot the price repartition by categories with anomalies
    fig, axs = plt.subplots(1,2)
    axs[0].hist(_split_prices_by_flag(df_class0, 'anomaly'),
                bins=32, stacked=True, color=['blue', 'red'])
    axs[1].hist(_split_prices_by_flag(df_class1, 'anomaly'),
                bins=32, stacked=True, color=['blue', 'red'])
    axs[0].set_title("Search Non Saturday Night")
    axs[1].set_title("Search Saturday Night")

    # Bring the per-class flags back onto the full frame (aligned on index).
    df_class = pd.concat([df_class0, df_class1])
    df['anomaly5'] = df_class['anomaly']
    fig, ax = plt.subplots(figsize=(10, 6))

    df = df.sort_values('date_time')
    df['date_time_int'] = pd.to_datetime(df['date_time']).astype('int64')
    # A list (not a tuple) is the unambiguous way to select several columns.
    a = df.loc[df['anomaly5'] == -1, ['date_time_int', 'price_usd']] #anomaly
    ax.plot(df['date_time_int'], df['price_usd'], color='blue', label='Normal')
    ax.scatter(a['date_time_int'],a['price_usd'], color='red', label='Anomaly')
    plt.legend()

    fig, axs = plt.subplots(figsize=(10, 6))
    axs.hist(_split_prices_by_flag(df, 'anomaly5'),
             bins=32, stacked=True, color=['blue', 'red'])
    plt.show()
예제 #29
0
def train(featuremethods,
          trainingdata,
          classification="anomaly_detection",
          gamma=0,
          nu=0.5,
          features=None):
    """
    Standardise the features and train (or score with) the chosen model.

    :param featuremethods: list of callables; each is applied to every item
        of ``trainingdata`` to produce one feature.
    :param trainingdata: list of training items. (Original note: a list of
        [listoflines, value] pairs; for anomaly detection the value is
        always 0.)
    :param classification: ``"anomaly_detection"`` trains a one-class SVM,
        ``"elliptic_envelope"`` fits an elliptic envelope, anything else
        trains an SVC with the label 1 for every item.
    :param gamma: gamma of the one-class SVM.
    :param nu: nu of the one-class SVM.
    :param features: optional precomputed feature rows; when empty or None
        they are computed from ``trainingdata``.
    :return: ``[model, means, stdevs]``; for ``"elliptic_envelope"`` the
        first element is instead the decision-function values of the
        standardised features.
    """
    if not features:
        features = []
        for bunchoflines in trainingdata:
            features.append([i(bunchoflines) for i in featuremethods])

    # Per-feature mean and standard deviation, used for standardisation
    # and returned so callers can apply the same scaling later.
    columns = [
        np.array([row[j] for row in features])
        for j in range(0, len(featuremethods))
    ]
    means = [column.mean() for column in columns]
    stdevs = [column.std() for column in columns]

    # Standardise a deep copy so the caller's feature rows stay untouched.
    tempfeatures = copy.deepcopy(features)
    for bunchoflines in tempfeatures:
        for feature in range(0, len(bunchoflines)):
            bunchoflines[feature] -= means[feature]
            bunchoflines[feature] /= stdevs[feature]

    if classification == "anomaly_detection":
        svr = svm.OneClassSVM(kernel='rbf',
                              degree=3,
                              coef0=0.0,
                              tol=0.001,
                              shrinking=True,
                              cache_size=200,
                              verbose=False,
                              max_iter=-1,
                              random_state=None,
                              gamma=gamma,
                              nu=nu)
        svr.fit(tempfeatures)
    elif classification == "elliptic_envelope":
        svr = EllipticEnvelope()
        # Fit before scoring: decision_function() on an unfitted estimator
        # raises, which is what the original code ran into.
        svr.fit(tempfeatures)
        return [svr.decision_function(tempfeatures), means, stdevs]
    else:
        svr = svm.SVC(cache_size=200,
                      class_weight=None,
                      coef0=0.0,
                      kernel="rbf",
                      max_iter=-1,
                      probability=False,
                      random_state=None,
                      shrinking=True,
                      tol=0.001,
                      verbose=False)
        svr.fit(tempfeatures, [1 for i in trainingdata])
    return [svr, means, stdevs]
예제 #30
0
 def elliptic_envelope_out(self, contamination):
     """
     Return the index labels of the training rows flagged as outliers.

     An ``EllipticEnvelope`` is fitted on the ``self.numerical_var`` columns
     of ``self.training``. The step name is appended to ``self.report``.
     """
     self.report.append('elliptic_envelope_out')
     ds = self.training[self.numerical_var]
     detector = EllipticEnvelope(contamination=contamination)
     detector.fit(ds)
     # Attach the frame's index so the -1 labels map back to row labels.
     labels = pd.Series(detector.predict(ds))
     labels.index = ds.index
     flagged = labels[labels == -1]
     return flagged.index
예제 #31
0
def plot(X, y):
    """
    Scatter the t-SNE projection of the inlier samples.

    ``X`` is projected with t-SNE; an ``EllipticEnvelope``
    (``assume_centered=True``, contamination 0.25) fitted on ``X`` decides
    which samples are inliers, and only those are passed to ``scatter``.

    :param X: sample array.
    :param y: per-sample labels, indexable with the same row positions.
    """
    proj = TSNE().fit_transform(X)
    e = EllipticEnvelope(assume_centered=True, contamination=.25) # Outlier detection
    e.fit(X)

    good = np.where(e.predict(X) == 1)
    # Filter the projection together with the labels. The original filtered
    # only X and y, so scatter() received mismatched lengths.
    scatter(proj[good], y[good])
예제 #32
0
def clean_reviewer_average_radius_with_EllipticEnvelope(reviews):
    """
    Keep only the reviewers whose centre is not an envelope outlier.

    Centres and user ids come from ``get_all_centers_as_array(reviews)``;
    an ``EllipticEnvelope`` with contamination 0.005 is fitted on the
    centres. Returns a dict ``user_id -> reviews[user_id]`` for every user
    whose centre is not predicted as -1.
    """
    detector = EllipticEnvelope(contamination=0.005)
    centers, user_ids = get_all_centers_as_array(reviews)
    detector.fit(centers)

    inlier_positions = np.where(detector.predict(centers) != -1)
    kept_ids = np.array(user_ids)[inlier_positions]
    return {user_id: reviews[user_id] for user_id in kept_ids}
예제 #33
0
def labelValidSkeletons_old(skeletons_file,
                            good_skel_row,
                            fit_contamination=0.05):
    """
    Label valid skeletons by fitting an elliptic envelope on selected rows.

    ``/trajectories_data`` is read from ``skeletons_file``; its
    ``is_good_skel`` column starts as a copy of ``has_skeleton``. When
    ``good_skel_row`` is non-empty, an ``EllipticEnvelope`` is fitted on the
    feature nodes of those rows, and every row is then relabelled 1 when its
    decision-function value is positive and 0 otherwise. The table is
    written back with ``saveModifiedTrajData``.

    :param skeletons_file: path of the HDF5 skeletons file.
    :param good_skel_row: array of row indices used to fit the envelope
        (presumably rows already known to hold good skeletons -- confirm
        against the caller).
    :param fit_contamination: contamination passed to ``EllipticEnvelope``.
    """
    base_name = getBaseName(skeletons_file)
    progress_timer = timeCounterStr('')

    print_flush(base_name + ' Filter Skeletons: Starting...')
    with pd.HDFStore(skeletons_file, 'r') as table_fid:
        trajectories_data = table_fid['/trajectories_data']

    trajectories_data['is_good_skel'] = trajectories_data['has_skeleton']

    # Nothing to fit if there are no valid skeletons left.
    if good_skel_row.size > 0:
        print_flush(
            base_name +
            ' Filter Skeletons: Reading features for outlier identification.')

        # Features used by the outlier classifier.
        nodes4fit = ['/skeleton_length', '/contour_area'] + \
        ['/' + name_width_fun(part) for part in worm_partitions]

        X4fit = nodes2Array(skeletons_file, nodes4fit, good_skel_row)
        assert not np.any(np.isnan(X4fit))

        print_flush(
            base_name +
            ' Filter Skeletons: Fitting elliptic envelope. Total time:' +
            progress_timer.getTimeStr())
        #TODO there is a problem with singular covariance matrices that still needs to be solved
        clf = EllipticEnvelope(contamination=fit_contamination)
        clf.fit(X4fit)

        print_flush(base_name +
                    ' Filter Skeletons: Calculating outliers. Total time:' +
                    progress_timer.getTimeStr())
        # Score every row (not only the rows used for fitting).
        X = nodes2Array(skeletons_file, nodes4fit)
        y_pred = clf.decision_function(
            X).ravel()  #less than zero would be an outlier

        print_flush(
            base_name +
            ' Filter Skeletons: Labeling valid skeletons. Total time:' +
            progress_timer.getTimeStr())
        # Builtin int replaces np.int, an alias that NumPy 1.24 removed.
        trajectories_data['is_good_skel'] = (y_pred > 0).astype(int)

    #Save the new is_good_skel column
    saveModifiedTrajData(skeletons_file, trajectories_data)

    print_flush(base_name + ' Filter Skeletons: Finished. Total time:' +
                progress_timer.getTimeStr())
예제 #34
0
def filterOut(x):
    """Return a boolean mask marking the samples of *x* kept as inliers.

    An elliptic envelope is fitted to *x* and every sample is scored with
    its decision function.  Samples whose score lies above the 5th
    percentile of all scores are flagged ``True``; the rest are ``False``.
    """
    samples = np.array(x)
    outliers_fraction = 0.05

    envelope = EllipticEnvelope(contamination=outliers_fraction)
    envelope.fit(samples)

    scores = envelope.decision_function(samples).ravel()
    cutoff = stats.scoreatpercentile(scores, 100 * outliers_fraction)
    return scores > cutoff
def test_outlier_detection():
    """Check EllipticEnvelope's raw decision values and its score.

    The raw decision function must equal the Mahalanobis distance of the
    centred samples, and the score against all-ones labels must equal the
    fraction of samples not predicted as -1.
    """
    rng = np.random.RandomState(0)
    samples = rng.randn(100, 10)

    envelope = EllipticEnvelope(contamination=0.1)
    envelope.fit(samples)
    labels = envelope.predict(samples)

    raw_decision = envelope.decision_function(samples, raw_mahalanobis=True)
    centred_distance = envelope.mahalanobis(samples - envelope.location_)
    assert_array_almost_equal(raw_decision, centred_distance)

    n_flagged = labels[labels == -1].size
    assert_almost_equal(envelope.score(samples, np.ones(100)),
                        (100 - n_flagged) / 100.0)
예제 #36
0
    def module4(self):
        '''
            Fit an elliptic envelope to colour-opponency features of a fixed
            image and plot the resulting decision boundary.

            (Original note: detect outliers in the input one-dimensional
            array using anomaly detection.)
        '''

        # get data: load the image and split it into its three channels
        img = cv2.imread('../saliency_detection/image/pearl.png')
        b,g,r = cv2.split(img) 
        # each channel minus the mean of the other two:
        # B = b - (r+g)/2, G = g - (r+b)/2, R = r - (g+b)/2
        B,G,R = map(lambda x,y,z: x*1. - (y*1. + z*1.)/2., [b,g,r],[r,r,g],[g,b,b])

        Y = (r*1. + g*1.)/2. - np.abs(r*1. - g*1.)/2. - b*1.
        # clamp negative values to 0
        R[R<0] = 0
        G[G<0] = 0
        B[B<0] = 0
        Y[Y<0] = 0
        # absolute differences |R-G| and |B-Y| are the two features
        rg = cv2.absdiff(R,G)
        by = cv2.absdiff(B,Y)
        # keep the 2-D versions for display at the end
        img1 = rg
        img2 = by

        # flatten each feature map into a column vector (one row per pixel)
        rg, by = map(lambda x:x.reshape((len(b[0])*len(b[:,0]),1)),[rg,by])
        data = np.hstack((rg,by))
        data = data.astype(np.float64)
        # drop every even-indexed row, i.e. keep only half of the pixels
        data = np.delete(data, range( 0,len(data[:,0]),2),0)

        # grid on which the decision function is evaluated for the contour
        xx1, yy1 = np.meshgrid(np.linspace(-10, 300, 500), np.linspace(-10, 300, 500))
        
        # train and find the boundary; a larger contamination gives a smaller ellipse
        clf = EllipticEnvelope(support_fraction=1, contamination=0.01)
        print 'data.shape =>',data.shape
        print 'learning...'
        clf.fit(data) # training; original author suspected zeros in the data may cause trouble
        print 'complete learning!'

        # evaluate the trained model on the grid and draw the ellipse (level 0)
        z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
        z1 = z1.reshape(xx1.shape)
        plt.contour(xx1,yy1,z1,levels=[0],linewidths=2,colors='r')

        # plot the training samples over the boundary
        plt.scatter(data[:,0],data[:,1],color= 'black')
        plt.title("Outlier detection")
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.pause(.001)
        # plt.show()
        
        # show both feature maps scaled by their maximum value
        cv2.imshow('rg',img1/np.amax(img1))
        cv2.imshow('by',img2/np.amax(img2))
예제 #37
0
def ellipticenvelope(data, fraction = 0.02):
    """Return the 1-based row numbers of the samples flagged as anomalies.

    :param data: samples, one row per observation, accepted by
        ``EllipticEnvelope.fit``.
    :param fraction: contamination passed to the elliptic envelope.
    :return: integer array of shape ``(n_anomalies, 1)`` holding the
        1-based positions of the rows whose prediction is not ``1``.

    The previous version stacked a column vector with a 1-D prediction
    vector (which fails) and its filtering loop compared and overwrote the
    wrong variable, so no row was ever removed.
    """
    elenv = EllipticEnvelope(contamination=fraction)
    elenv.fit(data)
    score = elenv.predict(data)

    # One row number per sample, starting at 1, kept as a column vector so
    # the return shape matches what the original code was building.
    numeration = np.arange(1, len(data) + 1).reshape(-1, 1)

    # Drop every row predicted as 1 (inlier); what remains are the anomalies.
    return numeration[score != 1]
예제 #38
0
def test_outlier_detection():
    """Check EllipticEnvelope before and after fitting.

    Unfitted estimators must raise ``NotFittedError``; once fitted, the raw
    decision values must match the Mahalanobis distances (and ``dist_``),
    the score must match the share of samples not labelled -1, and the
    number of -1 labels must equal the number of negative decisions.
    """
    rng = np.random.RandomState(0)
    samples = rng.randn(100, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Both query methods must refuse to run before fit().
    for unfitted_call in (envelope.predict, envelope.decision_function):
        assert_raises(NotFittedError, unfitted_call, samples)

    envelope.fit(samples)
    labels = envelope.predict(samples)
    raw_decision = envelope.decision_function(samples, raw_values=True)
    shifted_decision = envelope.decision_function(samples, raw_values=False)

    distances = envelope.mahalanobis(samples)
    assert_array_almost_equal(raw_decision, distances)
    assert_array_almost_equal(distances, envelope.dist_)

    n_flagged = labels[labels == -1].size
    assert_almost_equal(envelope.score(samples, np.ones(100)),
                        (100 - n_flagged) / 100.0)
    assert sum(labels == -1) == sum(shifted_decision < 0)
예제 #39
0
def labelValidSkeletons(skel_file, valid_index, trajectories_data, fit_contamination = 0.05):
    """Label skeleton rows as good when an elliptic envelope scores them > 0.

    :param skel_file: skeletons file handed to ``calculate_widths``,
        ``nodes2Array`` and ``saveLabelData``.
    :param valid_index: row selection used to build the training matrix.
    :param trajectories_data: table that receives the ``'auto_label'`` column.
    :param fit_contamination: contamination passed to ``EllipticEnvelope``.

    The result is written into ``trajectories_data['auto_label']`` and saved
    with ``saveLabelData``; nothing is returned.
    """
    # calculate valid widths if they were not used
    calculate_widths(skel_file)

    # fit the outlier classifier on the valid rows only
    X4fit = nodes2Array(skel_file, valid_index)
    clf = EllipticEnvelope(contamination=fit_contamination)
    clf.fit(X4fit)

    # score every row with the fitted classifier
    X = nodes2Array(skel_file)  # use all the indexes
    y_pred = clf.decision_function(X).ravel()  # less than zero would be an outlier

    # Rows with a positive score get the GOOD_SKE label, the rest get 0.
    # The builtin ``int`` replaces ``np.int``, which was only an alias for it
    # and no longer exists in recent NumPy releases.
    is_inlier = (y_pred > 0).astype(int)
    trajectories_data['auto_label'] = is_inlier * wlab['GOOD_SKE']
    saveLabelData(skel_file, trajectories_data)
def test_elliptic_envelope():
    """Check EllipticEnvelope's scores, distances and predictions agree.

    Unfitted estimators must raise ``NotFittedError``; once fitted,
    ``score_samples`` must be the negated Mahalanobis distance, ``dist_``
    must match those distances, the score must match the share of samples
    not labelled -1, and -1 labels must coincide in number with negative
    decisions.
    """
    rng = np.random.RandomState(0)
    samples = rng.randn(100, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Both query methods must refuse to run before fit().
    for unfitted_call in (envelope.predict, envelope.decision_function):
        assert_raises(NotFittedError, unfitted_call, samples)

    envelope.fit(samples)
    labels = envelope.predict(samples)
    sample_scores = envelope.score_samples(samples)
    decision_values = envelope.decision_function(samples)

    assert_array_almost_equal(sample_scores, -envelope.mahalanobis(samples))
    assert_array_almost_equal(envelope.mahalanobis(samples), envelope.dist_)

    n_flagged = labels[labels == -1].size
    assert_almost_equal(envelope.score(samples, np.ones(100)),
                        (100 - n_flagged) / 100.)
    assert(sum(labels == -1) == sum(decision_values < 0))
예제 #41
0
 def model_2_determine_test_data_similarity(self,model):
     """Fit one elliptic envelope per entry of *model*.

     Returns two dicts keyed by position ``0 .. len(model) - 1``: the
     estimator created for each entry, and the value returned by calling
     ``fit`` on that estimator with ``model[position]``.
     """
     estimators = {}
     fitted = {}
     for position in range(len(model)):
         envelope = EllipticEnvelope(contamination=0.01, support_fraction=1)
         estimators[position] = envelope
         fitted[position] = envelope.fit(model[position])
     return estimators, fitted
예제 #42
0
def labelValidSkeletons(skel_file):
    """Label skeleton rows as good when an elliptic envelope scores them > 0.

    :param skel_file: skeletons file handed to ``calculate_widths``,
        ``getValidIndexes``, ``nodes2Array`` and ``saveLabelData``.

    The labels are written into the ``'auto_label'`` column of the table
    returned by ``getValidIndexes`` and saved with ``saveLabelData``;
    nothing is returned.
    """
    calculate_widths(skel_file)

    # get valid rows using the trajectory displacement and the skeletonization success
    valid_index, trajectories_data = getValidIndexes(skel_file)

    # fit the outlier classifier on the valid rows only
    X4fit = nodes2Array(skel_file, valid_index)
    clf = EllipticEnvelope(contamination=.1)
    clf.fit(X4fit)

    # score every row with the fitted classifier
    X = nodes2Array(skel_file)
    y_pred = clf.decision_function(X).ravel()  # less than zero would be an outlier

    # Rows with a positive score get the GOOD_SKE label, the rest get 0.
    # The builtin ``int`` replaces ``np.int``, which was only an alias for it
    # and no longer exists in recent NumPy releases.
    is_inlier = (y_pred > 0).astype(int)
    trajectories_data['auto_label'] = is_inlier * wlab['GOOD_SKE']
    saveLabelData(skel_file, trajectories_data)
def anomaly_detection(features, labels):
	# Try an anomaly-detection approach (multivariate Gaussian via an elliptic
	# envelope) to identify POIs: fit on non-POI samples only, then predict on
	# a cross-validation set mixing non-POIs and POIs. Prints diagnostics and
	# the detector score; returns nothing.
	non_pois = features[labels==0]
	pois = features[labels==1]
	print "non poi size", non_pois.shape, pois.shape, features.shape

	## Splitting data into train, test and cross-validation sets for anomaly detection

	# rows of non_pois where split1 == 1 become the training set
	# (presumably .75 is the fraction assigned to 1 -- confirm in produce_spliting_array)
	split1 = produce_spliting_array(non_pois.shape[0], .75 )
	X_train = non_pois[split1==1]

	# the remaining non-POIs are divided between test and cross-validation
	X_intermediate = non_pois[split1==0]

	print "size intermediate", X_intermediate.shape

	split2 = produce_spliting_array(X_intermediate.shape[0], .5 )

	# non-POI samples are labelled -1 in both held-out sets
	X_test = X_intermediate[split2==1]
	label_test = np.zeros((X_test.shape[0],), dtype=np.int) - 1

	X_cv = X_intermediate[split2==0]
	label_cv = np.zeros((X_cv.shape[0],), dtype=np.int) - 1

	# POIs are divided between test and cross-validation and labelled +1
	split3 = produce_spliting_array(pois.shape[0], .5 )
	X_test = np.vstack((X_test, pois[split3==1]))
	label_test = np.hstack((label_test, np.ones(sum(split3), dtype=np.int)))

	X_cv = np.vstack((X_cv, pois[split3==0]))
	label_cv = np.hstack((label_cv, np.ones(sum(split3==0), dtype=np.int)))



	print "size X_train", X_train.shape
	print "size test data", X_test.shape, label_test.shape
	print "size cv data", X_cv.shape, label_cv.shape
	print "size splits", len(split1), len(split2), len(split3)

	from sklearn.covariance import EllipticEnvelope
	# NOTE(review): the detector is fitted on non-POIs, which are labelled -1
	# above; confirm this matches the sign convention of predict()/score().
	detector = EllipticEnvelope(contamination=.85)
	detector.fit(X_train)
	pred_cv = detector.predict(X_cv)
	print pred_cv
	print label_cv
	print detector.score(X_cv, label_cv)
예제 #44
0
def labelValidSkeletons_old(skeletons_file, good_skel_row, fit_contamination = 0.05):
    """Flag good skeletons in ``/trajectories_data`` using an elliptic envelope.

    :param skeletons_file: HDF5 file read with ``pd.HDFStore`` and passed to
        ``nodes2Array`` / ``saveModifiedTrajData``.
    :param good_skel_row: array of rows used to fit the outlier model; when
        it is empty the ``is_good_skel`` column simply copies ``has_skeleton``.
    :param fit_contamination: contamination passed to ``EllipticEnvelope``.

    The ``is_good_skel`` column is saved back with ``saveModifiedTrajData``;
    nothing is returned.
    """
    base_name = getBaseName(skeletons_file)
    progress_timer = timeCounterStr('')

    print_flush(base_name + ' Filter Skeletons: Starting...')
    with pd.HDFStore(skeletons_file, 'r') as table_fid:
        trajectories_data = table_fid['/trajectories_data']

    # default: every row that has a skeleton counts as good
    trajectories_data['is_good_skel'] = trajectories_data['has_skeleton']

    if good_skel_row.size > 0:
        # nothing to do if there are not valid skeletons left.

        print_flush(base_name + ' Filter Skeletons: Reading features for outlier identification.')
        # features used by the outlier classifier
        nodes4fit = ['/skeleton_length', '/contour_area'] + \
        ['/' + name_width_fun(part) for part in worm_partitions]

        X4fit = nodes2Array(skeletons_file, nodes4fit, good_skel_row)
        assert not np.any(np.isnan(X4fit))

        #%%
        print_flush(base_name + ' Filter Skeletons: Fitting elliptic envelope. Total time:' + progress_timer.getTimeStr())
        #TODO here the is a problem with singular covariance matrices that i need to figure out how to solve
        clf = EllipticEnvelope(contamination=fit_contamination)
        clf.fit(X4fit)

        print_flush(base_name + ' Filter Skeletons: Calculating outliers. Total time:' + progress_timer.getTimeStr())
        # score all rows with the fitted classifier
        X = nodes2Array(skeletons_file, nodes4fit)  # use all the indexes
        y_pred = clf.decision_function(X).ravel()  # less than zero would be an outlier

        print_flush(base_name + ' Filter Skeletons: Labeling valid skeletons. Total time:' + progress_timer.getTimeStr())
        # Rows with a positive score are good (1), the rest are not (0).
        # The builtin ``int`` replaces ``np.int``, which was only an alias for
        # it and no longer exists in recent NumPy releases.
        trajectories_data['is_good_skel'] = (y_pred > 0).astype(int)

    # Save the new is_good_skel column
    saveModifiedTrajData(skeletons_file, trajectories_data)

    print_flush(base_name + ' Filter Skeletons: Finished. Total time:' + progress_timer.getTimeStr())
예제 #45
0
def transform( features, labels ):
    """Reduce *features* to 30 principal components and min-max scale them.

    An elliptic envelope is fitted on the reduced features and its
    predictions are printed for inspection only; they do not affect the
    return value.  *labels* is returned untouched.
    """
    print("transforming features via pca")
    reducer = PCA(n_components=30)
    reduced = reducer.fit_transform(features)

    outlier_model = EllipticEnvelope()
    outlier_model.fit(reduced)
    print(outlier_model.predict(reduced))

    rescaled = MinMaxScaler().fit_transform(reduced)
    return rescaled, labels
def detect_outliers(X, station):
    """Return a boolean inlier mask for the rows of *X*.

    For station ``'hoerning'`` a one-class SVM is fitted on standardised
    data; for every other station an elliptic envelope is fitted on the raw
    data.  In both cases a row is an inlier when its decision value lies
    above the 0.15th percentile of all decision values.
    """
    outlierfraction = 0.0015

    if station == 'hoerning':
        scaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(X)
        samples = scaler.transform(X)
        classifier = svm.OneClassSVM(nu=0.95*outlierfraction + 0.05,
                                     kernel='rbf', gamma=0.1)
    else:
        samples = X
        classifier = EllipticEnvelope(contamination=outlierfraction)

    classifier.fit(samples)
    decision = classifier.decision_function(samples).ravel()
    threshold = stats.scoreatpercentile(decision, 100*outlierfraction)
    return decision > threshold
예제 #47
0
def find_outlier_test_homes(df,all_homes,  appliance, outlier_features, outliers_fraction=0.1):
    """Return the index labels of the homes flagged as outliers.

    :param df: table indexed by home, containing the feature columns.
    :param all_homes: mapping from appliance to the homes to consider.
    :param appliance: key into *all_homes*.
    :param outlier_features: feature columns used for the fit.  If fitting
        fails, the fit is retried without the last one, then without the
        last two features.
    :param outliers_fraction: share of the lowest decision values that are
        reported as outliers.
    :return: list of index labels.  When no fit succeeds, a message is
        printed and all homes of the appliance are returned.
    """
    from scipy import stats
    from sklearn.covariance import EllipticEnvelope

    clf = EllipticEnvelope(contamination=.1)

    # Try the full feature list first, then drop one and two trailing features.
    for n_dropped in (0, 1, 2):
        try:
            features = outlier_features[:-n_dropped] if n_dropped else outlier_features
            X = df.ix[all_homes[appliance]][features].values
            clf.fit(X)
        except Exception:
            # Only ordinary errors trigger a retry; KeyboardInterrupt and
            # SystemExit are no longer swallowed as they were by a bare except.
            continue
        break
    else:
        print("outlier cannot be found")
        return df.ix[all_homes[appliance]].index.tolist()

    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    return df.ix[all_homes[appliance]][~y_pred].index.tolist()
예제 #48
0
파일: outliers.py 프로젝트: apodemus/tsa
def CovEstOD(data, classifier=None, N=1, **kw):
    """Detect outliers in *data* with a covariance-based classifier.

    :param data: samples, one row per observation.
    :param classifier: object with ``fit`` and ``predict``.  When ``None``
        an ``EllipticEnvelope`` is built whose contamination is ``N``
        divided by the number of rows.
    :param N: expected number of outliers, only used to build the default
        classifier.
    :param with_decision_boundary: keyword-only flag (default ``False``).
        When true an ``Ellipse`` describing the decision boundary is
        returned as well; the two-axis unpacking below only works for
        two-dimensional data.
    :return: indices of the rows predicted as ``-1``, or a tuple
        ``(indices, decision_boundary)`` when the boundary is requested.
    """
    if classifier is None:
        from sklearn.covariance import EllipticEnvelope
        # float() keeps this a true division under Python 2 as well, where
        # int / int would otherwise truncate the contamination to 0.
        contamination = float(N) / data.shape[0]
        classifier = EllipticEnvelope(support_fraction=1., contamination=contamination)

    classifier.fit(data)
    clipix, = np.where(classifier.predict(data) == -1)

    wdb = kw.pop('with_decision_boundary', False)
    #TODO:  A better way of finding the decision boundary
    if not wdb:
        return clipix

    # The boundary is read from the classifier that was actually fitted; the
    # previous code referred to an undefined name ``clf`` here.
    precision = classifier.precision_
    # T (eigenvectors of the precision matrix) is the transformation matrix
    # between principal axes and data coordinates.
    w, T = np.linalg.eigh(precision)
    Ti = np.linalg.inv(T)
    # Diagonalise the precision matrix with matrix products (T^-1 P T), giving
    # the quadratic form of the boundary: z^T M z = threshold, x - <x> = T z.
    M = np.dot(np.dot(Ti, precision), T)
    # semi-major & semi-minor axes
    # NOTE(review): relies on the classifier exposing ``threshold`` -- confirm
    # against the scikit-learn version in use.
    a, b = np.sqrt(classifier.threshold / np.diag(M))
    theta = np.degrees(np.arccos(T[0, 0]))  # T is an (im)proper rotation matrix
    # det(T) = -1 means an improper rotation (one axis inverted): flip the angle.
    theta = np.linalg.det(T) * theta
    decision_boundary = Ellipse(classifier.location_, 2*a, 2*b, theta, color='m')
    return clipix, decision_boundary
예제 #49
0
def find_outlier_train(ser, outliers_fraction=0.1, min_units=0.2):
    """Split the values of *ser* above *min_units* into outliers and inliers.

    Values not greater than *min_units* are ignored.  The remaining values
    are scored by an outlier detector, and those whose decision value is
    not above the ``outliers_fraction`` percentile are treated as outliers.

    Returns ``(outliers, inliers)``.
    """
    from scipy import stats

    above_min = ser[ser > min_units]
    X = above_min.reshape(-1, 1)

    # The normality check is disabled for now, so the robust covariance
    # estimator is always the one selected.
    is_normal_data = True
    if not is_normal_data:
        # Data is not normally distributed: one-class SVM based detection.
        from sklearn import svm
        clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                              kernel="rbf", gamma=0.1)
    else:
        from sklearn.covariance import EllipticEnvelope
        clf = EllipticEnvelope(contamination=.1)

    clf.fit(X)
    scores = clf.decision_function(X).ravel()
    cutoff = stats.scoreatpercentile(scores, 100 * outliers_fraction)
    is_inlier = scores > cutoff
    return above_min[~is_inlier], above_min[is_inlier]
예제 #50
0
# Scatter plot of rcp_concat, one colour per label in L; label -1 is drawn red.
unique_labels = set(L)
colors = plt.cm.Blues(np.linspace(0, 1, len(unique_labels)))
plt.figure(15)
for l in unique_labels:
    p = (L == l)
    color = 'r' if l == -1 else colors[l]
    plt.plot(rcp_concat[p, 0], rcp_concat[p, 1], 'o', c=color, markersize=10)
plt.show()

# -17- #

# Percentile of the decision values below which a point counts as anomalous.
anom_perc = 20  # original 20
clf = EllipticEnvelope(contamination=.1)
clf.fit(rcp_concat)
# Score every point once (a duplicate call whose result was discarded is gone).
pred = clf.decision_function(rcp_concat).ravel()
threshold = stats.scoreatpercentile(pred, anom_perc)
Anom = pred > threshold
print(Anom)

# Evaluate the decision function on the (xx, yy) grid for the contour plot.
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure(16)
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
plt.plot(rcp_concat[:, 0], rcp_concat[:, 1], 'ko')
# Save before showing: once show() returns the figure may already have been
# closed, in which case savefig() would write an empty image.
plt.savefig("../imagens/anomaly/ex17_20.png")
plt.show()

# End
예제 #51
0
# Scatter the classical Mahalanobis distances against the robust distances.
plt.scatter( classical_md, robust_md, color = "green", alpha = 0.5 );
plt.title( "BBAutoTune \n\n Real Robot Forward Motion MD versus RD" );
plt.xlabel( "Mahalanobis Distance (MD)" );
plt.ylabel( "Robust Distance (RD)" );
#plt.plot( [ min( classical_md ), max( classical_md ) ], [ min( classical_md ), max( classical_md ) ], color = "red", alpha = 0.5 );


# Compare the elliptic envelope fitted with and without the outliers.

print "EE:";

# Single sample point that is tested against both fitted envelopes.
ssp = numpy.array( [ [ -10, 25.0, 0.0 ] ] );

print "Sample simulated point [[X',Y',T']]: ", ssp;

ee = EllipticEnvelope( assume_centered = False, contamination = 0.0 );

print "With outliers:";

# Note: the envelope is refitted on each of the lines below, once for the
# prediction and once more for the distance.
# NOTE(review): the sqrt presumably converts a squared Mahalanobis distance -- confirm.
print "In envelope? ", ee.fit( forward_motion ).predict( ssp );
print "MD: ", math.sqrt( ee.fit( forward_motion ).mahalanobis( ssp ) );

print "Without outliers:";

# Same two queries, this time fitting on the cleaned data set.
print "In envelope? ", ee.fit( forward_motion_clean ).predict( ssp );
print "MD: ", math.sqrt( ee.fit( forward_motion_clean ).mahalanobis( ssp ) );

# Show the plots.

plt.show( );
    # Compare given classifiers under given settings
    xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000))
    n_inliers = int((1. - outliers_fraction) * n_samples)
    n_outliers = int(outliers_fraction * n_samples)

    # Fit the problem with varying cluster separation
    np.random.seed(42)
    # Data generation


    # Fit the model with the One-Class SVM
    #plt.figure(figsize=(10, 5))

    clf = EllipticEnvelope(contamination=.1)
    # fit the data and tag outliers
    clf.fit(XY)
    y_pred = clf.decision_function(XY).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    # plot the levels lines and the points
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    subplot = ax[i]
    subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                     cmap=plt.cm.Blues_r)
    a = subplot.contour(xx, yy, Z, levels=[threshold],
                        linewidths=2, colors='red')
    subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                     colors='orange')
    b = subplot.scatter(XY[:-n_outliers, 0], XY[:-n_outliers, 1], c='white')
def outliers_from_ellipticEnvelope():
    """Return the flattened elliptic-envelope decision values of ``features_pca``.

    A default ``EllipticEnvelope`` is fitted on the module-level
    ``features_pca`` and then used to score that same data.
    """
    from sklearn.covariance import EllipticEnvelope

    envelope = EllipticEnvelope()
    envelope.fit(features_pca)
    return envelope.decision_function(features_pca).ravel()
예제 #54
0
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import csv
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from scipy import stats

# Load columns 1 and 2 of every CSV row as integer pairs.
with open('newdata.csv', 'rb') as f:
    data = np.array([[int(row[1]), int(row[2])] for row in csv.reader(f)])

outliers_fraction = 0.05
est = EllipticEnvelope(contamination=.1)
est.fit(data)

# Score every point; those above the outliers_fraction percentile get
# label 2, the rest get label 1.
y_pred = est.decision_function(data).ravel()
threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
labels = np.where(y_pred > threshold, 2, 1).tolist()

print(labels)
plt.scatter(data[:, 0], data[:, 1], c=labels, lw=0)
plt.show()
예제 #55
0
파일: testing.py 프로젝트: Semen52/FSA2
                        # label=target_name.decode('utf8')
                        )

            x, y = find_boundary(X_transformed[kclusters == i, 0],
                                 X_transformed[kclusters == i, 1], 5)
            plt.plot(x, y, '-k', lw=2., color=cluster_color)

            # create a mesh to plot in
            h = .02  # step size in the mesh
            x_min, x_max = X_transformed[kclusters == i, 0].min() - 1, X_transformed[kclusters == i, 0].max() + 1
            y_min, y_max = X_transformed[kclusters == i, 1].min() - 1, X_transformed[kclusters == i, 1].max() + 1
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                                 np.arange(y_min, y_max, h))

            clf = EllipticEnvelope(contamination=.1)
            clf.fit(X_transformed[kclusters == i])

            pred = clf.decision_function(X_transformed[kclusters == i]).ravel()
            threshold = stats.scoreatpercentile(pred,
                                                100 * outliers_fraction)
            print("INFO: Cluster: ", i, " Threshold: ", threshold)

            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])

            Z = Z.reshape(xx.shape)
            # plt.contour(xx, yy, Z,
            #             levels=[threshold],
            #             linewidths=2,
            #             linestyles='solid',
            #             colors=(cluster_color,))
예제 #56
0
#
# print(Y)

# Find outliers in the interaction rate data

# Step 1 - Convert the dataset into pandas series
util = Utility.SeriesUtility()
datasetFileName = "fans_change_taylor_swift.csv"
series = util.convertDatasetsToSeries(datasetFileName)

# Step 2 - Resample the series with rule "D"
series = util.resampleSeriesSum(series, "D")

# Step 3 - Arrange the values as a single-feature column (one row per point)
numberOfPoints = series.data.shape[0]
X = series.values.flatten().reshape(numberOfPoints, 1)

# Step 4 - Fit the detector and classify every point in one call
det.fit(X)

predicted = det.predict(X)

# Step 5 - Report each point classified as -1. The batch prediction above is
# reused instead of calling predict() again on every row, which passed a 1-D
# sample each time and repeated work that had already been done.
for outputClass in predicted:
    if outputClass == -1:
        print("Outlier detected...")






예제 #57
0
def find_outliers(datestart,dateend,plot=False,cut=-0.05):
    """Return the topics whose page-view features are scored as outliers.

    Page views per topic are read from the database, restricted to the
    window between *datestart* and *dateend* (swapped if given in reverse
    order), turned into features with ``gen_feat`` and scored with an
    elliptic envelope.

    :param datestart: one end of the date window, converted by ``datetime2str2``.
    :param dateend: other end of the date window, converted by ``datetime2str2``.
    :param plot: when ``True``, draw the decision boundary and the samples.
    :param cut: a topic is reported only if its decision value is below this.
    :return: list of rows ``[index, decision value, topic label,
        rounded feature 0, rounded 100 * feature 1]``, sorted by the last
        entry in descending order.
    """
    numtopics=84

    di=datetime2str2(datestart)
    dfin=datetime2str2(dateend)

    #print di,dfin
    # make sure di is the earlier bound and dfin the later one
    if dfin<di:
        temp=dfin
        dfin=di
        di=temp
    #print di,dfin
    
    # the database password is the first line of this file
    afile="/home/ubuntu/mysql_insightwiki_auth.txt"
    a=open(afile)
    passwd=a.readline().rstrip()
    a.close()
    host='localhost'; user='******';db='wikidata'
    con = mdb.connect(host, user, passwd, db)#,port=3307)
     
    # load every topic; a filler row at position 0 keeps list positions
    # aligned with the SQL ids
    with con:
        curt= con.cursor()
        #sql="SELECT COUNT(*) FROM `topics` "
        
        sql="SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        topics=[[0,'nothing','Filler to match index']]
        for topic in curt:
            topics.append(topic)

    data={}
        
    # df is a list indexed by topic id (this relies on range() returning a list)
    df=range(numtopics+1)
    with con:
        curt= con.cursor()
        sql="SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        # for each topic, fetch the average page-view count per day;
        # the topic id is passed as a query parameter
        for row in curt:
            cur = con.cursor()
            sql='''SELECT `page_views`.`dateonly` AS `vd`, AVG(`page_views`.`count`) AS `vc`, 
                `topics`.`topic_label`,`topics`.`topic_string` 
                FROM `topics` INNER JOIN `page_views` ON `topics`.`ID` = `page_views`.`topic_id` 
                WHERE `topic_id`=%s GROUP BY `page_views`.`dateonly`   '''
            data[row[1]]=read_sql(sql, con,params=[row[0]])
            df[row[0]]=data[row[1]]
    
    topicdata=df
    
    # NOTE(review): d and p computed here are reassigned inside the loop below
    # before being used -- this looks like leftover exploratory code.
    d=topicdata[topics[3][0]]
    p=d[ (d['vd']>di) & (d['vd']<dfin )]['vc'].values    
    topicdata=df
    
    #initializing array to hold the rows to cluster
    #the 0th position is fake so that my index matches the sql index
    clusinp=[]
    clusinp.append(gen_feat([0,0,0,0,0]))
    
    # offset subtracted from the counts of topic id 52 (floored at 0)
    chinaoff=6000
    #populating the array that goes into the outlier model
    for index,topic in enumerate(topics):
        #topic=list(topics[index])
        if topic[0]!=0:
            d=topicdata[topic[0]]
            # counts strictly inside the (di, dfin) window
            ppre=d[ (d['vd']>di) & (d['vd']<dfin )]['vc'].values
            p=gen_feat(ppre)
            if topic[0]==52:
                p=gen_feat([x-chinaoff if x-chinaoff>=0 else 0 for x in ppre  ])
            clusinp.append(p)
    
    #cleaning up the array, making it numpy for the outlier model
    clusinp=np.array(clusinp)
    clusinp[0]=clusinp[5] #making sure the throwaway first row matches in size
    #contam=0.325
    contamfix=0.1
    
    colors = ['m', 'g', 'b']
    X1=clusinp
    # grid used only for drawing the decision boundary
    xx1, yy1 = np.meshgrid(np.linspace(0, 10000, 500), np.linspace(-1.5, 1.5, 500))
    ee=EllipticEnvelope(support_fraction=1., contamination=contamfix)
    #ee=OneClassSVM(nu=contam2, gamma=0.05,kernel='rbf')
    ee.fit(clusinp)
    # one decision value per row of clusinp (row 0 is the filler)
    outliers=ee.decision_function(X1, raw_values=False)
    
    if plot==True:
        print "here"
        get_ipython().magic(u'matplotlib inline')
        Z1 = ee.decision_function(np.c_[xx1.ravel(), yy1.ravel()])    
        Z1 = Z1.reshape(xx1.shape)
        legend1 = plt.contour(xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[1])
        plt.scatter(X1[:, 0], X1[:, 1], color='black')
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.show()

    out=[]
    for index,outlier in enumerate(outliers):
        row=[index,outlier,topics[index][1],int(np.round(clusinp[index][0])),int(np.round(100*clusinp[index][1]))]
        #row=[index,outlier,topics[index][1],int(np.round(clusinp[index][0])),clusinp[index][1]]
        # keep real topics (not the filler) that score below the cut and whose
        # rounded first feature exceeds 8
        if outlier<cut and index!=0 and row[3]>8:
            out.append(row)
            #print index,outlier,topics[index][2],clusinp[index][0],clusinp[index][1]
    #out=sorted(out,operator.itemgetter(4))
    #out.sort()
    # largest scaled second feature first
    out=sorted(out,key =lambda x:-x[4])
    return out
                'Race-Black',
                'Age',
                'HAART-Naive',
                'HAART-Non-Adherent',
                'HAART-Off',
                'HAART-On',
                'Hepatitis C status (HCV)']
# Copy each patient column into cyto_data, left-aligned on cyto_data's rows.
for col in tranfer_cols:
    aligned_pair = cyto_data.align(pat_data[col], join='left', axis=0)
    cyto_data[col] = aligned_pair[1]
# Short alias for the hepatitis C status column.
cyto_data['HCV'] = cyto_data['Hepatitis C status (HCV)']

# <codecell>

# Replace outlying measurements of every cytokine column with NaN.
for col in cytos:
    # Only rows that actually hold a measurement take part in fitting and
    # prediction; missing values are left exactly as they are.
    present = cyto_data[col].notnull().values
    observed = cyto_data[col].values[present].reshape(-1, 1)

    env = EllipticEnvelope(contamination=0.05)
    env.fit(observed)
    mask = env.predict(observed)

    # Map the predictions back onto the full column by position.
    outlier_rows = np.zeros(len(cyto_data), dtype=bool)
    outlier_rows[present] = (mask == -1)

    # A single .loc assignment writes into cyto_data itself; the former
    # chained form ``cyto_data[col][...] = ...`` may write into a temporary copy.
    cyto_data.loc[outlier_rows, col] = np.nan

# <codecell>


fig, axs = plt.subplots(11,3, figsize = (10,20))

for ax, col in zip(axs.flatten(), cytos):
    
    boxes = []
    mus = []
    stds = []
    for trop in trops:
        mask = cyto_data['Tropism'] == trop