Example No. 1
# Build a dataset containing only the dependent variable
y = boston["medv"]

# =============================================================================
# 3. Check whether outliers are present
# =============================================================================

## 1) Fit a regression first, then check for outliers inside that module -> statsmodels

## 2) Check for outliers from the raw data alone, without a regression -> sklearn.neighbors nearest-neighbor (KNN) approach

from sklearn.neighbors import LocalOutlierFactor

# LocalOutlierFactor(n_neighbors=<number of neighbors>, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination="legacy", novelty=False, n_jobs=None)
# If the number of neighbors is too large the results become unreliable, so keep it reasonably small.
lof1 = LocalOutlierFactor(n_neighbors=5)

# To actually compute the scores, call fit()
# fit() accepts numeric data only; categorical data is not supported.
lof1.fit(xx)

# There is no need to assign the result of fit() to a new variable: the call computes and stores the values inside the estimator itself.
# Retrieve the outlier scores.
lof1.negative_outlier_factor_  # scores greater than -2 are treated as normal; smaller scores are outliers.

# There are 506 scores in total, one per observation.
len(lof1.negative_outlier_factor_)

# Keep only the rows judged normal (score > -2), with all columns.
xx1 = xx.loc[lof1.negative_outlier_factor_ > -2, :]
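# A minimal, hedged sketch of approach (1) above: regression-based outlier checks with
# statsmodels. Assumes xx (the predictors) and y (medv) from earlier; the |studentized
# residual| > 3 cut-off is a common rule of thumb, not part of the original example.
import statsmodels.api as sm

ols_res = sm.OLS(y, sm.add_constant(xx)).fit()
influence = ols_res.get_influence()
student_resid = influence.resid_studentized_internal
reg_outlier_mask = abs(student_resid) > 3   # True marks a potential outlier
xx2 = xx.loc[~reg_outlier_mask, :]          # keep rows not flagged by the regression check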
Example No. 2
X = root2array('../no_truecc_cut_stride2_offset0.root',
               branches=['calehad', 'cvnpi0', 'cvnchargedpion', 'cvnneutron', 'cvnproton'],
               selection='mustopz<1275&&remidtrkismuon==1&&isnumucc==1',
               step=scaledown)
X = X.view(np.float32).reshape(X.shape + (-1,))
recoemu_official = root2array('../no_truecc_cut_stride2_offset0.root', branches='recoemu',
                              selection='mustopz<1275&&remidtrkismuon==1&&isnumucc==1',
                              step=scaledown)
trueenu = root2array('../no_truecc_cut_stride2_offset0.root', branches='trueenu',
                     selection='mustopz<1275&&remidtrkismuon==1&&isnumucc==1',
                     step=scaledown)
y = trueenu - recoemu_official
Xy = np.insert(X, 5, y, axis=1)

# fit the model
clf = LocalOutlierFactor(n_neighbors = nneighbors)
y_pred = clf.fit_predict(Xy)

#~ # plot the level sets of the decision function
#~ xx, yy = np.meshgrid(np.linspace(0, 15, 150), np.linspace(0, 15, 150))
#~ Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
#~ Z = Z.reshape(xx.shape)

#~ # level curve plot with original distribution
#~ plt.figure(1)
#~ plt.subplot(1, 2, 1)
#~ plt.title("Local Outlier Factor (LOF)")
#~ plt.contourf(xx, yy, -Z, locator=ticker.LogLocator(), cmap=plt.cm.Blues_r)
#~ a = plt.scatter(X, y, c='white',
                #~ edgecolor='k', s=20)
#~ plt.axis('tight')
Example No. 3
# ## Set up for training
# ### Set up 5-fold cross validation

# In[6]:

X = data.drop(columns=['Class'])
y = data['Class']
cv = KFold(shuffle=True)

# ### Set classifiers

# In[7]:

classifiers = {
    "LOF": LocalOutlierFactor(n_neighbors=20, novelty=True),
    "SVM-rbf": SVC(),
    "SVM-poly": SVC(kernel="poly")
}

# ### Set score names

# In[8]:

score_names = ["time", "accuracy", "precision", "recall", "f1"]

# ###  Set a function to get the scores

# In[9]:

def local_outlier_factory(dataset, neighbours):
    lof = LocalOutlierFactor(n_neighbors=neighbours, contamination=0.1,novelty=True).fit(dataset)
    return lof
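# Hedged usage sketch (illustrative data, not from the original): with novelty=True the
# returned model can score rows it was not fitted on via predict().
import numpy as np
demo_rng = np.random.RandomState(0)
demo_train = demo_rng.randn(200, 2)
demo_new = demo_rng.randn(10, 2)
demo_lof = local_outlier_factory(demo_train, neighbours=20)
demo_labels = demo_lof.predict(demo_new)  # +1 = inlier, -1 = outlier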
print(Y.shape)

##Define the outlier detection methods

classifiers = {
    "Isolation Forest":
    IsolationForest(n_estimators=100,
                    max_samples=len(X),
                    contamination=outlier_fraction,
                    random_state=state,
                    verbose=0),
    "Local Outlier Factor":
    LocalOutlierFactor(n_neighbors=20,
                       algorithm='auto',
                       leaf_size=30,
                       metric='minkowski',
                       p=2,
                       metric_params=None,
                       contamination=outlier_fraction)
}

type(classifiers)
n_outliers = len(Fraud)
for i, (clf_name, clf) in enumerate(classifiers.items()):
    #Fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
Example No. 6
def train(loader, epoch, model_list, method='ocsvm'):
    # Scores above the threshold are treated as normal
    # model_list only matters for models that need multiple training rounds; pass in the model from the previous round, e.g. ocnn
    datas, labels = get_features(loader)

    threshold_list = []
    update_models = []
    update_optimizer = []
    clf_list, optimizers = model_list

    for label in range(args.class_num):  # fit a one-class (e.g. ocsvm) model for each class
        condition_index = np.where(labels == label)[0]
        fit_data = datas[condition_index]  # training data for this label
        optimizer = optimizers[label]

        if method == 'ocsvm':
            clf = OneClassSVM()
        elif method == 'isofore':
            clf = IsolationForest()
        elif method == 'gmm':
            clf = BayesianGaussianMixture()
        elif method == 'svdd':
            clf = SVDD(parameters)
        elif method == 'lof':
            clf = LocalOutlierFactor(novelty=True,
                                     n_neighbors=int(fit_data.size * 0.1))
        elif method == 'cnn':
            clf = ''
        elif method != 'sp':
            clf = clf_list[label]

        # train the anomaly-detection model
        if method == 'ocnn':
            clf, optimizer = fit(clf, fit_data, optimizer, epoch)
            scores_temp = score_samples(clf, fit_data, epoch)
        elif method == 'lof':
            clf.fit(fit_data)
            scores_temp = clf.decision_function(fit_data)
        elif method == 'sp':
            pass
        elif method == 'cnn':
            pass
        else:
            clf.fit(fit_data)
            scores_temp = clf.score_samples(fit_data)

        # compute the anomaly-detection model's threshold
        if method != 'sp' and method != 'gmm' and method != 'cnn':
            threshold = np.mean(scores_temp) - \
                args.threshold_std_times*np.std(scores_temp)
            update_optimizer.append(optimizer)
            update_models.append(clf)
            threshold_list.append(threshold)
        elif method == 'gmm':
            threshold = np.mean(scores_temp)
            update_optimizer.append(optimizer)
            update_models.append(clf)
            threshold_list.append(threshold)
        elif method == 'sp':
            from cnn import get_c_v
            threshold_list = get_c_v(p_s=datas, labels=labels)
        elif method == 'cnn':
            threshold_list = ''

    model_list = (update_models, optimizers)
    return model_list, threshold_list
Example No. 7
print(__doc__)

np.random.seed(42)

xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
# Generate normal (not abnormal) training observations
X = 0.3 * np.random.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate new normal (not abnormal) observations
X = 0.3 * np.random.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# fit the model for novelty detection (novelty=True)
clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
clf.fit(X_train)
# DO NOT use predict, decision_function or score_samples on X_train, as that would give
# wrong results; use them only on new, unseen data (not used in X_train),
# e.g. X_test, X_outliers or the meshgrid
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size
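# Hedged follow-up (not in the original): report the error counts as fractions of each set.
print("errors on regular novel observations: %d/%d" % (n_error_test, X_test.shape[0]))
print("errors on abnormal novel observations: %d/%d" % (n_error_outliers, X_outliers.shape[0]))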

# plot the learned frontier, the points, and the nearest vectors to the plane
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Novelty Detection with LOF")
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
Example No. 8
# In[29]:

st.subheader("Accuracy score For Isolation forest")
ISF = IsolationForest(random_state=42)
ISF.fit(ins)
falsepositive_isf = ISF.predict(ins)
falsenegative_isf = ISF.predict(outs)
in_accuracy_isf = falsepositive_accuracy(falsepositive_isf)
out_accuracy_isf = falsenegative_accuracy(falsenegative_isf)
st.write("Accuracy in Detecting falsepositive Alarm:", in_accuracy_isf)
st.write("Accuracy in Detecting falsenegative Alarm:", out_accuracy_isf)

# In[30]:

st.subheader("Accuracy score For Local Outlier Factor")
LOF = LocalOutlierFactor(novelty=True)
LOF.fit(ins)
falsepositive_lof = LOF.predict(ins)
falsenegative_lof = LOF.predict(outs)
in_accuracy_lof = falsepositive_accuracy(falsepositive_lof)
out_accuracy_lof = falsenegative_accuracy(falsenegative_lof)
st.write("Accuracy in Detecting falsepositive Alarm :", in_accuracy_lof)
st.write("Accuracy in Detecting falsenegative Alarm:", out_accuracy_lof)

# In[31]:

if st.sidebar.checkbox("Alarm Report", False):
    st.subheader("classification of Alarm")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[16, 3])
    ax1.set_title("Accuracy of Isolation Forest", fontsize=20)
    st.write(
ii.fit(dataset[features])  #Error occurs here.

dataset['outlier'] = ii.predict(dataset[features])
del ii

print(dataset[dataset['outlier'] == -1])

#IsolationForest

from sklearn.ensemble import IsolationForest
ii = IsolationForest(max_samples=62,
                     contamination=0.25,
                     random_state=np.random.RandomState(42))

print("Fit data")
ii.fit(dataset[features])  #Error occurs here.

dataset['outlier'] = ii.predict(dataset[features])
del ii

print(dataset[dataset['outlier'] == -1])

#LocalOutlierFactor

from sklearn.neighbors import LocalOutlierFactor
ii = LocalOutlierFactor(n_neighbors=35, contamination=0.25)

dataset['outlier'] = ii.fit_predict(dataset[features])
del ii
print(dataset[dataset['outlier'] == -1])
Example No. 10
File: qc.py    Project: ekunnii/APPIAN
def _LocalOutlierFactor(X):
    n = int(round(X.shape[0] * 0.2))
    clf = LocalOutlierFactor(n_neighbors=n)
    return clf.fit_predict(X)
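# Hedged usage sketch (illustrative data, not part of qc.py): fit_predict returns +1 for
# inliers and -1 for outliers, with n_neighbors set to roughly 20% of the sample count.
import numpy as np
demo_X = np.random.RandomState(1).randn(50, 3)
demo_labels = _LocalOutlierFactor(demo_X)
print((demo_labels == -1).sum(), "of", len(demo_labels), "samples flagged as outliers")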
Example No. 11
for dat in datasets:
    plt.clf()
    plt.figure(figsize=(25, 13))

    # loading and vectorization
    #X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,n_clusters_per_class=1, weights=[0.999], flip_y=0, random_state=4)
    
    
    X = df.values    
    X = X.astype(float)  # astype returns a new array, so reassign
    n_features = 2
    X_train = X#.reshape(-1, 1)
    
    # define models:
    iforest = IsolationForest(n_estimators=50, max_samples='auto', contamination=float(0.01),max_features=2).fit(X)
    lof = LocalOutlierFactor(n_neighbors=5, novelty=True)
    #ocsvm = OneClassSVM()
    #ocsvm = OneClassSVM(kernel='linear', degree=2, gamma='auto', nu=0.5)
    ocsvm = OneClassSVM(gamma='auto', nu=0.01)

    lim_inf = X.min(axis=0)
    lim_sup = X.max(axis=0)
    volume_support = (lim_sup - lim_inf).prod()
    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup,
                             size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
# In[20]:

# The max power used by the appliance in the initial active areas, filtered by this percentile,
# is assumed to be roughly the max power used by the appliance
rough_max_power_percentile = 95

# Any initial active area where the max power used is less than
# this ratio of the rough max power of the appliance is ignored
too_low_power_ratio = 0.02

# In[21]:

len_max_coords = np.column_stack(
    ([len(a) for a in active_area_data], [a.max() for a in active_area_data]))
lof_labels = LocalOutlierFactor().fit_predict(len_max_coords)

# In[22]:

colors = np.array(['r.', 'g.', 'b.'])
plt.scatter(len_max_coords[:, 0], len_max_coords[:, 1], c=lof_labels)

# In[23]:

plt.hist([len_max_coords[lof_labels == l][:, 1] for l in [1, -1]],
         stacked=True)

# In[24]:

rough_max_power = np.percentile(len_max_coords[:, 1], rough_max_power_percentile)
too_low_power = rough_max_power * too_low_power_ratio
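# Hedged sketch (assumption, not in the original): apply the too_low_power threshold described
# above by dropping initial active areas whose max power falls below it.
kept_active_areas = [a for a in active_area_data if a.max() >= too_low_power]
print("kept", len(kept_active_areas), "of", len(active_area_data), "active areas")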
print(y_value.head())
y_value = y_value.values.reshape(-1)
print(y_value.shape)
x_value = sampled_data.drop(labels="Class", axis=1)
print(x_value.columns)
print(x_value.shape)

# Print shapes
print(x_value.shape)
print(y_value.shape)

#Algorithms used: Isolation Forest and Local Outlier Factor are common anomaly detection methods
random_isolation = IsolationForest(max_samples=len(x_value),
                                   contamination=outlier_value,
                                   random_state=3)
local_outlier = LocalOutlierFactor(n_neighbors=12, contamination=outlier_value)

n_outlier = len(fraudal_count)
#fit and predict
random_isolation.fit(x_value)
score_prediction = random_isolation.decision_function(x_value)
y_predict_isf = random_isolation.predict(x_value)

y_predict_lof = local_outlier.fit_predict(x_value)
score_prediction = local_outlier.negative_outlier_factor_

#Change the value to 0 for valid and 1 for fraudulent cases.
y_predict_isf[y_predict_isf == 1] = 0
y_predict_isf[y_predict_isf == -1] = 1
y_predict_lof[y_predict_lof == 1] = 0
y_predict_lof[y_predict_lof == -1] = 1
Example No. 14
df2 = df2[df2["Job Title"].isin(emp_counts[emp_counts > 3000].index)]
df2['Salary Paid'] = df2['Salary Paid'].apply(lambda x:x.split('.')[0].strip()).replace({'\$':'', ',':''}, regex=True)


FirAtt_lst = df2['Job Title'].unique()
SecAtt_lst = df2['Employer'].unique()
ThrAtt_lst = df2['Calendar Year'].unique()

###################################     Forming a context   #######################################
Orgn_Ctx = df2.loc[df2['Job Title'].isin([FirAtt_lst[0],FirAtt_lst[1],FirAtt_lst[2],FirAtt_lst[3], FirAtt_lst[4]]) & \
                   df2['Employer'].isin([SecAtt_lst[0],SecAtt_lst[1], SecAtt_lst[2],SecAtt_lst[3], SecAtt_lst[4], SecAtt_lst[5]]) & \
                   df2['Calendar Year'].isin([ThrAtt_lst[0],ThrAtt_lst[1],ThrAtt_lst[2],ThrAtt_lst[3],ThrAtt_lst[4]])]


#######################     Finding an outlier in the selected context      #######################
clf = LocalOutlierFactor(n_neighbors=20)
Sal_outliers = clf.fit_predict(Orgn_Ctx['Salary Paid'].values.reshape(-1,1))
Queried_ID =Orgn_Ctx.iloc[Sal_outliers.argmin()][1]

print('\n\n Outlier\'s ID in the selected context is: ', Queried_ID)

################# Exploring Contexts larger than the original to find the maximal #################
FirAtt_Sprset = sum(map(lambda r: list(combinations(FirAtt_lst[5:], r)), range(1, len(FirAtt_lst[5:])+1)), [])
SecAtt_Sprset = sum(map(lambda r: list(combinations(SecAtt_lst[6:], r)), range(1, len(SecAtt_lst[6:])+1)), [])
ThrAtt_Sprset = sum(map(lambda r: list(combinations(ThrAtt_lst[5:], r)), range(1, len(ThrAtt_lst[5:])+1)), [])

Sub_pop        =  []
Sub_pop_count  =  0
Epsilon        =  0.1  ### Privacy Parameter
output         =  []
context        =  []
Example No. 15
result_angriff_excel = []
x_train = signal[:, 0:35000]
x_train = np.transpose(x_train)

x_test = signal[:, 35000:len(signal[0])]
x_test = np.transpose(x_test)

# x_outliers = np.concatenate((angriff_sis_attack_1,angriff_sis_attack_2), axis = 1)
# x_outliers = np.transpose(x_outliers)
x_outliers = np.transpose(angriff_sis_attack_2)
ground_truth_angriff = pred_sis_attack

neighbours = 3000
print('Neighbours: ', neighbours)

lof = LocalOutlierFactor(n_neighbors=neighbours, novelty=True)
lof.fit(x_train)

test_pred = lof.predict(x_test)

n_outliers = 0
ground_truth = np.ones(len(x_test), dtype=int)
n_errors = (test_pred != ground_truth).sum()

result_test = (len(x_test) - n_errors) / (len(x_test))
result_test_excel += [result_test]

lof.fit(x_train)
outlier_pred = lof.predict(x_outliers)

n_outliers = len(x_outliers)
Example No. 16
    def test_import_from_sklearn_pipeline_no_wrapper(self):
        from sklearn.neighbors import LocalOutlierFactor
        from sklearn.pipeline import make_pipeline

        sklearn_pipeline = make_pipeline(PCA(), LocalOutlierFactor())
        _ = import_from_sklearn_pipeline(sklearn_pipeline, fitted=False)
                pass

        out_df = out_df.append(working, ignore_index=True)

    out_df = out_df[blank_dict.keys()]
    out_df.to_csv(out_csv, index=False)

if visualize:
    print('Visualizing')
    brain_vol_df = pd.read_csv(brain_vol_csv)

    collated_csv = os.path.join(out_folder, 'collated.csv')
    clean_table = pd.read_csv(collated_csv, index_col='mr_id')
    clean_table = clean_table[clean_table['exclude'] != 1]

    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.06)

    y_pred = clf.fit_predict(clean_table)
    #y_pred_unsort = y_pred.copy()
    x_scores = clf.negative_outlier_factor_
    #x_scores_unsort = x_scores.copy()
    clean_table['outlier'] = y_pred

    clean_table['normal_control'] = [
        all([i, not j])
        for i, j in zip(clean_table['control'], clean_table['sci'])
    ]
    clean_table['sci_control'] = [
        all([i, j]) for i, j in zip(clean_table['control'], clean_table['sci'])
    ]
Example No. 18
        print('\n******Iso-Forest*******\n')
        start = time.time()
        clf = IsolationForest(contamination=0.1, behaviour='new')
        clf.fit(X)
        end = time.time()
        time_all[j, 0] = end - start
        iso_scores = clf.score_samples(X)

        if run_lof_svm == 0:
            lof_scores = iso_scores
            osvm_scores = iso_scores
        elif j == 0:

            print('\n******LOF*******\n')
            start = time.time()
            lof = LocalOutlierFactor()
            lof.fit(X)
            end = time.time()
            time_all[j, 1] = end - start
            lof_scores = lof.negative_outlier_factor_

            print('\n******1-class SVM*******\n')
            start = time.time()
            osvm = OneClassSVM(kernel='rbf')
            osvm.fit(X)
            end = time.time()
            time_all[j, 2] = end - start
            osvm_scores = osvm.score_samples(X)

        print('\n******Our Algo*******\n')
        start = time.time()
Example No. 19
def main():

    X = read_data()

    Y = read_labels()

    isf = IsolationForest()
    lof = LocalOutlierFactor(novelty=True)
    svm = OneClassSVM(kernel="rbf")
    cov = EllipticEnvelope()
    kmn = KMeans(n_clusters=1)

    k_fold = StratifiedKFold(n_splits=3, shuffle=True)

    params_isf = []
    params_lof = []
    params_svm = []
    params_cov = []
    params_kmn = []

    for user in range(0, num_of_labeled_users):
        X_all = X[user]
        Y_all = Y[user].astype(int)

        X_genuine = X[user][0:num_of_genuine_segments]
        Y_genuine = Y[user][0:num_of_genuine_segments].astype(int)

        X_unlabeled = X[user][num_of_genuine_segments:]
        Y_unlabeled = Y[user][num_of_genuine_segments:].astype(int)
        '''
        count_vect = CountVectorizer()
        tfidf_transformer = TfidfTransformer(use_idf=False)

        X_all_counts = count_vect.fit_transform(X_all)
        X_all_tfidf = tfidf_transformer.fit_transform(X_all_counts)


        isf_random = RandomizedSearchCV(estimator=isf, param_distributions=ISF_HYPER_PARAMS, n_iter=random_search_iter, cv=k_fold, verbose=2,
                                       random_state=42, n_jobs=-1, scoring=make_scorer(custom_acc))

        svm_random = RandomizedSearchCV(estimator=svm, param_distributions=SVM_HYPER_PARAMS, n_iter=random_search_iter, cv=k_fold, verbose=2,
                                        random_state=42, n_jobs=-1, scoring=make_scorer(custom_acc))

        lof_random = RandomizedSearchCV(estimator=lof, param_distributions=LOF_HYPER_PARAMS, n_iter=random_search_iter, cv=k_fold, verbose=2,
                                        random_state=42, n_jobs=-1, scoring=make_scorer(custom_acc))

        kmn_random = RandomizedSearchCV(estimator=kmn, param_distributions=KMN_HYPER_PARAMS, n_iter=random_search_iter, cv=k_fold, verbose=2,
                                        random_state=42, n_jobs=-1, scoring=make_scorer(custom_acc))

        cov_random = RandomizedSearchCV(estimator=cov, param_distributions=COV_HYPER_PARAMS, n_iter=random_search_iter, cv=k_fold, verbose=2,
                                        random_state=42, n_jobs=-1, scoring=make_scorer(custom_acc))

        isf_random.fit(X_all_tfidf, Y_all)
        svm_random.fit(X_all_tfidf, Y_all)
        lof_random.fit(X_all_tfidf, Y_all)
        kmn_random.fit(X_all_tfidf, Y_all)
        cov_random.fit(X_all_tfidf.toarray(), Y_all)

        p_isf = dict(isf_random.best_params_)
        p_svm = dict(svm_random.best_params_)
        p_lof = dict(lof_random.best_params_)
        p_kmn = dict(kmn_random.best_params_)
        p_cov = dict(cov_random.best_params_)

        p_isf["score"] = isf_random.best_score_
        p_svm["score"] = svm_random.best_score_
        p_lof["score"] = lof_random.best_score_
        p_kmn["score"] = kmn_random.best_score_
        p_cov["score"] = cov_random.best_score_
        
        params_isf.append(p_isf)
        params_svm.append(p_svm)
        params_lof.append(p_lof)
        params_kmn.append(p_kmn)
        params_cov.append(p_cov)
        '''

        params_isf.append(
            calc_best_detector_for_algoritm(X_unlabeled, Y_unlabeled, isf,
                                            ISF_HYPER_PARAMS, k_fold))
        params_svm.append(
            calc_best_detector_for_algoritm(X_unlabeled, Y_unlabeled, svm,
                                            SVM_HYPER_PARAMS, k_fold))
        params_lof.append(
            calc_best_detector_for_algoritm(X_unlabeled, Y_unlabeled, lof,
                                            LOF_HYPER_PARAMS, k_fold))
        params_kmn.append(
            calc_best_detector_for_algoritm(X_unlabeled, Y_unlabeled, kmn,
                                            KMN_HYPER_PARAMS, k_fold))
        params_cov.append(
            calc_best_detector_for_algoritm(X_unlabeled, Y_unlabeled, cov,
                                            COV_HYPER_PARAMS, k_fold))

    write_output(params_isf, 'IsolationForest')
    write_output(params_svm, 'OneClassSVM')
    write_output(params_lof, 'LocalOutlierFactor')
    write_output(params_kmn, 'KMeans')
    write_output(params_cov, 'EllipticEnvelope')
Example No. 20
def lof(df, training_df):
    lof = LocalOutlierFactor(n_neighbors=20, contamination='auto')
    y_pred = lof.fit_predict(training_df)
    outliers = np.where(y_pred == -1)
    print('Removing ' + str(len(outliers[0])) + ' records')
    return df.drop(outliers[0])
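# Hedged usage sketch (illustrative data, not from the original): score outliers on a numeric
# frame and drop the flagged rows; here the same frame is used for both arguments.
import numpy as np
import pandas as pd
demo_df = pd.DataFrame(np.random.RandomState(0).randn(100, 3), columns=list("abc"))
demo_clean = lof(demo_df, demo_df)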
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.decomposition import PCA

data = loadmat('ex8data2.mat')

X = data['X']

e1 = EllipticEnvelope()
labels1 = e1.fit_predict(X)

e2 = LocalOutlierFactor()
labels2 = e2.fit_predict(X)

n_components = 3

pca1 = PCA(n_components=n_components)
Xproj = pca1.fit_transform(X)

plt.figure()
plt.clf()
ax = plt.axes(projection='3d')

# ax.scatter(image_array[:, 0], image_array[:, 1], image_array[:, 2], c=labels, cmap='coolwarm', marker=',')

ax.scatter(Xproj[:, 0], Xproj[:, 1], Xproj[:, 2], marker='o', c=labels1)
Example No. 22
    def __init__(self, name="局部异常因子"):  # default name means "Local Outlier Factor"
        self._model = LocalOutlierFactor()
        self.name = name
Example No. 23
            orient='horizontal',
            flierprops=flierprops,
            whiskerprops=whiskerprops,
            capprops=capprops)
#plt.savefig('Distribution.png',dpi=400,bbox_inches='tight')

scaledData = np.log(data)

ax = plt.figure(figsize=(8, 5)).gca(title='Log Sales Distribution',
                                    xlabel='Product',
                                    ylabel='Log Sales')
sns.violinplot(data=scaledData)
#plt.savefig('Violin.png',dpi=400,bbox_inches='tight')

# remove outliers
outliers = LocalOutlierFactor(n_neighbors=20, contamination=.05)
scaledData['inlier'] = outliers.fit_predict(scaledData)
cleanData = scaledData.loc[scaledData.inlier == 1, products]
#sns.pairplot(cleanData, plot_kws={'s': 5})
#plt.tight_layout();

sns.clustermap(cleanData.corr(),
               annot=True,
               fmt='.1%',
               center=0.0,
               vmin=-1,
               vmax=1,
               cmap=sns.diverging_palette(250, 10, n=20))
#plt.savefig('Heatmap.png',dpi=400,bbox_inches='tight')

# run PCA
Example No. 24
def BFS_Alg(Org_Vec, Queue, Data_to_write, Epsilon, max_ctx):
    Visited = []
    BFS_Vec = np.zeros(len(Org_Vec))
    for i in range(len(Org_Vec)):
        BFS_Vec[i] = Org_Vec[i]
    BFS_Flp = np.zeros(len(Org_Vec))
    termination_threshold = 500
    Terminator = 0
    # I use the Queue for visited nodes.
    # and just use sub_q here, for each sample I add the children to this sub_q without resetting it first
    sub_q = [[
        0,
        mp.exp(Epsilon * (Orgn_Ctx.shape[0])), Orgn_Ctx.shape[0], Org_Vec
    ]]
    contexts = [Org_Vec]
    while len(Visited) < 100:
        Terminator += 1
        if (Terminator > termination_threshold):
            break
    #print 'sub_q before: ', sub_q
        for i in range(len(sub_q)):
            sub_q[i][0] = i
        Sub_elements = [elem for elem in range(len(sub_q))]
        Sub_probabilities = []
        for prob in sub_q:
            Sub_probabilities.append(prob[1] /
                                     (sum([prob[1] for prob in sub_q])))
        SubRes = np.random.choice(Sub_elements, 1, p=Sub_probabilities)
        Queue.append([
            len(Queue), sub_q[SubRes[0]][1], sub_q[SubRes[0]][2],
            sub_q[SubRes[0]][3][:]
        ])
        #print 'Queue is:', Queue
        Visited.append(sub_q[SubRes[0]][3][:])
        #print 'Visited is:', Visited
        sub_q.remove(sub_q[SubRes[0]])
        #print 'Visited is:', Visited
        for Flp_bit in range(0, (len(BFS_Vec))):
            for i in range(len(BFS_Flp)):
                BFS_Flp[i] = Queue[len(Queue) - 1][3][i]
            Sub_Sal_list = []
            Sub_ID_list = []
            BFS_Flp[Flp_bit] = 1 - BFS_Flp[Flp_bit]
            BFS_Ctx  = df2.loc[df2['Weapon'].isin(FirAtt_lst[np.where(BFS_Flp[0:len(FirAtt_lst)] == 1)].tolist()) &\
                 df2['State'].isin(SecAtt_lst[np.where(BFS_Flp[len(FirAtt_lst):len(FirAtt_lst)+len(SecAtt_lst)] == 1)].tolist())  &\
                 df2['AgencyType'].isin(ThrAtt_lst[np.where(BFS_Flp[len(FirAtt_lst)+len(SecAtt_lst):len(FirAtt_lst)+len(SecAtt_lst)+len(ThrAtt_lst)] == 1)].tolist())]
            if ((not any(np.array_equal(BFS_Flp[:], x[:])
                         for x in Visited)) and
                (not any(np.array_equal(BFS_Flp[:], x[:]) for x in contexts))
                    and (BFS_Ctx.shape[0] > 20)):
                for row in range(BFS_Ctx.shape[0]):
                    #VictimAge is column 4 and the ID is on column 0
                    Sub_Sal_list.append(BFS_Ctx.iloc[row, 4])
                    Sub_ID_list.append(BFS_Ctx.iloc[row, 0])
                Sub_Sal_arr = np.array(Sub_Sal_list)
                clf = LocalOutlierFactor(n_neighbors=20)
                Sub_Sal_outliers = clf.fit_predict(Sub_Sal_arr.reshape(-1, 1))
                for outlier_finder in range(0, len(Sub_ID_list)):
                    if ((Sub_Sal_outliers[outlier_finder] == -1)
                            and (Sub_ID_list[outlier_finder] == Queried_ID)):
                        Sub_Score = mp.exp(Epsilon * (BFS_Ctx.shape[0]))
                        sub_q.append([
                            Flp_bit, Sub_Score, BFS_Ctx.shape[0],
                            np.zeros(len(Org_Vec))
                        ])
                        for i in range(len(sub_q[len(sub_q) - 1][3])):
                            sub_q[len(sub_q) - 1][3][i] = BFS_Flp[i]
                        contexts.append(np.zeros(len(Org_Vec)))
                        for i in range(len(Org_Vec)):
                            contexts[len(contexts) - 1][i] = BFS_Flp[i]
    # Exp mechanism on the visited nodes
    for i in range(len(Queue)):
        Queue[i][0] = i
    elements = [elem for elem in range(len(Queue))]
    probabilities = []
    for prob in Queue:
        probabilities.append(prob[1] / (sum([prob[1] for prob in Queue])))
    Res = np.random.choice(elements, 1, p=probabilities)
    Data_to_write.append(Queue[Res[0]][2] / max_ctx)
    return
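# Hedged illustration (not from the original) of the selection step above: each visited
# context is drawn with probability proportional to exp(Epsilon * context_size), an
# exponential-mechanism-style weighting that favours larger contexts containing the outlier.
import numpy as np
demo_epsilon = 0.1
demo_sizes = np.array([30.0, 50.0, 80.0])        # illustrative context sizes
demo_weights = np.exp(demo_epsilon * demo_sizes)
demo_probs = demo_weights / demo_weights.sum()   # the size-80 context gets the most mass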
Example No. 25
target = 'Class'
X = data[columns]
Y = data[target]
print(X.shape)
print(Y.shape)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
state = 1
classifiers = {
    'Isolation Forest':
    IsolationForest(max_samples=len(X),
                    contamination=outlier_fraction,
                    random_state=state),
    'Local Outlier Factor':
    LocalOutlierFactor(n_neighbors=20, contamination=outlier_fraction)
}
n_outliers = len(fraud)
for i, (clf_name, clf) in enumerate(classifiers.items()):
    if clf_name == 'Local Outlier Factor':
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()
    print('{}: {}'.format(clf_name, n_errors))
    print(accuracy_score(Y, y_pred))
Example No. 26
def main(camera_FPS, camera_width, camera_height, inference_scale, threshold,
         num_threads):

    interpreter = None
    input_details = None
    output_details = None

    path = "pictures/"
    if not os.path.exists(path):
        os.mkdir(path)

    model_path = "OneClassAnomalyDetection-RaspberryPi3/DOC/model/"
    if os.path.exists(model_path):
        # LOF
        print("LOF model building...")
        x_train = np.loadtxt(model_path + "train.csv", delimiter=",")

        ms = MinMaxScaler()
        x_train = ms.fit_transform(x_train)

        # fit the LOF model
        clf = LocalOutlierFactor(n_neighbors=5)
        clf.fit(x_train)

        # DOC
        print("DOC Model loading...")
        interpreter = interpreter_wrapper.Interpreter(
            model_path="models/tensorflow/weights.tflite")
        interpreter.allocate_tensors()
        interpreter.set_num_threads(num_threads)
        input_details = interpreter.get_input_details()
        output_details = interpreter.get_output_details()
        print("loading finish")
    else:
        print("Nothing model folder")
        sys.exit(0)

    base_range = min(camera_width, camera_height)
    stretch_ratio = inference_scale / base_range
    resize_image_width = int(camera_width * stretch_ratio)
    resize_image_height = int(camera_height * stretch_ratio)

    if base_range == camera_height:
        crop_start_x = (resize_image_width - inference_scale) // 2
        crop_start_y = 0
    else:
        crop_start_x = 0
        crop_start_y = (resize_image_height - inference_scale) // 2
    crop_end_x = crop_start_x + inference_scale
    crop_end_y = crop_start_y + inference_scale

    fps = ""
    message = "Push [p] to take a picture"
    result = "Push [s] to start anomaly detection"
    flag_score = False
    picture_num = 1
    elapsedTime = 0
    score = 0
    score_mean = np.zeros(10)
    mean_NO = 0

    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FPS, camera_FPS)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)

    time.sleep(1)

    while cap.isOpened():
        t1 = time.time()

        ret, image = cap.read()

        if not ret:
            break

        image_copy = image.copy()

        # prediction
        if flag_score == True:
            prepimg = cv2.resize(image,
                                 (resize_image_width, resize_image_height))
            prepimg = prepimg[crop_start_y:crop_end_y, crop_start_x:crop_end_x]
            prepimg = np.array(prepimg).reshape(
                (1, inference_scale, inference_scale, 3))
            prepimg = prepimg / 255

            interpreter.set_tensor(input_details[0]['index'],
                                   np.array(prepimg, dtype=np.float32))
            interpreter.invoke()
            outputs = interpreter.get_tensor(output_details[0]['index'])

            outputs = outputs.reshape((len(outputs), -1))
            outputs = ms.transform(outputs)
            score = -clf._decision_function(outputs)

        # output score
        if flag_score == False:
            cv2.putText(image, result, (camera_width - 350, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1,
                        cv2.LINE_AA)
        else:
            score_mean[mean_NO] = score[0]
            mean_NO += 1
            if mean_NO == len(score_mean):
                mean_NO = 0

            if np.mean(score_mean) > threshold:  #red if score is big
                cv2.putText(image, "{:.1f} Score".format(np.mean(score_mean)),
                            (camera_width - 230, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 1,
                            cv2.LINE_AA)
            else:  # green if score is small
                cv2.putText(image, "{:.1f} Score".format(np.mean(score_mean)),
                            (camera_width - 230, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1,
                            cv2.LINE_AA)

        # message
        cv2.putText(image, message, (camera_width - 285, 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
        cv2.putText(image, fps, (camera_width - 164, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 1, cv2.LINE_AA)

        cv2.imshow("Result", image)

        # FPS
        elapsedTime = time.time() - t1
        fps = "{:.0f} FPS".format(1 / elapsedTime)

        # quit or calculate score or take a picture
        key = cv2.waitKey(1) & 0xFF
        if key == ord("q"):
            break
        if key == ord("p"):
            cv2.imwrite(path + str(picture_num) + ".jpg", image_copy)
            picture_num += 1
        if key == ord("s"):
            flag_score = True

    cv2.destroyAllWindows()
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [("Robust covariance",
                       EllipticEnvelope(contamination=outliers_fraction)),
                      ("One-Class SVM",
                       svm.OneClassSVM(nu=outliers_fraction,
                                       kernel="rbf",
                                       gamma=0.1)),
                      ("Isolation Forest",
                       IsolationForest(contamination=outliers_fraction,
                                       random_state=42)),
                      ("Local Outlier Factor",
                       LocalOutlierFactor(n_neighbors=35,
                                          contamination=outliers_fraction))]

# Define datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5, **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]],
               cluster_std=[0.5, 0.5],
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]],
               cluster_std=[1.5, .3],
               **blobs_params)[0],
    4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
          np.array([0.5, 0.25])),
    14. * (np.random.RandomState(42).rand(n_samples, 2) - 0.5)
]
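# Hedged sketch (the excerpt stops before this step): fit each detector on each dataset and
# collect inlier(+1)/outlier(-1) labels; LOF uses fit_predict, the others fit then predict.
for X in datasets:
    for name, algorithm in anomaly_algorithms:
        if name == "Local Outlier Factor":
            y_pred = algorithm.fit_predict(X)
        else:
            y_pred = algorithm.fit(X).predict(X)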
Example No. 28
CONTAMINATION = 0.1

# try reading a csv
y_pred = []
filename = 'Outlier_multy_n={}_c={}.csv'.format(N_NEIGHBORS, CONTAMINATION)
try:
    out_frame = pd.read_csv(filename)
    y_pred = out_frame.Out
except FileNotFoundError:
    # file was not found, create and train new model, then print results to csv
    print('file ', filename, ' was not found :(')
    print('new file will be generated')
    print()
    print('create new classifier')
    outlier_clf = LocalOutlierFactor(n_neighbors=N_NEIGHBORS,
                                     contamination=CONTAMINATION)
    print("training model for contamination: ", CONTAMINATION, ', neighbors: ', N_NEIGHBORS)
    y_pred = outlier_clf.fit_predict(features)
    print("outliers detected, creating csv")

    # create new frame and print it to csv
    f = pd.DataFrame({'Out': y_pred})
    f.to_csv(filename)

# read data
train_og = pd.read_hdf("train.h5", "train")
all_data = pd.read_hdf("train.h5", "train").drop(['y'], axis = 1)

# insert outlier-column
train_og.insert(0, column = 'outlier', value = y_pred)
Example No. 29
    if dataset_name == "SA":
        lb = LabelBinarizer()
        x1 = lb.fit_transform(X[:, 1].astype(str))
        x2 = lb.fit_transform(X[:, 2].astype(str))
        x3 = lb.fit_transform(X[:, 3].astype(str))
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != b"normal.").astype(int)

    if dataset_name == "http" or dataset_name == "smtp":
        y = (y != b"normal.").astype(int)

    X = X.astype(float)

    print("LocalOutlierFactor processing...")
    model = LocalOutlierFactor(n_neighbors=20)
    tstart = time()
    model.fit(X)
    fit_time = time() - tstart
    scoring = -model.negative_outlier_factor_  # the lower, the more normal
    fpr, tpr, thresholds = roc_curve(y, scoring)
    AUC = auc(fpr, tpr)
    plt.plot(
        fpr,
        tpr,
        lw=1,
        label="ROC for %s (area = %0.3f, train-time: %0.2fs)" %
        (dataset_name, AUC, fit_time),
    )

plt.xlim([-0.05, 1.05])
Example No. 30
def anomaly_detection(testdata_name,rank_method_index,test_EVs_ts,test_MVs_ts):
    # Local Outlier Factor
    from sklearn.neighbors import LocalOutlierFactor
    from myFunctions import gen_dist_mat
    
    #
    experimentName = '{}_LOF'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]
    
    test_weather_ts = test_EVs_ts[0] # test weather data
    
    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # The 20 closest weather days (by the chosen ranking)
            weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20]
    
            print('{} - group length:{}'.format(n,len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue
            
    
            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0)
            
            LOF = LocalOutlierFactor(n_neighbors = 3,metric='precomputed')
            D = gen_dist_mat(NT_data) # distance matrix
            
            # if the distance matrix is all zeros (all TS are identical), skip this sample
            if len(D[D == 0]) == D.shape[0]*D.shape[1]:
                predictions.append('D=0')
                continue
                
            pred = LOF.fit_predict(D)
            predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later
            
            # if detected as outlier, save plot of MVs
            if pred[-1] == -1:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index,n],color='gold')
                #--------------------------------
                
                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------
                
                dir_loc = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                # check directory if exists
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()
            
        MV_predictions.append(np.array(predictions))
    
    
    p_fault = np.empty(MV_predictions[0].shape, dtype=bool)  # faulty
    p_normal = np.empty(MV_predictions[0].shape, dtype=bool)  # normal
    p_lack = np.empty(MV_predictions[0].shape, dtype=bool)  # lack of data
    p_fault[:] = False
    p_normal[:] = True # False
    p_lack[:] = True # False
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions=='-1')
        normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0')
        p_normal = np.logical_and(p_normal,normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions=='len<')
        
    # the indices of ts sample which are considered faulty
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]
    
    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100)
    nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100)
    ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100)
    
    print(fd_rate)
    print(nd_rate)
    print(ld_rate)
    
    # Save results:
    dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    with open(dir_loc+'\\results.txt','w') as f:
        f.write(fd_rate + '\n' + nd_rate+ '\n' + ld_rate)
    
    
    
    
    
    
    
    # Isolation Forest
    
    from sklearn.ensemble import IsolationForest
    from myFunctions import gen_dist_mat
    
    #
    experimentName = '{}_IsolationForest'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]
    
    # test_weather_ts = test_EVs_ts[0] # test weather data
    
    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # The 20 closest weather days (by the chosen ranking)
            weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20]
            
            print('{} - group length:{}'.format(n,len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue
            
    
            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0)
            
            D = gen_dist_mat(NT_data) # distance matrix
            
            # if the distance matrix is all zeros (all TS are identical), skip this sample
            if len(D[D == 0]) == D.shape[0]*D.shape[1]:
                predictions.append('D=0')
                continue
            
            IsoForest = IsolationForest()
            IsoForest.fit(NT_data)
            pred = IsoForest.predict(NT_data)    
            
            predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later
            
            # if detected as outlier, save plot of MVs
            if pred[-1] == -1:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index,n],color='gold')
                #--------------------------------
                
                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------
                
                dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                # check directory if exists
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()
            
        MV_predictions.append(np.array(predictions))
    
    
    p_fault = np.empty(MV_predictions[0].shape, dtype=bool)  # faulty
    p_normal = np.empty(MV_predictions[0].shape, dtype=bool)  # normal
    p_lack = np.empty(MV_predictions[0].shape, dtype=bool)  # lack of data
    p_fault[:] = False
    p_normal[:] = True # False
    p_lack[:] = True # False
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions=='-1')
        normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0')
        p_normal = np.logical_and(p_normal,normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions=='len<')
        
    # the indices of ts sample which are considered faulty
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]
    
    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100)
    nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100)
    ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100)
    
    print(fd_rate)
    print(nd_rate)
    print(ld_rate)
    
    # Save results:
    dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    with open(dir_loc+'\\results.txt','w') as f:
        f.write(fd_rate + '\n' + nd_rate+ '\n' + ld_rate)