Example No. 1
import numpy as np
from sklearn.neighbors import LocalOutlierFactor


def detect_anomalies(kills):
    num_neighbors = min(KILL_NUM_NEIGHBORS, len(kills) - 1)
    contam = min(float(KILL_MAX_ANOM) / len(kills), 0.2)
    lof = LocalOutlierFactor(n_neighbors=num_neighbors, metric="manhattan",
                             contamination=contam)

    kill_vals = np.array([[k.value / 1e6] for k in kills])
    res = lof.fit_predict(kill_vals)

    # keep the records that LOF labelled -1 (outliers)
    return [kills[i] for i in np.flatnonzero(res == -1)]
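# Minimal usage sketch (not from the original source): KILL_NUM_NEIGHBORS,
# KILL_MAX_ANOM and the Kill record type below are hypothetical stand-ins for
# whatever the surrounding module actually defines.
from collections import namedtuple

KILL_NUM_NEIGHBORS = 20
KILL_MAX_ANOM = 5
Kill = namedtuple("Kill", ["id", "value"])

sample_kills = [Kill(i, v) for i, v in
                enumerate([12e6, 15e6, 9e6, 480e6, 11e6, 14e6])]
flagged = detect_anomalies(sample_kills)  # records LOF labelled -1
print([k.id for k in flagged])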
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

from storage import DataStorage

np.random.seed(42)

storage = DataStorage()

records = pd.read_csv(storage.augmented_dataset_file_name)
X = records[['average_cpu', 'average_memory']]
y = records['is_normal']

clf = LocalOutlierFactor(n_neighbors=5, contamination=0.1)

y_pred = clf.fit_predict(X)
X_scores = clf.negative_outlier_factor_

plt.title("Local Outlier Factor (LOF)")
plt.scatter(X.iloc[:, 0].values, X.iloc[:, 1].values, color='k', s=3.,
            label='Data points')
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(X.iloc[:, 0].values, X.iloc[:, 1].values, s=1000 * radius,
            edgecolors='r',
            facecolors='none', label='Outlier scores')

plt.ylabel('Average memory usage')
plt.xlabel('Average CPU usage')

legend = plt.legend(loc='upper left')
legend.legendHandles[0]._sizes = [10]
# TerminalSN_le = preprocessing.LabelEncoder()
# preprocessed_features["TerminalSN"] = TerminalSN_le.fit_transform(preprocessed_features["TerminalSN"])

# EventID_le = preprocessing.LabelEncoder()
# preprocessed_features["EventID"] = EventID_le.fit_transform(preprocessed_features["EventID"])

# Split data set, not needed as it is unsupervised
# train_features, test_features = train_test_split(preprocessed_features, test_size=0.2)

# Begin Training
neigh = LocalOutlierFactor(n_neighbors=300,
                           leaf_size=100,
                           novelty=False,
                           algorithm="auto",
                           contamination=0.01)
train_outliers = neigh.fit_predict(preprocessed_features)  # On training data

# Compile into Data Frame for print
outlier_result_df = pd.DataFrame()
# outlier_result_df["UserID"] = UserID_le.inverse_transform(preprocessed_features["UserID"])
# outlier_result_df["TerminalSN"] = TerminalSN_le.inverse_transform(preprocessed_features["TerminalSN"])
# outlier_result_df["Timestamps"] = raw_df["TIMESTAMPS"]
outlier_result_df["Time_Of_Day"] = preprocessed_features["Time_Of_Day"]
outlier_result_df["Outlier"] = train_outliers
print(outlier_result_df)

# Get Percentage of Outliers
outlier_percentage = len(outlier_result_df.loc[outlier_result_df["Outlier"] ==
                                               -1]) / len(outlier_result_df)
print(outlier_percentage)
Example No. 4
# Plot after the modifications are complete
locate = []
position = []
l = []
num = 0
sum = 0
line = write.get_line(dodification_path)

for i in range(1, len(lat) - 1):
    if abs(lat[i] - lat[i + 1]) + abs(lng[i] - lng[i + 1]) < 0.00037 and abs(
            lat[i] - lat[i - 1]) + abs(lng[i] - lng[i - 1]) < 0.00037:
        locate.append([lat[i], lng[i], int(time.mktime(m_t[i])), i])

cls = LocalOutlierFactor(n_neighbors=190, contamination=c)
k = cls.fit_predict(locate)

for i in range(len(k)):
    if k[i] == 1:
        position.append(locate[i])

length = int(input("Enter the segmentation length between anomaly points: "))

# For AA00002, length = 300
# For AB00006, length = 400
# For AD00003, length = 500
# For AD00013, length = 300
# For AD00053, length = 700
# For AD00083, length = 300
# For AD00419, length = 300
# For AF00098, length = 300
Example No. 5
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
import data_generation as DG

data, data_error = DG.generate_random_data(100, 100, 10)
raw_data = data
# Add X to real data
# data = np.reshape(raw_data, (-1,1))
data = (data - min(data)) / (max(data) - min(data))

# # fit the model
clf = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
y_pred = clf.fit_predict(data)
raw_y_pred = clf.negative_outlier_factor_
#y_pred_outliers = y_pred[200:]

plt.plot(data)
plt.scatter(np.arange(len(raw_y_pred)), raw_y_pred, c='red')
#plt.scatter(np.arange(len(y_pred)),y_pred,c='red')

# indices with the most negative LOF scores, taken as the detected outliers
detected_outliers = (sorted(range(len(raw_y_pred)),
                            key=lambda i: raw_y_pred[i])[:len(data_error)])
error_percentage = np.mean(
    data_error != raw_data[sorted(detected_outliers)]) * 100
print("The error percentage: ", error_percentage, "%")
plt.show()
Example No. 6
''' local outlier factor '''
labels = []
removalPairs = []  # [inliers, outliers]
cont = ['auto', 'auto', 'auto']  # contamination for each k in order
outlierCount = []
ks = [28, 56, 112]

# do detection for each k, later plotted
for j in range(0, len(ks)):
    inliers = []
    k = ks[j]
    data = nitrateMg.copy()
    time, inliers = remove_missing_values(time, data)
    localFactorDetection = LocalOutlierFactor(n_neighbors=k,
                                              contamination=cont[j])
    pred = localFactorDetection.fit_predict(inliers.reshape(-1, 1))
    #1 is an inlier, -1 is an outlier

    count = 0
    outliers = np.zeros(len(inliers)) + float('nan')
    # using pred as a mask, separate outliers and inliers
    for i in range(0, len(pred)):
        if (pred[i] == -1):
            outliers[i] = inliers[i]
            inliers[i] = float('nan')
            count += 1

    outlierCount.append(count)
    removalPairs.append([inliers, outliers, time])
    labels.append('%d neighbours. Outliers: %d (%0.1f%%)' %
                  (k, count, 100 * float(count) / len(data)))
Example No. 7
            flierprops=flierprops,
            whiskerprops=whiskerprops,
            capprops=capprops)
#plt.savefig('Distribution.png',dpi=400,bbox_inches='tight')

scaledData = np.log(data)

ax = plt.figure(figsize=(8, 5)).gca(title='Log Sales Distribution',
                                    xlabel='Product',
                                    ylabel='Log Sales')
sns.violinplot(data=scaledData)
#plt.savefig('Violin.png',dpi=400,bbox_inches='tight')

# remove outliers
outliers = LocalOutlierFactor(n_neighbors=20, contamination=.05)
scaledData['inlier'] = outliers.fit_predict(scaledData)
cleanData = scaledData.loc[scaledData.inlier == 1, products]
#sns.pairplot(cleanData, plot_kws={'s': 5})
#plt.tight_layout();

sns.clustermap(cleanData.corr(),
               annot=True,
               fmt='.1%',
               center=0.0,
               vmin=-1,
               vmax=1,
               cmap=sns.diverging_palette(250, 10, n=20))
#plt.savefig('Heatmap.png',dpi=400,bbox_inches='tight')

# run PCA
        out_df = out_df.append(working, ignore_index=True)

    out_df = out_df[blank_dict.keys()]
    out_df.to_csv(out_csv, index=False)

if visualize:
    print('Visualizing')
    brain_vol_df = pd.read_csv(brain_vol_csv)

    collated_csv = os.path.join(out_folder, 'collated.csv')
    clean_table = pd.read_csv(collated_csv, index_col='mr_id')
    clean_table = clean_table[clean_table['exclude'] != 1]

    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.06)

    y_pred = clf.fit_predict(clean_table)
    #y_pred_unsort = y_pred.copy()
    x_scores = clf.negative_outlier_factor_
    #x_scores_unsort = x_scores.copy()
    clean_table['outlier'] = y_pred

    clean_table['normal_control'] = [
        all([i, not j])
        for i, j in zip(clean_table['control'], clean_table['sci'])
    ]
    clean_table['sci_control'] = [
        all([i, j]) for i, j in zip(clean_table['control'], clean_table['sci'])
    ]

    clean_table['normal_scd'] = [
        all([i, not j]) for i, j in zip(clean_table['scd'], clean_table['sci'])
Example No. 9
    fig2 = pht.plot_components(forecast)
    fig.savefig("{}/TimeSeries_fbProphet.png".format(foldername),
                bbox_inches='tight',
                dpi=100)
    fig2.savefig("{}/TimeSeries_fbProphet_components.png".format(foldername),
                 bbox_inches='tight',
                 dpi=100)

    ################################################################################
    # Anomaly Detection
    ################################################################################

    # Perform Local Outlier detection
    plt.clf()
    localOutlier = LocalOutlierFactor()
    local_pred = localOutlier.fit_predict(
        daily_transits["Transits"].values.reshape(-1, 1))
    x_range = range(len(daily_transits["Transits"]))
    plt.scatter(x_range, daily_transits["Transits"], c=local_pred)
    plt.xlabel("Day")
    plt.ylabel("Relative Transit Uses")
    plt.gcf().set_size_inches((16.0, 8.0), forward=False)
    plt.savefig("{}/AnomalyDetection_LocalOutlier.png".format(foldername),
                bbox_inches='tight',
                dpi=100)

    #perform K nearest Neighbor clustering
    knn = 20
    temp = daily_transits
    temp = temp.drop(columns=["Date"])
    try:
        nbrs = NearestNeighbors(
Example No. 10
def outlier_lof(df):
    lof = LocalOutlierFactor(n_jobs = -1)
    lof_res = lof.fit_predict(df)
    outliers_lof = [i for i in range(len(lof_res)) if lof_res[i] == -1]
    
    return outliers_lof
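# Quick usage sketch on synthetic data (not part of the original snippet):
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
demo = pd.DataFrame(rng.normal(size=(40, 2)), columns=["x", "y"])
demo.iloc[0] = [8.0, 8.0]          # inject one obvious outlier
print(outlier_lof(demo))           # row positions labelled -1 by LOF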
Example No. 11
def pre_select_data(selection, norm):
    Trainset = np.loadtxt('Trainset.csv', delimiter=',')
    (train_num, b) = Trainset.shape
    feature = b - 1
    Test = np.loadtxt('Test.csv', delimiter=',')
    test_num = Test.shape[0]
    Train_label = Trainset[:, feature]
    Train_info = Trainset[:, 0:feature]
    Test_info = Test[:, 0:feature]

    if selection == 1:
        fs = mutual_info_classif(X=Train_info, y=Train_label)
        count3 = 0
        for i in range(0, feature):
            if fs[i] == 0:
                print(i)
                count3 = count3 + 1
        data_new = np.zeros((train_num, b - count3))
        test_new = np.zeros((test_num, b - count3))
        count4 = 0
        for i in range(0, feature):
            if fs[i] != 0:
                data_new[:, count4] = Trainset[:, i]
                test_new[:, count4] = Test[:, i]
                count4 = count4 + 1
        feature = count4
        data_new[:, feature] = Train_label
        test_new[:, feature] = Test[:, b - 1]
        print('feature = ', feature)
    if selection == 2:
        clf = ExtraTreesClassifier()
        clf = clf.fit(Train_info, Train_label)
        model = SelectFromModel(clf, prefit=True)
        Train_info = model.transform(Train_info)
        Test_info = model.transform(Test_info)
        feature = Train_info.shape[1]
        data_new = np.zeros((train_num, feature + 1))
        test_new = np.zeros((test_num, feature + 1))
        data_new[:, 0:feature] = Train_info
        data_new[:, feature] = Train_label
        test_new[:, 0:feature] = Test_info
        test_new[:, feature] = Test[:, b - 1]
        print('feature = ', feature)
    if selection == 3:
        (us, fs) = f_classif(X=Train_info, y=Train_label)
        count3 = 0
        for i in range(0, feature):
            if fs[i] >= 0.05:
                print(i)
                count3 = count3 + 1
        data_new = np.zeros((train_num, b - count3))
        test_new = np.zeros((test_num, b - count3))
        count4 = 0
        for i in range(0, feature):
            if fs[i] < 0.05:
                data_new[:, count4] = Trainset[:, i]
                test_new[:, count4] = Test[:, i]
                count4 = count4 + 1
        feature = count4
        data_new[:, feature] = Train_label
        test_new[:, feature] = Test[:, b - 1]
        np.savetxt('dd.csv', data_new, delimiter=',')
        print('feature = ', feature)
    if selection == 0:
        feature = b - 1
        data_new = Trainset
        test_new = Test

    Train_data = data_new[:, 0:feature]
    Train_label = data_new[:, feature]
    Test_data = test_new[:, 0:feature]
    Test_label = test_new[:, feature]

    np.savetxt('Bayes_label.csv', Train_label, delimiter=',')
    np.savetxt('Bayes.csv', Train_data, delimiter=',')
    np.savetxt('BayesTest.csv', Test_data, delimiter=',')
    np.savetxt('Bayes_TL.csv', Test_label, delimiter=',')

    if norm == 1:
        scaler = StandardScaler()
        scaler.fit(Train_data)
        Train_data = scaler.transform(Train_data)
        Test_data = scaler.transform(Test_data)
    if norm == 2:
        scaler = MinMaxScaler()
        scaler.fit(Train_data)
        Train_data = scaler.transform(Train_data)
        Test_data = scaler.transform(Test_data)

    data_new[:, 0:feature] = Train_data
    data_new[:, feature] = Train_label
    test_new[:, 0:feature] = Test_data
    test_new[:, feature] = Test_label

    np.savetxt('datanewtrain.csv', data_new, delimiter=',')
    np.savetxt('datanewtest.csv', test_new, delimiter=',')

    #balance the data
    label1 = 0
    label2 = 0
    train_num = data_new.shape[0]
    for j in range(1, train_num):
        if data_new[j, feature] == 0:
            label1 = label1 + 1
        else:
            label2 = label2 + 1
    ratio = int(np.ceil(label1 / label2))
    count2 = 0
    B_Trainset = np.zeros(((ratio - 1) * label2, feature))
    for i in range(0, train_num):
        if data_new[i, feature] == 1:
            for c in range(0, ratio - 1):
                B_Trainset[count2 + c, :] = data_new[i, 0:feature]
            count2 = count2 + ratio - 2
    B_Trainset = B_Trainset[[i for i, x in enumerate(B_Trainset) if x.any()]]
    cut = B_Trainset.shape[0]
    dev = []
    for e in range(0, feature):
        dev.append(np.std(B_Trainset[:, e]))

    noisy = np.zeros((cut, feature))

    for b in range(0, feature):
        for c in range(0, cut):
            noisy[c, b] = np.random.uniform(-0.1 * dev[b], 0.1 * dev[b])

    B_Trainset = B_Trainset + noisy
    B_data = np.zeros((cut, feature + 1))
    B_data[:, 0:feature] = B_Trainset
    B_data[:, feature] = 1
    datab = np.vstack((data_new, B_data))

    # shuffle the data
    datab = shuffle(datab)

    TL = datab[:, feature]
    TD = datab[:, 0:feature]

    # Outlier Detection
    train_num = TD.shape[0]
    LOF = LocalOutlierFactor(n_neighbors=80)
    Outlier = LOF.fit_predict(TD, TL)
    Train = np.zeros((train_num, feature))
    Tlabel = np.zeros(train_num)
    count3 = 0
    for c in range(0, train_num):
        if Outlier[c] == 1:
            Train[count3, :] = TD[c, :]
            Tlabel[count3] = TL[c]
            count3 = count3 + 1
    # keep only the rows that were actually filled in above
    Train = Train[:count3]
    Tlabel = Tlabel[:count3]

    np.savetxt('B_Trainset_data.csv', Train, delimiter=',')
    np.savetxt('B_Trainset_label.csv', Tlabel, delimiter=',')
    np.savetxt('Test_data.csv', Test_data, delimiter=',')
    np.savetxt('Test_label.csv', Test_label, delimiter=',')
Example No. 12
# C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
# C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
# C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
# C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
# C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
# coords = np.vstack((C1, C2, C3, C4, C5, C6))

##
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

LOF = LocalOutlierFactor(n_neighbors=20)
iForest = IsolationForest()

LOF.fit(coords)
lof_labels = LOF.fit_predict(coords)
iForest.fit(coords)
iforest_labels = iForest.predict(coords)

lof_scores = LOF.negative_outlier_factor_
LOF.threshold_
if_scores = iForest.decision_function(coords)
iForest.threshold_

# plot normalized scores
plt.figure()
plt.plot((lof_scores - np.mean(lof_scores)) / np.std(lof_scores))
plt.plot((if_scores - np.mean(if_scores)) / np.std(if_scores))

plt.hlines((LOF.threshold_ - np.mean(lof_scores)) / np.std(lof_scores),
           xmin=0,
Example No. 13
def LOF(data_file_path, k_list=[5, 20, 50]):
    """
	Use Local Outlier Facter algorithm to find outliers on the specific file
	:param data_file_path: The specific file path for input data
	:param k_list: (Optional) The list of neighbor numbers for LOF algorithm, default is k=5,20,50
	:return: None
	"""

    # Make csv data to dataframe
    df = pd.read_csv(data_file_path, encoding='latin-1')

    # Print sample data in dataframe
    print("========== Sample data ==========")
    print(df.iloc[:10])
    print()

    train_df = df

    # ===== Preprocessing ===== #

    # - Zillow file
    #
    # Remove zpid column (Zillow ID)
    if 'zpid' in list(train_df):
        train_df = train_df.drop(['zpid'], axis=1)

    # Remove latitude and longitude columns
    # if 'latitude' in list(train_df):
    # 	train_df = train_df.drop(['latitude'], axis=1)
    # if 'longitude' in list(train_df):
    # 	train_df = train_df.drop(['longitude'], axis=1)

    # Remove countryid column because all are same
    if 'countryid' in list(train_df) and len(set(train_df['countryid'])) == 1:
        train_df = train_df.drop(['countryid'], axis=1)

    # Convert some columns to be categorical such as cityid, countryid, zipcpde
    if 'cityid' in list(train_df):
        train_df['cityid'] = train_df['cityid'].apply(
            lambda x: str(x) + "_categorized")
    if 'zipcpde' in list(train_df):
        train_df['zipcpde'] = train_df['zipcpde'].apply(
            lambda x: str(x) + "_categorized")

    # - Other files
    #
    # Get rid of area_type, area_id: These features should not be used for using LOF because they are unique
    if 'area_id' in list(train_df) and 'area_type' in list(train_df):
        train_df = train_df.drop(['area_id', 'area_type'], axis=1)

    # Combine City and State to be one column
    # Or remove them because they are unique (or almost unique in some files)
    if 'City' in list(train_df) and 'State' in list(train_df):
        # train_df["CityState"] = train_df[['City', 'State']].apply(lambda x: ''.join(x), axis=1)
        train_df = train_df.drop(['City', 'State'], axis=1)

    # - All files
    #
    # Get rid of area_type: it is same for all rows
    train_df = train_df.loc[:, ~train_df.columns.str.contains(
        '^Unnamed')]  # Remove Unnamed columns if there is

    # Do one hot to convert categorical data to numeric data
    train_df = pd.get_dummies(train_df)

    print("========== Sample train data ==========")
    print(train_df.iloc[:10])
    print()

    # Normalization to make it easier to illustrate
    min_max_scaler = preprocessing.MinMaxScaler()
    train_data = min_max_scaler.fit_transform(train_df)

    # Dataframe after get rid of some columns and numerize any category columns
    print("========== Normalized train data ==========")
    print(train_data[:10])
    print()

    # ===== Local Outlier Factors ===== #
    #
    # Set a file for outlier summary
    SCRIPT_DIR = os.path.abspath(os.path.dirname(sys.argv[0]))
    data_file_name = data_file_path.split('/')[-1]
    summary_file_name = SCRIPT_DIR + '/' + data_file_name + '_outlier_summary.csv'
    f = open(summary_file_name, 'w')

    # Print header
    header = "k-neighbors,total, outliers, non-outliers, % outliers"
    f.write(header + '\n')

    # Try LOF with different k (default = 5, 20, 50)
    result_list = {}
    for k in k_list:
        # Fit the model
        clf = LocalOutlierFactor(n_neighbors=k)
        y_pred = clf.fit_predict(train_data)
        result_list["k=" + str(k)] = y_pred

        # Count number of outliers
        outlier_number = 0
        for y in y_pred:
            if y == -1:
                outlier_number += 1

        # Print sample prediction results
        print("========== Prediction results with k=%i ==========" % k)
        print("Total:", len(y_pred))
        print("Number of outliers:", outlier_number)
        print("Number of non-outliers:", (len(y_pred) - outlier_number))
        print(
            "Percentage of outliers:", "{0:.2f}%".format(
                outlier_number / len(y_pred) * 100))
        print("Outlier result:", y_pred)
        print()

        # Write summary to file
        line = ",".join([
            str(k),
            str(len(y_pred)),
            str(outlier_number),
            str(len(y_pred) - outlier_number), "{0:.2f}%".format(
                outlier_number / len(y_pred) * 100)
        ])
        f.write(line + '\n')
    f.close()

    # Set another file for outlier results
    result_file_name = SCRIPT_DIR + '/' + data_file_name + '_outlier_results.csv'
    pd.DataFrame(result_list).to_csv(result_file_name, index=False)
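# Hypothetical invocation; 'zillow_sample.csv' is a placeholder path, not a file
# shipped with the original example.
if __name__ == '__main__':
    LOF('zillow_sample.csv', k_list=[5, 20])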
Example No. 14
# Standardize the data column-wise: subtract each column's mean and divide by its standard deviation,
# so that every attribute/column is centered around 0 with a standard deviation of 1.
Y = np.array(X)
Y_scaled = preprocessing.scale(Y)

print(Y_scaled)
# DBSCAN clustering for anomaly detection
# defaults: eps=0.5, min_samples=5
#clf=DBSCAN(eps=0.8,metric='euclidean',algorithm='auto')
# defaults: n_neighbors=20, contamination=0.1
clf = LocalOutlierFactor(n_neighbors=10, contamination=0.08)

# Isolation Forest, defaults: n_estimators=100, contamination=0.1
# This call raises an error; there is a bug still to be fixed
#clf = IsolationForest(n_estimators=100, contamination=0.04)

y_pred = clf.fit_predict(Y_scaled)
#y_pred = clf.predict(Y_scaled)
print(clf)
print(y_pred)

x = [n[0] for n in X]
y = [n[1] for n in X]
# Visualization
plt.scatter(x, y, c=y_pred, marker='o')
plt.title("LOF-Babymother Data")
plt.xlabel("score")
plt.ylabel("reduced weight")
plt.legend(["user"])
plt.show()
data['cluster_label'] = y_pred
#data.to_excel('/Users/martin_yan/Desktop/clustering5.22-6.11(3).xlsx',index=False, encoding="utf_8_sig")
    
    return model

sen = label_sentences(cb_healthcare)
model = train_doc2vec_model(sen)


vector_list=[]
for i in range(len(cb_healthcare)):
    vector_list.append(model.docvecs[i])

#X,y=vector_list[0:1200],vector_list[1201:1226]

df_x = pd.DataFrame(X)
clf_lof = LocalOutlierFactor(n_neighbors=20,metric='euclidean')
y_pred = clf_lof.fit_predict(df_x)


X_scores = -(clf_lof.negative_outlier_factor_)

#%%
clf = LocalOutlierFactor(n_neighbors=80,metric='euclidean')
y_pred = clf.fit_predict(df_x)
y_pred_score = clf._decision_function(df_x)
scores_doc = -(clf.negative_outlier_factor_)

lofScores_10 = pd.DataFrame(scores_doc,columns = ['LOF_scores'])
#lofScores_10['N/M']=patent['기업이름']
sort = lofScores_10.sort_values(["LOF_scores"],ascending=[False])
sorts2 = np.array(sort)
#%%
Example No. 16
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
print(__doc__)

np.random.seed(42)

# Generate train data
X = 0.3 * np.random.randn(100, 2)
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.r_[X + 2, X - 2, X_outliers]

# fit the model
clf = LocalOutlierFactor(n_neighbors=20)
y_pred = clf.fit_predict(X)
y_pred_outliers = y_pred[200:]

# plot the level sets of the decision function
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Local Outlier Factor (LOF)")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

a = plt.scatter(X[:200, 0], X[:200, 1], c='white')
b = plt.scatter(X[200:, 0], X[200:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
Example No. 17
y_pred = []
filename = 'Outlier_multy_n={}_c={}.csv'.format(N_NEIGHBORS, CONTAMINATION)
try:
    out_frame = pd.read_csv(filename)
    y_pred = out_frame.Out
except FileNotFoundError:
    # file was not found, create and train new model, then print results to csv
    print('file ', filename, ' was not found :(')
    print('new file will be generated')
    print()
    print('create new classifier')
    outlier_clf = LocalOutlierFactor(n_neighbors = N_NEIGHBORS,
                                    contamination = CONTAMINATION
    )
    print("training model for corruption: ", CONTAMINATION, ', neighbors: ', N_NEIGHBORS)
    y_pred = outlier_clf.fit_predict(features)
    print("outliers detected, creating csv")

    # create new frame and print it to csv
    f = pd.DataFrame({'Out': y_pred})
    f.to_csv(filename)

# read data
train_og = pd.read_hdf("train.h5", "train")
all_data = pd.read_hdf("train.h5", "train").drop(['y'], axis = 1)

# insert outlier-column
train_og.insert(0, column = 'outlier', value = y_pred)

# NOTE: split here
train_og, X_test = train_test_split(train_og, test_size=0.33)
Example No. 18
    def detect_conceptually_irrelevant_name(self):
        identifiers_with_vector = [identifier for identifier in self.identifiers if identifier.vector is not None]
        identifiers_with_vector_original = deepcopy(identifiers_with_vector)
        vectors = [identifier.vector for identifier in self.identifiers if identifier.vector is not None]
        vectors_original = deepcopy(vectors)

        clf = LocalOutlierFactor(n_neighbors=int(len(vectors_original) / 3.), )
        y_pred = clf.fit_predict(vectors_original)
        lof_scores = clf.negative_outlier_factor_
        # Normalization
        lof_scores_normalized_original = (lof_scores.max() - lof_scores) / (lof_scores.max() - lof_scores.min())
        print('lof_scores_normalized_original', lof_scores_normalized_original)
        print('average naming debt: "{0}"'.format(np.mean(lof_scores_normalized_original)))
        Visualized.draw_names_plot(identifiers=identifiers_with_vector_original,
                                   lof_scores_normalized=lof_scores_normalized_original)
        if len(vectors_original) < 10:
            return

        vectors_avg = np.mean(vectors, axis=0)
        iteration = 0
        id_out_list = list()
        id_in_list = list()

        flag = True

        while flag and iteration < 100:
            print('-' * 75)
            print('iteration "{}" ...'.format(iteration))
            clf = LocalOutlierFactor(n_neighbors=int(len(vectors) / 3.), )
            y_pred = clf.fit_predict(vectors)
            lof_scores = clf.negative_outlier_factor_
            # Normalization
            lof_scores_normalized = (lof_scores.max() - lof_scores) / (lof_scores.max() - lof_scores.min())
            for i, identifier in enumerate(identifiers_with_vector):
                identifier.local_outlier_factor = lof_scores_normalized[i]

            identifiers_with_vector_sorted = sorted(identifiers_with_vector,
                                                    key=lambda k: k.local_outlier_factor,
                                                    reverse=True)

            print('Average naming debt is "{0}"'.format(np.mean(lof_scores_normalized)))

            id_out_list.append(deepcopy(identifiers_with_vector_sorted[0]))
            print('The identifier "{0}" should be renamed'.format(identifiers_with_vector_sorted[0]))
            # Avg version
            remained_identifiers_vectors = [identifier.vector for identifier in identifiers_with_vector_sorted[1:]]
            # remained_identifiers_vectors_avg = np.mean(remained_identifiers_vectors, axis=0)
            # print('remained_identifiers_vectors_avg', remained_identifiers_vectors_avg)
            # recommended_names = self.model.wv.similar_by_vector(remained_identifiers_vectors_avg, topn=10)
            # distance_to_neighbor = [distance.cosine(id_out_list[-1].vector, vector)
            #                         for vector in remained_identifiers_vectors]

            # for i, dist in enumerate(distance_to_neighbor):
            #     if dist == 0:
            #         distance_to_neighbor[i] = max(distance_to_neighbor)

            # nearest_neighbor_index = distance_to_neighbor.index(min(distance_to_neighbor))

            nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(remained_identifiers_vectors)
            distances, indices = nbrs.kneighbors([id_out_list[-1].vector], n_neighbors=5)
            print('I ', indices[0])
            random.shuffle(indices[0])
            nearest_neighbor_indice = indices[0][0]

            recommended_names = self.model.wv.similar_by_vector(remained_identifiers_vectors[nearest_neighbor_indice],
                                                                topn=len(self.identifiers) + 1)

            print('similar by vector,', recommended_names)

            # Check post conditions
            # Check if the name is verb for function or noun for attribute

            recommended_name = recommended_names[0][0]
            rank = 1
            while recommended_name in [identifier.id_name for identifier in identifiers_with_vector]\
                    or len(recommended_name) < 4\
                    or recommended_name in ['char', 'int', 'float', 'double', 'string', 'class']:
                recommended_name = recommended_names[rank][0]
                rank += 1

            for identifier in identifiers_with_vector:
                if identifier.unique_number == id_out_list[-1].unique_number:
                    identifier.id_name = recommended_name
                    identifier.parts = identifier.get_identifier_parts()
                    identifier.vector = identifier.get_single_vector_for_identifier(model=self.model)
                    id_in_list.append(deepcopy(identifier))
                    print('##### id changed', identifier.unique_number)
                    break

            vectors = [identifier.vector for identifier in identifiers_with_vector]
            vectors_avg_new = np.mean(vectors, axis=0)
            d = distance.cosine(vectors_avg, vectors_avg_new)
            print('distance', d)
            print('improvement', )
            if d <= 0.05:
                flag = False
            vectors_avg = vectors_avg_new
            iteration += 1

        print('Number of iterations: "{0}"'.format(iteration))
        print('To be renamed ids: "{0}"'.format([identifier.id_name for identifier in id_out_list]))
        print('Recommended names ids: "{0}"'.format([identifier.id_name for identifier in id_in_list]))
        Visualized.draw_names_plot(identifiers=identifiers_with_vector,
                                   lof_scores_normalized=lof_scores_normalized)

        print('Final IDs', [identifier.id_name for identifier in identifiers_with_vector])
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.decomposition import PCA

data = loadmat('ex8data2.mat')

X = data['X']

e1 = EllipticEnvelope()
labels1 = e1.fit_predict(X)

e2 = LocalOutlierFactor()
labels2 = e2.fit_predict(X)

n_components = 3

pca1 = PCA(n_components=n_components)
Xproj = pca1.fit_transform(X)

plt.figure()
plt.clf()
ax = plt.axes(projection='3d')

# ax.scatter(image_array[:, 0], image_array[:, 1], image_array[:, 2], c=labels, cmap='coolwarm', marker=',')

ax.scatter(Xproj[:, 0], Xproj[:, 1], Xproj[:, 2], marker='o', c=labels1)

plt.show()
Example No. 20
def anomaly_detection(testdata_name,rank_method_index,test_EVs_ts,test_MVs_ts):
    # Local Outlier Factor
    from sklearn.neighbors import LocalOutlierFactor
    from myFunctions import gen_dist_mat
    
    #
    experimentName = '{}_LOF'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]
    
    test_weather_ts = test_EVs_ts[0] # test weather data
    
    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # the 20 closest weather days
            weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20]
    
            print('{} - group length:{}'.format(n,len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue
            
    
            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0)
            
            LOF = LocalOutlierFactor(n_neighbors = 3,metric='precomputed')
            D = gen_dist_mat(NT_data) # distance matrix
            
            # if distance matrix are all zeros(all TS are identical), then skip this
            if len(D[D == 0]) == D.shape[0]*D.shape[1]:
                predictions.append('D=0')
                continue
                
            pred = LOF.fit_predict(D)
            predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later
            
            # if detected as outlier, save plot of MVs
            if pred[-1] == -1:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index,n],color='gold')
                #--------------------------------
                
                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------
                
                dir_loc = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                # check directory if exists
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()
            
        MV_predictions.append(np.array(predictions))
    
    
    p_fault = np.empty(MV_predictions[0].shape,dtype = bool) # faulty
    p_normal = np.empty(MV_predictions[0].shape,dtype = bool) # normal
    p_lack = np.empty(MV_predictions[0].shape,dtype = bool) # lack of data
    p_fault[:] = False
    p_normal[:] = True # False
    p_lack[:] = True # False
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions=='-1')
        normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0')
        p_normal = np.logical_and(p_normal,normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions=='len<')
        
    # the indices of ts sample which are considered faulty
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]
    
    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100)
    nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100)
    ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100)
    
    print(fd_rate)
    print(nd_rate)
    print(ld_rate)
    
    # Save results:
    dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    with open(dir_loc+'\\results.txt','w') as f:
        f.write(fd_rate + '\n' + nd_rate+ '\n' + ld_rate)
    
    
    
    
    
    
    
    # Isolation Forest
    
    from sklearn.ensemble import IsolationForest
    from myFunctions import gen_dist_mat
    
    #
    experimentName = '{}_IsolationForest'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]
    
    # test_weather_ts = test_EVs_ts[0] # test weather data
    
    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # the 20 closest weather days
            weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20]
            
            print('{} - group length:{}'.format(n,len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue
            
    
            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0)
            
            D = gen_dist_mat(NT_data) # distance matrix
            
            # if distance matrix are all zeros(all TS are identical), then skip this
            if len(D[D == 0]) == D.shape[0]*D.shape[1]:
                predictions.append('D=0')
                continue
            
            IsoForest = IsolationForest()
            IsoForest.fit(NT_data)
            pred = IsoForest.predict(NT_data)    
            
            predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later
            
            # if detected as outlier, save plot of MVs
            if pred[-1] == -1:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index,n],color='gold')
                #--------------------------------
                
                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------
                
                dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                # check directory if exists
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()
            
        MV_predictions.append(np.array(predictions))
    
    
    p_fault = np.empty(MV_predictions[0].shape,dtype = bool) # faulty
    p_normal = np.empty(MV_predictions[0].shape,dtype = bool) # normal
    p_lack = np.empty(MV_predictions[0].shape,dtype = bool) # lack of data
    p_fault[:] = False
    p_normal[:] = True # False
    p_lack[:] = True # False
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions=='-1')
        normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0')
        p_normal = np.logical_and(p_normal,normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions=='len<')
        
    # the indices of ts sample which are considered faulty
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]
    
    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100)
    nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100)
    ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100)
    
    print(fd_rate)
    print(nd_rate)
    print(ld_rate)
    
    # Save results:
    dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    with open(dir_loc+'\\results.txt','w') as f:
        f.write(fd_rate + '\n' + nd_rate+ '\n' + ld_rate)
Example No. 21
class LOFStep():
    def __init__(self, include_y=True, kwargs={'contamination': 'auto'}):
        """ Uses the local outlier factor to detect and remove outliers. Uses sklearn’s LocalOutlierFactor class.
        
        Parameters
        ----------
        include_y (bool, default=True) : Whether or not to include the y data when fitting the local outlier factor

        kwargs (dict, default={'contamination': 'auto'}) : arguments to pass to sklearn’s LocalOutlierFactor class initialization
        """
        self.description = "Local Outlier Factor"
        self.include_y = include_y
        self.kwargs = kwargs
        self.fitted = None
        self.changes_num_samples = True

    def fit(self, X, y=None):
        """ Fits the outlier detection on the given data
        
        Parameters
        ----------
        X (DataFrame) : training data

        y (DataFrame, default=None) : target values (if needed)

        Returns
        -------
        (DataFrame, DataFrame) : A tuple of the transformed DataFrames, the first being the X data and the second being the y data
        """
        self.fitted = LocalOutlierFactor(**self.kwargs)
        return self.transform(X, y=y)

    def transform(self, X, y=None):
        """ Transforms the given data using the previously fitted outlier detection method
        
        Parameters
        ----------
        X (DataFrame) : training data

        y (DataFrame, default=None) : target values (if needed)

        Returns
        -------
        (DataFrame, DataFrame) : A tuple of the transformed DataFrames, the first being the X data and the second being the y data
        """
        if self.fitted is None:
            raise TransformError

        outlier_labels = self.fitted.fit_predict(X, y)

        # Remove outliers from data
        for i in range(outlier_labels.shape[0]):
            if outlier_labels[i] == -1:
                X = X.drop(index=i)
                if y is not None:
                    y = y.drop(index=i)

        if y is None:
            return X.reset_index(drop=True)

        y = y.reset_index(drop=True)
        return X.reset_index(drop=True), y
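# Minimal usage sketch on synthetic data (not part of the original example):
if __name__ == '__main__':
    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
    X_demo.iloc[0] = [10.0, 10.0, 10.0]        # one obvious outlier
    y_demo = pd.DataFrame({'target': rng.normal(size=100)})

    step = LOFStep(kwargs={'n_neighbors': 20, 'contamination': 'auto'})
    X_clean, y_clean = step.fit(X_demo, y_demo)
    print(len(X_demo), '->', len(X_clean), 'rows after LOF filtering')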
Example No. 22
    'n_neighbors': 10,
    'n_clusters': 3
}

# connectivity matrix for structured Ward
# connectivity = kneighbors_graph(
#    array, n_neighbors=params['n_neighbors'], include_self=False)
# make connectivity symmetric
# connectivity = 0.5 * (connectivity + connectivity.T)

# algorithms
# two_means = cluster.MiniBatchKMeans(n_clusters=5)

clf = LocalOutlierFactor(n_neighbors=10)

y_pred = clf.fit_predict(array)
outliers = y_pred[200:]
vals = clf.negative_outlier_factor_

# print (y_pred)
# print(vals)

dist = list()
for each in array:
    dist.append(np.power(each[0], 2) + np.power(each[1], 2))

npList = np.column_stack(
    (df.getData().index.values, df.getData().iloc[:, 0:], dist))

print(npList)
Example No. 23
def _LocalOutlierFactor(X):
    n = int(round(X.shape[0] * 0.2))
    clf = LocalOutlierFactor(n_neighbors=n)
    return clf.fit_predict(X)
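# Illustrative call on synthetic data (n_neighbors becomes 20% of the sample count):
import numpy as np

X_demo = np.vstack([np.random.RandomState(1).normal(size=(50, 2)), [[6.0, 6.0]]])
labels = _LocalOutlierFactor(X_demo)   # array of 1 (inlier) / -1 (outlier)
print(int((labels == -1).sum()), 'points flagged as outliers')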
Example No. 24
df2['Salary Paid'] = df2['Salary Paid'].apply(lambda x:x.split('.')[0].strip()).replace({'\$':'', ',':''}, regex=True)


FirAtt_lst = df2['Job Title'].unique()
SecAtt_lst = df2['Employer'].unique()
ThrAtt_lst = df2['Calendar Year'].unique()

###################################     Forming a context   #######################################
Orgn_Ctx = df2.loc[df2['Job Title'].isin([FirAtt_lst[0],FirAtt_lst[1],FirAtt_lst[2],FirAtt_lst[3], FirAtt_lst[4]]) & \
                   df2['Employer'].isin([SecAtt_lst[0],SecAtt_lst[1], SecAtt_lst[2],SecAtt_lst[3], SecAtt_lst[4], SecAtt_lst[5]]) & \
                   df2['Calendar Year'].isin([ThrAtt_lst[0],ThrAtt_lst[1],ThrAtt_lst[2],ThrAtt_lst[3],ThrAtt_lst[4]])]


#######################     Finding an outlier in the selected context      #######################
clf = LocalOutlierFactor(n_neighbors=20)
Sal_outliers = clf.fit_predict(Orgn_Ctx['Salary Paid'].values.reshape(-1,1))
Queried_ID = Orgn_Ctx.iloc[Sal_outliers.argmin()][1]

print('\n\n Outlier\'s ID in the selected context is: ', Queried_ID)

################# Exploring Contexts larger than the original to find the maximal #################
FirAtt_Sprset = sum(map(lambda r: list(combinations(FirAtt_lst[5:], r)), range(1, len(FirAtt_lst[5:])+1)), [])
SecAtt_Sprset = sum(map(lambda r: list(combinations(SecAtt_lst[6:], r)), range(1, len(SecAtt_lst[6:])+1)), [])
ThrAtt_Sprset = sum(map(lambda r: list(combinations(ThrAtt_lst[5:], r)), range(1, len(ThrAtt_lst[5:])+1)), [])

Sub_pop        =  []
Sub_pop_count  =  0
Epsilon        =  0.1  ### Privacy Parameter
output         =  []
context        =  []
Example No. 25
outliers_fraction = 0.2

lof = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)

np.random.seed(42)
# Data generation
mean1 = [0, 0]
mean2 = [3.5, 4]
cov1 = [[1.5, -0.3], [-0.2, .5]]
cov2 = [[0.75, 0.4], [0.3, 0.5]]

X = np.r_[np.random.multivariate_normal(mean1, cov1, 100),
          np.random.multivariate_normal(mean2, cov2, 100)]
# Add outliers

y_pred = lof.fit_predict(X)
scores_pred = lof.negative_outlier_factor_

plt.figure(figsize=(18, 9))

subplot = plt.subplot(1, 2, 1)
b = subplot.scatter(X[:, 0],
                    X[:, 1],
                    c=['k' if y == 1 else 'r' for y in y_pred],
                    s=20)
subplot = plt.subplot(1, 2, 2)
b = subplot.scatter(X[:, 0],
                    X[:, 1],
                    c=-np.log(-scores_pred),
                    s=20,
                    cmap=plt.get_cmap('Reds'))
print(x_value.shape)
print(y_value.shape)

# Algorithms used: Isolation Forest and Local Outlier Factor are common anomaly detection methods
random_isolation = IsolationForest(max_samples=len(x_value),
                                   contamination=outlier_value,
                                   random_state=3)
local_outlier = LocalOutlierFactor(n_neighbors=12, contamination=outlier_value)

n_outlier = len(fraudal_count)
#fit and predict
random_isolation.fit(x_value)
score_prediction = random_isolation.decision_function(x_value)
y_predict_isf = random_isolation.predict(x_value)

y_predict_lof = local_outlier.fit_predict(x_value)
score_prediction = local_outlier.negative_outlier_factor_

#Change the value to 0 for valid and 1 for fradual cases.
y_predict_isf[y_predict_isf == 1] = 0
y_predict_isf[y_predict_isf == -1] = 1
y_predict_lof[y_predict_lof == 1] = 0
y_predict_lof[y_predict_lof == -1] = 1

n_error_isf = (y_predict_isf != y_value).sum()
n_error_lof = (y_predict_lof != y_value).sum()
print("Error value for Isolation forest ", n_error_isf)
print("Error value for local outlier function ", n_error_lof)

print(accuracy_score(y_value, y_predict_isf))
print(accuracy_score(y_value, y_predict_lof))
Example No. 27
            for row in outdsreader:
                if ((list(row.values())[0] in FirAtt_Sprset[i]) &
                    (list(row.values())[1] in SecAtt_Sprset[j])):
                    pop_size += 1
                    Sal_list.append(row['Salary(K)'])
                    ID_list.append(row['ID'])

    #####################         Outlier detection in subpopulations      #############################

            if (pop_size != 0):
                #Score = np.exp(Epsilon *np.log(pop_size))    ### Score Calculation
                #Score = np.exp(Epsilon *(pop_size))
                Score = np.exp(Epsilon * (pop_size**(1. / 3)))
                Sal_arr = np.array(Sal_list)
                clf = LocalOutlierFactor(n_neighbors=4)
                Sal_outliers = clf.fit_predict(Sal_arr.reshape(-1, 1))
                for outlier_finder in range(0, len(ID_list)):
                    if ((ID_list[outlier_finder] == Queried_ID) &
                        (Sal_outliers[outlier_finder] == -1)):
                        Sub_pop.append([i, j, pop_size, Score, Sub_pop_count])
                        Sub_pop_count += 1

    Sub_pop_sorted = sorted(Sub_pop, key=lambda Sub_pop: Sub_pop[2])
    print('\n\nSubpopulations are [Att1_index, Att2_index, Population_size, Score, ID]\n\n', Sub_pop)
    print('\n\nSubpopulations sorted based on the score are [Att1_index, Att2_index, Population_size, Score, ID]\n\n',
          Sub_pop_sorted)

    ############          Max subpopulation with least number of attribute values for outlier        ###########

    outlier_index = len(Sub_pop) - 1
    while (Sub_pop_sorted[outlier_index -
Example No. 28
def lof(df, training_df):
    lof = LocalOutlierFactor(n_neighbors=20, contamination='auto')
    y_pred = lof.fit_predict(training_df)
    outliers = np.where(y_pred == -1)
    print('Removing ' + str(len(outliers[0])) + ' records')
    return df.drop(outliers[0])
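# Usage sketch with synthetic frames (column names are made up): training_df holds
# the numeric features LOF is fitted on, df is the frame rows are dropped from.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
feats = pd.DataFrame(rng.normal(size=(60, 2)), columns=['f1', 'f2'])
feats.iloc[0] = [9.0, 9.0]
full = feats.copy()
full['label'] = rng.randint(0, 2, size=60)
print(lof(full, feats).shape)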
Example No. 29
def remove_outliers_lof(data, k=10):
    k = min((len(data), k))
    lof = LocalOutlierFactor(n_neighbors=k)
    stays = lof.fit_predict(data)
    return np.array(data)[stays == 1]
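# Quick check on a small synthetic sample (illustrative only):
import numpy as np

pts = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1], [0.1, 0.1], [5.0, 5.0]])
print(remove_outliers_lof(pts, k=3))   # the distant point should usually be dropped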
Example No. 30
class LocalOutlierFactorFilter:
    """
    训练与预测一体,没有单独的train和test接口
    关键参数:n_neighbors : int, optional (default=20):参与预测的点的数量,无明显规律
    contamination": 可以反映过滤强度, 越大过滤强度越大
    """
    def __init__(self, name="局部异常因子"):
        self._model = LocalOutlierFactor()
        self.name = name

    def get_params(self, deep=True):
        """
        获得模型参数
        """
        return self._model.get_params(deep=deep)

    def _get_valid_params(self):
        """
        获取有效参数
        :return: List
        """
        param = self.get_params()
        return [i for i in param.keys()]

    def set_params(self, **new_params):
        """
        设置模型参数
        :param new_params: 模型参数键值
        只将模型参数包含的超参赋值给模型
        :return:
        """
        for k in new_params.keys():
            if k not in self._get_valid_params():
                raise ValueError("传入参数含有模型中不包含的参数")
                break
        feed_dict = {
            k: v
            for k, v in new_params.items() if k in self._get_valid_params()
        }
        if len(feed_dict) == 0:
            warnings.warn("模型参数未被修改")
        self._model.set_params(**feed_dict)

    def fit_predict(self, x):
        """
        :param x: training data
        :return: outlier labels for the training data (1 = inlier, -1 = outlier)
        """
        return self._model.fit_predict(x)

    def _connect_SQL(self, **json_file):
        """
        连接到SQL
        :param json_file: 入参
        :return:None
        """
        json_dict = json_file
        self._SQL = SQLServer(host=json_dict['dbinfo']['ip'],
                              port=json_dict['dbinfo']['port'],
                              user=json_dict['dbinfo']['username'],
                              pwd=json_dict['dbinfo']['password'],
                              db=json_dict['dbinfo']['databasename'])

    def get_data_label(self, **json_file):
        """
        从数据库调取数据集的标签
        :param json_file:
        :return: 仅含有标签的数据集 pd.dataFrame
        """
        json_dict = json_file
        data_label = self._SQL.df_read_sqlserver(
            table=json_dict['dbinfo']['inputtable'],
            cols=json_dict['label_columns'])
        if data_label.shape[1] != 1:
            raise ValueError("错误:标签列数不为1")
        return data_label

    def get_data_features(self, **json_file):
        """
        从数据库调取数据集
        :param json_file:入参, json
        :return: 仅含有特征变量的数据集 pd.dataFrame
        """
        json_dict = json_file
        data_features = self._SQL.df_read_sqlserver(
            table=json_dict['dbinfo']['inputtable'],
            cols=json_dict['data_columns'])
        return data_features

    def train_predict_from_sql(self, **json_file):
        """
        训练模型并将模型保存
        :param json_file: 入参,json
        :return:是否成功
        """
        try:
            self._connect_SQL(**json_file)
            self.set_params(**json_file["model_params"])
            features = self.get_data_features(**json_file)
            pre = pd.DataFrame(self.fit_predict(features))
            self._model.columns = features.columns.values.tolist()
            self.save_model(json_file["model_path"])  # save temporarily
            pre.columns = ["label"]
            pre.to_csv(json_file["save_path"], index=False)
            write = self._SQL.df_write_sqlserver(
                table=json_file['dbinfo']['outputtable'],
                df=pre,
                cols=json_file['data_columns'])
            return {"info": write}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def train_predict_from_csv(self, **json):
        try:
            features = pd.read_csv(json["path"], usecols=json['data_columns'])
            self.set_params(**json["model_params"])
            pre = pd.DataFrame(self.fit_predict(features))
            self._model.columns = json['data_columns']
            self.save_model(json["model_path"])  # 暂时保存
            pre.columns = ["label"]
            pre.to_csv(json["save_path"], index=False)
            return {"info": "success"}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def train_predict_from_xls(self, **json):
        try:
            features = pd.read_excel(json["path"],
                                     usecols=json['data_columns'])
            self.set_params(**json["model_params"])
            pre = pd.DataFrame(self.fit_predict(features))
            self._model.columns = json['data_columns']
            self.save_model(json["model_path"])  # save temporarily
            pre.columns = ["label"]
            pre.to_csv(json["save_path"], index=False)
            return {"info": "success"}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def save_model(self, model_path):
        """
        保存模型
        :param model_path: 模型保存路径
        :return:是否成功
        """
        try:
            joblib.dump(self._model, model_path)
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def get_model(self):
        """
        调用模型
        :return:模型
        """
        try:
            return self._model
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def load_model(self, **json):
        model_path = json['model_path']
        self._model = joblib.load(model_path)
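# Minimal local usage sketch (bypasses the SQL/json plumbing; synthetic data,
# not part of the original module):
if __name__ == '__main__':
    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    demo = pd.DataFrame(rng.normal(size=(200, 3)), columns=['f1', 'f2', 'f3'])
    demo.iloc[0] = [8.0, 8.0, 8.0]

    flt = LocalOutlierFactorFilter()
    flt.set_params(n_neighbors=20, contamination=0.05)
    labels = flt.fit_predict(demo)        # 1 = keep, -1 = filtered out
    print('filtered rows:', int((labels == -1).sum()))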
Example No. 31
def anomaly_detection(testdata_name,
                      rank_method_index,
                      test_EVs_ts,
                      test_MVs_ts,
                      fig_loc,
                      result_loc,
                      contam,
                      savefig_=True):
    '''
    Runs LOF and Isolation Forest for fault detection.
    Starts with using given rank function to group test_EVs_ts data to 
    weather_ts data, then compare MVs data with test_MVs_ts data using LOF and Isolation Forest
    
    -----------------------------------------------------------------------------
    global inputs:
        weather_ts: Divided TS weather data, numpy array in NT format
        MVs_ts: Corresponding divided TS MVs data, numpy array in NT format
        n_seg: number of segments for PAA conversion
    -----------------------------------------------------------------------------
    inputs:
        testdata_name: folder name of testing dataset, used to print out progress
        rank_method_index: index to identify rank method used
        test_EVs_ts: Divided TS EVs data, numpy array in NT format
        test_MVs_ts: Corresponding divided TS MVs data, numpy array in NT format
        fig_loc: folder path for saved faulty figure plots
        result_loc: folder path for fault detection rate result text files
        contam: contamination parameter used for scikit-learn anomaly detection algorithms
        savefig_: save figure if set to True, default is True
    outputs:
        Faulty TS is saved as a plot
        The fault detection rate of a dataset is saved in a text file
    '''
    # Local Outlier Factor
    from sklearn.neighbors import LocalOutlierFactor
    from myFunctions import gen_dist_mat

    #
    experimentName = '{}_LOF'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]

    test_weather_ts = test_EVs_ts[0]  # test weather data

    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # the 30 closest weather days
            weather_group = rank_group(weather_ts,
                                       test_weather_ts[n])['Day'][:30]

            print('{} - group length:{}'.format(n, len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue

            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index, n].reshape(
                (1, MVs_ts[MV_index, weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate(
                (MVs_ts[MV_index, weather_group], test_data_point), axis=0)

            LOF = LocalOutlierFactor(n_neighbors=10,
                                     metric='precomputed',
                                     contamination=contam)
            D = gen_dist_mat(NT_data)  # distance matrix

            # if the distance matrix is all zeros (all TS are identical), skip this sample
            if len(D[D == 0]) == D.shape[0] * D.shape[1]:
                predictions.append('D=0')
                continue

            pred = LOF.fit_predict(D)
            predictions.append(
                str(pred[-1])
            )  # change to string to avoid comparison error in numpy later

            # if detected as outlier, save plot of MVs
            if pred[-1] == -1 and savefig_:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index, c],
                             color='steelblue',
                             alpha=0.5,
                             linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index, n], color='gold')
                #--------------------------------

                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------

                # dir_loc = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                dir_loc = fig_loc + r'\{}\{}\{}'.format(
                    rank_method_name, experimentName, MVs[MV_index])

                # check directory if exists
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()

        MV_predictions.append(np.array(predictions))

    p_fault = np.empty(MV_predictions[0].shape, dtype=bool)  # faulty
    p_normal = np.empty(MV_predictions[0].shape, dtype=bool)  # normal
    p_lack = np.empty(MV_predictions[0].shape, dtype=bool)  # lack of data
    p_fault[:] = False  # a sample is faulty if any MV flags it
    p_normal[:] = True  # normal only if every MV is normal (or identical, 'D=0')
    p_lack[:] = True  # lack of data only if every MV reported 'len<'
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions == '-1')
        normal_with_identical = np.logical_or(predictions == '1',
                                              predictions == 'D=0')
        p_normal = np.logical_and(p_normal, normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions == 'len<')

    # the indices of ts sample which are considered faulty
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]

    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(
        len(fault_index) / test_weather_ts.shape[0] * 100)
    nd_rate = 'Normal operation rate:\t {}%'.format(
        len(normal_index) / test_weather_ts.shape[0] * 100)
    ld_rate = 'Lack of data rate:\t {}%'.format(
        len(lack_index) / test_weather_ts.shape[0] * 100)

    print(fd_rate)
    print(nd_rate)
    print(ld_rate)

    # Save results:
    # dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    dir_loc = result_loc + r'\{}\{}'.format(rank_method_name, experimentName)

    # check directory if exists
    if not os.path.exists(dir_loc):
        os.makedirs(dir_loc)

    with open(dir_loc + '\\results.txt', 'w') as f:
        f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)

    # save prediction results
    predArr_lof = np.array(
        MV_predictions).T  # NF format(row:day/sample, col:MV)
    header = np.array(MVs).reshape(1, len(MVs))  # add header
    predArr_lof = np.concatenate((header, predArr_lof), axis=0)
    np.savetxt(dir_loc + '\\MV_predictions.csv',
               predArr_lof,
               fmt='%s',
               delimiter=',')

    # Isolation Forest

    from sklearn.ensemble import IsolationForest
    from myFunctions import gen_dist_mat

    #
    experimentName = '{}_IsolationForest'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]

    # test_weather_ts = test_EVs_ts[0] # test weather data

    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # The 30 closest weather days (ranked by rank_group)
            weather_group = rank_group(weather_ts,
                                       test_weather_ts[n])['Day'][:30]

            print('{} - group length:{}'.format(n, len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue

            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index, n].reshape(
                (1, MVs_ts[MV_index, weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate(
                (MVs_ts[MV_index, weather_group], test_data_point), axis=0)

            D = gen_dist_mat(NT_data)  # distance matrix

            # if the distance matrix is all zeros (all TS are identical), skip this sample
            if len(D[D == 0]) == D.shape[0] * D.shape[1]:
                predictions.append('D=0')
                continue

            IsoForest = IsolationForest(contamination=contam)
            IsoForest.fit(NT_data)
            pred = IsoForest.predict(NT_data)

            predictions.append(
                str(pred[-1])
            )  # change to string to avoid comparison error in numpy later

            # if detected as outlier, save plot of MVs
            if pred[-1] == -1 and savefig_:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index, c],
                             color='steelblue',
                             alpha=0.5,
                             linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index, n], color='gold')
                #--------------------------------

                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------

                # dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                dir_loc = fig_loc + r'\{}\{}\{}'.format(
                    rank_method_name, experimentName, MVs[MV_index])

                # check directory if exists
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()

        MV_predictions.append(np.array(predictions))

    p_fault = np.empty(MV_predictions[0].shape, dtype=bool)  # faulty
    p_normal = np.empty(MV_predictions[0].shape, dtype=bool)  # normal
    p_lack = np.empty(MV_predictions[0].shape, dtype=bool)  # lack of data
    p_fault[:] = False  # a sample is faulty if any MV flags it
    p_normal[:] = True  # normal only if every MV is normal (or identical, 'D=0')
    p_lack[:] = True  # lack of data only if every MV reported 'len<'
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions == '-1')
        normal_with_identical = np.logical_or(predictions == '1',
                                              predictions == 'D=0')
        p_normal = np.logical_and(p_normal, normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions == 'len<')

    # the indices of ts sample which are considered faulty
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]

    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(
        len(fault_index) / test_weather_ts.shape[0] * 100)
    nd_rate = 'Normal operation rate:\t {}%'.format(
        len(normal_index) / test_weather_ts.shape[0] * 100)
    ld_rate = 'Lack of data rate:\t {}%'.format(
        len(lack_index) / test_weather_ts.shape[0] * 100)

    print(fd_rate)
    print(nd_rate)
    print(ld_rate)

    # Save results:
    # dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    dir_loc = result_loc + r'\{}\{}'.format(rank_method_name, experimentName)

    # check directory if exists
    if not os.path.exists(dir_loc):
        os.makedirs(dir_loc)

    with open(dir_loc + '\\results.txt', 'w') as f:
        f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)

    # save prediction results
    predArr_iForest = np.array(
        MV_predictions).T  # NF format(row:day/sample, col:MV)
    header = np.array(MVs).reshape(1, len(MVs))  # add header
    predArr_iForest = np.concatenate((header, predArr_iForest), axis=0)
    np.savetxt(dir_loc + '\\MV_predictions.csv',
               predArr_iForest,
               fmt='%s',
               delimiter=',')
    # return prediction results
    return (predArr_lof, predArr_iForest)
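Before the next example, a self-contained sketch of the LOF-on-precomputed-distances step that both halves of anomaly_detection repeat; the Euclidean cdist below is only a stand-in assumption for myFunctions.gen_dist_mat, whose metric is not shown here:

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(0)
train_ts = rng.random((30, 24))     # 30 reference days, 24 PAA segments each
test_ts = rng.random((1, 24))       # one test day
NT_data = np.concatenate((train_ts, test_ts), axis=0)

D = cdist(NT_data, NT_data)         # square distance matrix, shape (31, 31)
lof = LocalOutlierFactor(n_neighbors=10, metric='precomputed', contamination=0.05)
pred = lof.fit_predict(D)           # -1 = outlier, 1 = inlier
print(pred[-1])                     # label of the appended test day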
Exemplo n.º 32
0
def BFS_Alg(Org_Vec, Queue, Data_to_write, Epsilon, max_ctx):
    Visited = []
    BFS_Vec = np.zeros(len(Org_Vec))
    for i in range(len(Org_Vec)):
        BFS_Vec[i] = Org_Vec[i]
    BFS_Flp = np.zeros(len(Org_Vec))
    termination_threshold = 500
    Terminator = 0
    # The Queue is used for visited nodes; sub_q holds candidate children and
    # is extended for each sample without being reset first.
    sub_q = [[
        0,
        mp.exp(Epsilon * (Orgn_Ctx.shape[0])), Orgn_Ctx.shape[0], Org_Vec
    ]]
    contexts = [Org_Vec]
    while len(Visited) < 100:
        Terminator += 1
        if (Terminator > termination_threshold):
            break
        # print('sub_q before: ', sub_q)
        for i in range(len(sub_q)):
            sub_q[i][0] = i
        Sub_elements = [elem for elem in range(len(sub_q))]
        Sub_probabilities = []
        for prob in sub_q:
            Sub_probabilities.append(prob[1] /
                                     (sum([prob[1] for prob in sub_q])))
        SubRes = np.random.choice(Sub_elements, 1, p=Sub_probabilities)
        Queue.append([
            len(Queue), sub_q[SubRes[0]][1], sub_q[SubRes[0]][2],
            sub_q[SubRes[0]][3][:]
        ])
        #print 'Queue is:', Queue
        Visited.append(sub_q[SubRes[0]][3][:])
        #print 'Visited is:', Visited
        sub_q.remove(sub_q[SubRes[0]])
        #print 'Visited is:', Visited
        for Flp_bit in range(0, (len(BFS_Vec))):
            for i in range(len(BFS_Flp)):
                BFS_Flp[i] = Queue[len(Queue) - 1][3][i]
            Sub_Sal_list = []
            Sub_ID_list = []
            BFS_Flp[Flp_bit] = 1 - BFS_Flp[Flp_bit]
            BFS_Ctx  = df2.loc[df2['Weapon'].isin(FirAtt_lst[np.where(BFS_Flp[0:len(FirAtt_lst)] == 1)].tolist()) &\
                 df2['State'].isin(SecAtt_lst[np.where(BFS_Flp[len(FirAtt_lst):len(FirAtt_lst)+len(SecAtt_lst)] == 1)].tolist())  &\
                 df2['AgencyType'].isin(ThrAtt_lst[np.where(BFS_Flp[len(FirAtt_lst)+len(SecAtt_lst):len(FirAtt_lst)+len(SecAtt_lst)+len(ThrAtt_lst)] == 1)].tolist())]
            if ((not any(np.array_equal(BFS_Flp[:], x[:])
                         for x in Visited)) and
                (not any(np.array_equal(BFS_Flp[:], x[:]) for x in contexts))
                    and (BFS_Ctx.shape[0] > 20)):
                for row in range(BFS_Ctx.shape[0]):
                    # VictimAge is column 4 and the ID is in column 0
                    Sub_Sal_list.append(BFS_Ctx.iloc[row, 4])
                    Sub_ID_list.append(BFS_Ctx.iloc[row, 0])
                Sub_Sal_arr = np.array(Sub_Sal_list)
                clf = LocalOutlierFactor(n_neighbors=20)
                Sub_Sal_outliers = clf.fit_predict(Sub_Sal_arr.reshape(-1, 1))
                for outlier_finder in range(0, len(Sub_ID_list)):
                    if ((Sub_Sal_outliers[outlier_finder] == -1)
                            and (Sub_ID_list[outlier_finder] == Queried_ID)):
                        Sub_Score = mp.exp(Epsilon * (BFS_Ctx.shape[0]))
                        sub_q.append([
                            Flp_bit, Sub_Score, BFS_Ctx.shape[0],
                            np.zeros(len(Org_Vec))
                        ])
                        for i in range(len(sub_q[len(sub_q) - 1][3])):
                            sub_q[len(sub_q) - 1][3][i] = BFS_Flp[i]
                        contexts.append(np.zeros(len(Org_Vec)))
                        for i in range(len(Org_Vec)):
                            contexts[len(contexts) - 1][i] = BFS_Flp[i]
    # Exponential mechanism over the visited nodes
    for i in range(len(Queue)):
        Queue[i][0] = i
    elements = [elem for elem in range(len(Queue))]
    probabilities = []
    for prob in Queue:
        probabilities.append(prob[1] / (sum([prob[1] for prob in Queue])))
    Res = np.random.choice(elements, 1, p=probabilities)
    Data_to_write.append(Queue[Res[0]][2] / max_ctx)
    return
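A minimal sketch, with hypothetical scores, of the exponential-mechanism draw that closes BFS_Alg; np.exp stands in for the mpmath mp.exp used above, which only matters for very large Epsilon * context-size products:

import numpy as np

epsilon = 0.1
context_sizes = np.array([120, 80, 45, 200])  # hypothetical context sizes stored in Queue
scores = np.exp(epsilon * context_sizes)      # un-normalized exponential-mechanism weights
probabilities = scores / scores.sum()         # same normalization as the loop over Queue
chosen = np.random.choice(len(context_sizes), p=probabilities)
print(chosen, context_sizes[chosen])          # index and size of the selected context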