def test_lof_precomputed(random_state=42):
    """Check LOF on a precomputed distance matrix against LOF on features."""
    # Note: smaller samples may result in spurious test success
    generator = np.random.RandomState(random_state)
    train = generator.random_sample((10, 4))
    query = generator.random_sample((3, 4))
    dist_train = metrics.pairwise_distances(train, metric='euclidean')
    dist_query = metrics.pairwise_distances(query, train, metric='euclidean')

    # Reference estimator: fitted on the feature matrix
    # (n_samples by n_features)
    feature_lof = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
    feature_lof.fit(train)
    expected_train = feature_lof._predict()
    expected_query = feature_lof.predict(query)

    # Estimator under test: fitted on the dense distance matrix
    # (n_samples by n_samples)
    precomputed_lof = neighbors.LocalOutlierFactor(
        n_neighbors=3, algorithm='brute', metric='precomputed', novelty=True)
    precomputed_lof.fit(dist_train)
    actual_train = precomputed_lof._predict()
    actual_query = precomputed_lof.predict(dist_query)

    assert_array_almost_equal(expected_train, actual_train)
    assert_array_almost_equal(expected_query, actual_query)
def test_n_neighbors_attribute():
    """An oversized n_neighbors is reduced to n_samples - 1, with a warning."""
    data = iris.data
    max_neighbors = data.shape[0] - 1

    fitted = neighbors.LocalOutlierFactor(n_neighbors=500).fit(data)
    assert fitted.n_neighbors_ == max_neighbors

    # Same configuration again, this time checking the emitted warning.
    unfitted = neighbors.LocalOutlierFactor(n_neighbors=500)
    assert_warns_message(UserWarning,
                         "n_neighbors will be set to (n_samples - 1)",
                         unfitted.fit, data)
    assert unfitted.n_neighbors_ == max_neighbors
def test_n_neighbors_attribute():
    """An oversized n_neighbors is reduced to n_samples - 1, with a warning."""
    data = iris.data
    max_neighbors = data.shape[0] - 1

    fitted = neighbors.LocalOutlierFactor(n_neighbors=500).fit(data)
    assert fitted.n_neighbors_ == max_neighbors

    # Same configuration again, this time checking the emitted warning.
    unfitted = neighbors.LocalOutlierFactor(n_neighbors=500)
    expected = re.escape("n_neighbors will be set to (n_samples - 1)")
    with pytest.warns(UserWarning, match=expected):
        unfitted.fit(data)
    assert unfitted.n_neighbors_ == max_neighbors
def test_score_samples():
    """_score_samples equals _decision_function shifted by offset_."""
    train = [[1, 1], [1, 2], [2, 1]]
    query = [[2., 2.]]
    contaminated = neighbors.LocalOutlierFactor(n_neighbors=2,
                                                contamination=0.1).fit(train)
    default = neighbors.LocalOutlierFactor(n_neighbors=2).fit(train)

    for estimator in (contaminated, default):
        assert_array_equal(
            estimator._score_samples(query),
            estimator._decision_function(query) + estimator.offset_)

    # The raw scores must be the same for both contamination settings.
    assert_array_equal(contaminated._score_samples(query),
                       default._score_samples(query))
def test_novelty_errors():
    """Accessing methods that do not match the novelty mode must fail."""
    data = iris.data

    # novelty=False: predict, decision_function and score_samples must
    # raise AttributeError on access
    outlier_lof = neighbors.LocalOutlierFactor()
    outlier_lof.fit(data)
    for name in ('predict', 'decision_function', 'score_samples'):
        expected = '{} is not available when novelty=False'.format(name)
        assert_raises_regex(AttributeError, expected,
                            getattr, outlier_lof, name)

    # novelty=True: fit_predict must raise AttributeError on access
    novelty_lof = neighbors.LocalOutlierFactor(novelty=True)
    expected = 'fit_predict is not available when novelty=True'
    assert_raises_regex(AttributeError, expected,
                        getattr, novelty_lof, 'fit_predict')
def choose_models():
    """Return the candidate anomaly detectors and their parameter grids.

    Each entry is a dict with the keys 'name' (display name), 'class'
    (an unfitted estimator instance) and 'parameters' (parameter name ->
    candidate values).
    """
    isolation_forest = {
        'name': 'Isolation Forest',
        'class': ensemble.IsolationForest(),
        'parameters': {'n_estimators': [5, 10, 20, 50, 100, 150, 200]},
    }
    local_outlier_factor = {
        'name': 'Local Outlier Factor',
        'class': neighbors.LocalOutlierFactor(novelty=True),
        'parameters': {'n_neighbors': range(5, 50, 5)},
    }
    # One Class SVM is currently disabled; its configuration was:
    # ocSVM = {'name': 'One Class SVM',
    #          'class': svm.OneClassSVM(),
    #          'parameters': {
    #              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #              'nu': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    #          }
    #          }
    elliptic_envelope = {
        'name': 'Elliptic Envelope',
        'class': covariance.EllipticEnvelope(),
        'parameters': {'contamination': np.linspace(0.05, 0.45, 9)},
    }
    return [isolation_forest, local_outlier_factor, elliptic_envelope]
def remove_outliers_and_normalize(data_dirty, n_neighbors=20):
    """Drop the rows LOF labels as outliers, then normalize what is left.

    Returns a tuple ``(data_normalized, data_map)`` where ``data_map`` is
    the label array returned by ``LocalOutlierFactor.fit_predict`` on
    ``data_dirty`` and ``data_normalized`` is the ``Normalizer`` output for
    the rows whose label is positive.
    """
    detector = neighbors.LocalOutlierFactor(n_neighbors=n_neighbors)
    labels = detector.fit_predict(data_dirty)

    # Positive labels mark the rows that are kept.
    kept_rows = data_dirty[labels > 0]
    normalized = preprocessing.Normalizer().fit_transform(kept_rows)
    return normalized, labels
def test_novelty_training_scores():
    """Training scores stay available via negative_outlier_factor_ when
    novelty=True and match those obtained with the default mode."""
    data = iris.data

    # default mode (novelty=False)
    outlier_mode = neighbors.LocalOutlierFactor()
    outlier_mode.fit(data)

    # novelty mode
    novelty_mode = neighbors.LocalOutlierFactor(novelty=True)
    novelty_mode.fit(data)

    assert_array_almost_equal(outlier_mode.negative_outlier_factor_,
                              novelty_mode.negative_outlier_factor_)
def test_lof():
    """LOF separates the two outliers of a small toy data set."""
    # Toy sample (the last two samples are outliers):
    samples = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
               [5, 3], [-4, 2]]

    # Fit and read the training scores:
    lof = neighbors.LocalOutlierFactor(n_neighbors=5)
    lof.fit(samples)
    factors = lof.negative_outlier_factor_
    assert_array_equal(lof._fit_X, samples)

    # The largest outlier score must be smaller than the smallest
    # inlier score:
    smallest_inlier = np.min(factors[:-2])
    largest_outlier = np.max(factors[-2:])
    assert_greater(smallest_inlier, largest_outlier)

    # Predicted labels with a contamination of two samples out of eight:
    contaminated = neighbors.LocalOutlierFactor(contamination=0.25,
                                                n_neighbors=5).fit(samples)
    assert_array_equal(contaminated._predict(), 6 * [1] + 2 * [-1])
def test_novelty_errors():
    """Accessing methods that do not match the novelty mode must fail."""
    data = iris.data

    # novelty=False: predict, decision_function and score_samples must
    # raise AttributeError on access
    outlier_lof = neighbors.LocalOutlierFactor()
    outlier_lof.fit(data)
    for name in ("predict", "decision_function", "score_samples"):
        expected = "{} is not available when novelty=False".format(name)
        with pytest.raises(AttributeError, match=expected):
            getattr(outlier_lof, name)

    # novelty=True: fit_predict must raise AttributeError on access
    novelty_lof = neighbors.LocalOutlierFactor(novelty=True)
    expected = "fit_predict is not available when novelty=True"
    with pytest.raises(AttributeError, match=expected):
        getattr(novelty_lof, "fit_predict")
def test_score_samples():
    """score_samples equals decision_function shifted by offset_."""
    train = [[1, 1], [1, 2], [2, 1]]
    query = [[2.0, 2.0]]
    contaminated = neighbors.LocalOutlierFactor(
        n_neighbors=2, contamination=0.1, novelty=True
    ).fit(train)
    default = neighbors.LocalOutlierFactor(
        n_neighbors=2, novelty=True
    ).fit(train)

    for estimator in (contaminated, default):
        assert_array_equal(
            estimator.score_samples(query),
            estimator.decision_function(query) + estimator.offset_,
        )

    # The raw scores must be the same for both contamination settings.
    assert_array_equal(
        contaminated.score_samples(query), default.score_samples(query)
    )
def test_lof_values():
    """Compare LOF scores with hand-computed values on three toy points."""
    # toy samples:
    train = [[1, 1], [1, 2], [2, 1]]
    contaminated = neighbors.LocalOutlierFactor(n_neighbors=2,
                                                contamination=0.1).fit(train)
    default = neighbors.LocalOutlierFactor(n_neighbors=2).fit(train)
    estimators = (contaminated, default)

    # Expected values used in the assertions below.
    s_0 = 2. * sqrt(2.) / (1. + sqrt(2.))
    s_1 = (1. + sqrt(2)) * (1. / (4. * sqrt(2.)) + 1. / (2. + 2. * sqrt(2)))

    # scores of the training samples
    for estimator in estimators:
        assert_array_almost_equal(-estimator.negative_outlier_factor_,
                                  [s_0, s_1, s_1])
    # score of one sample not in train
    for estimator in estimators:
        assert_array_almost_equal(-estimator._score_samples([[2., 2.]]),
                                  [s_0])
    # score of one sample already in train
    for estimator in estimators:
        assert_array_almost_equal(-estimator._score_samples([[1., 1.]]),
                                  [s_1])
def test_hasattr_prediction():
    """The novelty flag decides which prediction methods are exposed."""
    train = [[1, 1], [1, 2], [2, 1]]
    novelty_only = ('predict', 'decision_function', 'score_samples')

    # when novelty=True
    novelty_lof = neighbors.LocalOutlierFactor(novelty=True)
    novelty_lof.fit(train)
    for name in novelty_only:
        assert hasattr(novelty_lof, name)
    assert not hasattr(novelty_lof, 'fit_predict')

    # when novelty=False
    outlier_lof = neighbors.LocalOutlierFactor(novelty=False)
    outlier_lof.fit(train)
    assert hasattr(outlier_lof, 'fit_predict')
    for name in novelty_only:
        assert not hasattr(outlier_lof, name)
def sk_check(X_train, X_test, y_test, o_list):
    """Score LOF and Isolation Forest as outlier detectors on ``X_test``.

    Three LocalOutlierFactor models (n_neighbors 5, 10 and 35) are fitted
    on ``X_test``; the one with the highest ROC AUC is reported in row 0.
    An IsolationForest fitted on ``X_train`` and scored on ``X_test`` is
    reported in row 1.

    Parameters
    ----------
    X_train : training samples, used to fit the IsolationForest only.
    X_test : test samples; the LOF models are fitted on these directly.
    y_test : labels of ``X_test``.
    o_list : label values that count as outliers.

    Returns
    -------
    pandas.DataFrame with columns 'method', 'AUC', 'MCC' and 'BRU' and two
    rows (0: best LOF, 1: IsolationForest).
    """
    f_f = [neighbors.LocalOutlierFactor(n_neighbors=5),
           neighbors.LocalOutlierFactor(n_neighbors=10),
           neighbors.LocalOutlierFactor(n_neighbors=35),
           IsolationForest(max_samples='auto')]
    f_name = ['LOF5', 'LOF10', 'LOF35', 'i-forest']

    columns = ['method', 'AUC', 'MCC', 'BRU']
    n_row = 2
    df = pd.DataFrame(columns=columns, index=np.arange(n_row))

    # Ground truth: True where the label is one of the outlier classes.
    # This is the same mask as OR-ing (y_test == i) over o_list, built
    # without generating and exec-ing source code.
    T_o = np.isin(y_test, o_list)

    # Row 0: keep the LOF variant with the best AUC.
    auc_max = -1
    for name, lof in zip(f_name[:3], f_f[:3]):
        lof.fit(X_test)
        outliers = -lof.negative_outlier_factor_
        auc_test = roc_auc_score(T_o, outliers)
        if auc_test > auc_max:
            auc_max = auc_test
            # .loc writes into df itself; chained df[col][row] = ... may
            # write into a temporary copy instead.
            df.loc[0, 'method'] = name
            df.loc[0, 'MCC'] = mce.MCC(T_o, outliers)
            df.loc[0, 'AUC'] = auc_max
            df.loc[0, 'BRU'] = mce.bru_score(T_o, outliers)

    # Row 1: Isolation Forest, trained on X_train and scored on X_test.
    df.loc[1, 'method'] = f_name[3]
    isof = f_f[3]
    isof.fit(X_train)
    scores_pred = isof.decision_function(X_test)
    # Flip the decision function so that larger means more abnormal.
    outliers = scores_pred.max() - scores_pred
    df.loc[1, 'MCC'] = mce.MCC(T_o, outliers)
    df.loc[1, 'AUC'] = roc_auc_score(T_o, outliers)
    df.loc[1, 'BRU'] = mce.bru_score(T_o, outliers)
    return df
def test_predicted_outlier_number(expected_outliers):
    """The number of predicted outliers should be equal to the number of
    expected outliers unless there are ties in the abnormality scores."""
    data = iris.data
    contamination = float(expected_outliers) / data.shape[0]

    lof = neighbors.LocalOutlierFactor(contamination=contamination)
    labels = lof.fit_predict(data)

    found = np.sum(labels != 1)
    if found == expected_outliers:
        return
    # Count mismatch: hand the scores to the corruption check.
    check_outlier_corruption(found, expected_outliers,
                             lof.negative_outlier_factor_)
def __train(self, train_data, columns, label, n_neighbors=None, distance_metric=None):
    """Build a LocalOutlierFactor model and fit it on the selected columns.

    Parameters
    ----------
    train_data : data indexed as ``train_data[columns]`` to get the
        training features.
    columns : column selection passed to ``train_data[...]``.
    label : not used by this method.
    n_neighbors : number of neighbors; when None, the first entry of the
        instance's n_neighbors list is used.
    distance_metric : metric passed to LocalOutlierFactor; when None, the
        first entry of the instance's distance-metric list is used.

    The fitted estimator is stored on the instance; nothing is returned.
    """
    # Identity checks: "== None" dispatches to the argument's __eq__.
    if n_neighbors is None:
        n_neighbors = self.__n_neighbors_list[0]
    if distance_metric is None:
        distance_metric = self.__distMetricsList[0]

    self.__model = neighbors.LocalOutlierFactor(n_neighbors=n_neighbors,
                                                algorithm='auto',
                                                metric=distance_metric)
    # The returned labels are discarded; the call is made to fit the model.
    self.__model.fit_predict(train_data[columns])
def train_and_save(training_data, outloc):
    """
    Trains a LOF algorithm for the purposes of novelty detection and pickles it.

    Standardizes the data first (transforms each column by subtracting the
    mean and then dividing by the stddev).

    Parameters
    ----------
    training_data : pandas.DataFrame
        The training data.
    outloc : str
        Path of the pickle file to write. The pickled object is the tuple
        described under Returns.

    Returns
    -------
    tuple
        ``(model, (means, stddevs))``: the fitted LocalOutlierFactor and
        the per-column means and standard deviations (two lists, in column
        order) used to transform the data.
    """
    columns = training_data.columns
    means = [training_data[col].mean() for col in columns]
    stddevs = [training_data[col].std() for col in columns]

    standard_data = standardize_data(training_data, (means, stddevs))

    lof = neighbors.LocalOutlierFactor(novelty=True)
    lof.fit(standard_data)

    out_obj = (lof, (means, stddevs))
    # Context manager so the file is flushed and closed even if dump fails.
    with open(outloc, "wb") as out_file:
        pickle.dump(out_obj, out_file)
    return out_obj
def test_lof_performance():
    """LOF in novelty mode ranks uniform noise above held-out inliers."""
    # Generate train/test data
    rng = check_random_state(2)
    normal_points = 0.3 * rng.randn(120, 2)
    train = normal_points[:100]
    held_out = normal_points[100:]

    # Generate some abnormal novel observations
    novelties = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[held_out, novelties]
    y_true = np.array([0] * 20 + [1] * 20)

    # fit the model for novelty detection
    lof = neighbors.LocalOutlierFactor(novelty=True).fit(train)

    # predict scores (the lower, the more normal)
    abnormality = -lof.decision_function(X_test)

    # check that roc_auc is good
    assert roc_auc_score(y_true, abnormality) > .99
def registerGroundTruth(truth):
    """Fit the multi-class model and one LOF model per object on ``truth``.

    ``truth`` maps each object to a sequence of entries whose first
    element (``entry[0]``) is used as the training sample. The module
    globals ``model``, ``modelOneClasses`` and ``histograms`` are updated.
    """
    global model
    global modelOneClasses
    global histograms
    global kernel

    model = svm.NuSVC(kernel=kernel)
    # One novelty detector per object, created before any fitting starts.
    for obj in truth:
        modelOneClasses[obj] = neighbors.LocalOutlierFactor(novelty=True)
    histograms = truth

    data = []
    labels = []
    for obj in truth:
        samples = [entry[0] for entry in truth[obj]]
        data.extend(samples)
        labels.extend([obj] * len(samples))
        modelOneClasses[obj].fit(samples)

    print('Fitting model to data')
    model.fit(data, labels)
def AdewoyinSavgolFilter(out_col,
                         dir_model_country,
                         windowlength=13,
                         polyorder=4,
                         types=('C', 'L'),
                         save_xlsx=False,
                         _auto_filter_date=False,
                         re_turn_graph=False):
    """Clip outliers and sign a series by its Savitzky-Golay trend.

    For every name in ``out_col`` the workbook
    ``dirs_excel[dir_model_country] + name + '.xlsx'`` (sheet 'Data') is
    loaded and passed through ``dateFilter``. For every column in
    ``types`` the following columns are added to the frame:

    * ``"<type> inliers"``: LocalOutlierFactor labels for the column.
    * ``"<type> ex. outliers"``: the column with magnitudes clipped to the
      largest absolute inlier value (sign preserved).
    * ``"<type>_Savgol_Filtered"``: Savitzky-Golay filtered column.
    * ``"<type> above 0"``: clipped column shifted so its minimum is 0.
    * ``"<name><type>"``: shifted column multiplied by the sign of the
      period-on-period change of the filtered signal.

    Parameters
    ----------
    out_col : iterable of str
        Workbook base names to process.
    dir_model_country : key into the module-level ``dirs_excel`` mapping.
    windowlength, polyorder : int
        Passed to ``scipy.signal.savgol_filter``.
    types : iterable of str
        Columns of the workbook to process.
    save_xlsx : bool
        When true, write the frame to ``<path>_hardcoded.xlsx``.
    _auto_filter_date : passed to ``dateFilter`` as ``auto``.
    re_turn_graph : bool
        When true, plot the intermediate series with plotly.

    Returns
    -------
    None

    NOTE(review): the original indentation of this function was lost; the
    save and plot steps are placed inside the per-type loop because the
    plot uses the per-type series. Confirm against the intended nesting.
    """
    for _out_col in out_col:
        path = dirs_excel[dir_model_country] + _out_col
        data = pd.read_excel(path + '.xlsx', sheet_name='Data')
        data = dateFilter(data, auto=_auto_filter_date)

        for _type in types:
            inlier_col = _type + " inliers"
            clipped_col = _type + " ex. outliers"
            shifted_col = _type + " above 0"

            # Copy of the original data excluding outliers. The
            # contamination below corresponds to 40 expected outliers.
            # NOTE(review): the original comment mentioned "at most 12";
            # confirm which figure is intended.
            _contamination = (40 / data[_type].shape[0])
            outlier_clf = neighbors.LocalOutlierFactor(
                n_neighbors=20, contamination=_contamination, n_jobs=1)
            # .values replaces Series.as_matrix(), removed in pandas 1.0.
            data[inlier_col] = outlier_clf.fit_predict(
                data.loc[:, _type].values.reshape(-1, 1))

            inliers = data[(data[inlier_col] == 1)][_type]
            inlier_max = inliers.max()
            inlier_min = inliers.min()
            absolute_inlier_max = np.maximum(np.absolute(inlier_max),
                                             np.absolute(inlier_min))
            # Clip magnitudes to the largest inlier magnitude, keeping sign.
            data[clipped_col] = [
                (np.sign(x) * absolute_inlier_max)
                if np.absolute(x) > absolute_inlier_max else x
                for x in data[_type]
            ]

            # Savgol filter and its period-on-period change
            savgol_signal = pd.DataFrame(
                signal.savgol_filter(data.loc[:, _type],
                                     window_length=windowlength,
                                     polyorder=polyorder))
            data["{}_Savgol_Filtered".format(_type)] = savgol_signal
            signal_change_per_period = savgol_signal.diff()
            signal_change_per_period_sign = [
                1 if val > 0 else -1 if val < 0 else 0
                for val in signal_change_per_period.iloc[:, 0]
            ]

            # AdeSavGol output
            data[shifted_col] = data[clipped_col] - (data[clipped_col].min())
            data[_out_col + _type] = (data[shifted_col] *
                                      (signal_change_per_period_sign))

            if save_xlsx:
                # The context manager saves and closes the workbook; the
                # writer was previously left open and never saved.
                with pd.ExcelWriter(path + '_hardcoded.xlsx') as writer:
                    data.to_excel(writer, sheet_name='Data', index=False)

            if re_turn_graph:
                dates = np.asarray(data['Date'])
                trace_set = [
                    go.Scatter(x=dates,
                               y=data[_type].values.flatten(),
                               name="Original Data"),
                    go.Scatter(x=dates,
                               y=savgol_signal.values.flatten(),
                               name="Savgol Filter"),
                    go.Scatter(
                        x=dates,
                        y=np.asarray(signal_change_per_period_sign).flatten(),
                        name="Savgol Filter Change"),
                    go.Scatter(x=dates,
                               y=data[clipped_col].values.flatten(),
                               name="L^ex-out"),
                    go.Scatter(x=dates,
                               y=data[shifted_col].values.flatten(),
                               name="L^S2"),
                    go.Scatter(x=dates,
                               y=data[_out_col + _type].values.flatten(),
                               name='L_asg'),
                ]
                layout = go.Layout(title=_out_col + _type)
                fig = go.Figure(data=trace_set, layout=layout)
                py.offline.iplot(fig,
                                 image='png',
                                 filename="{}-{}".format(_out_col, _type))
def test_contamination():
    """Fitting with contamination=0.6 must raise ValueError."""
    samples = [[1, 1], [1, 0]]
    estimator = neighbors.LocalOutlierFactor(contamination=0.6)
    with pytest.raises(ValueError):
        estimator.fit(samples)
def test_novelty_true_common_tests():
    """Run the common estimator checks on LOF with novelty=True.

    The common tests are otherwise only run for the default LOF
    (novelty=False).
    """
    novelty_estimator = neighbors.LocalOutlierFactor(novelty=True)
    check_estimator(novelty_estimator)
clf.fit(X) assert hasattr(clf, "predict") assert hasattr(clf, "decision_function") assert hasattr(clf, "score_samples") assert not hasattr(clf, "fit_predict") # when novelty=False clf = neighbors.LocalOutlierFactor(novelty=False) clf.fit(X) assert hasattr(clf, "fit_predict") assert not hasattr(clf, "predict") assert not hasattr(clf, "decision_function") assert not hasattr(clf, "score_samples") @parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)]) def test_novelty_true_common_tests(estimator, check): # the common tests are run for the default LOF (novelty=False). # here we run these common tests for LOF when novelty=True check(estimator) @pytest.mark.parametrize("expected_outliers", [30, 53]) def test_predicted_outlier_number(expected_outliers): # the number of predicted outliers should be equal to the number of # expected outliers unless there are ties in the abnormality scores. X = iris.data n_samples = X.shape[0] contamination = float(expected_outliers) / n_samples clf = neighbors.LocalOutlierFactor(contamination=contamination)
# mostra gráfico com a variação no valor de cada gene(para ver o que o filtro flat pattern irá fazer) plt.bar(np.arange(len(variancias)), variancias, width=30) plt.title("Variância em cada gene") plt.xlabel('Genes') plt.ylabel("Variância") plt.show() # Filtro flat pattern (retira genes com pouca variabilidade) model_flat = VarianceThreshold(threshold=var_media * 2) input_filtrado1 = model_flat.fit_transform(input_data) print("Tamanho inicial: ", input_data.shape, "\nTamanho depois do filtro flat pattern: ", input_filtrado1.shape) # Remoçao de anomalias(pontos que se encontram muito fora do normal) outlier_model = neighbors.LocalOutlierFactor(n_neighbors=20, contamination=0.1) remover = outlier_model.fit_predict(input_filtrado1.transpose()) input_filtrado2 = np.delete(input_filtrado1, remover, axis=1) print("Tamanho depois do filtro de outliers: ", input_filtrado2.shape) # Normalização dos dados scaled_input = preprocessing.scale(input_filtrado2) print("Média: ", scaled_input.mean()) print("Desvio padrão: ", scaled_input.std()) labels_doenca = meta.values[:, 1] # insulin sensible, insulin resistant, diabetic labels_tratamento = meta.values[:, 2] # insulina vs nao tratados print( '\n\n---------------------------- Análise Estatisticos Multivariada ----------------------------------------\n'
def test_contamination():
    """Fitting with contamination=0.6 must raise ValueError."""
    samples = [[1, 1], [1, 0]]
    estimator = neighbors.LocalOutlierFactor(contamination=0.6)
    assert_raises(ValueError, estimator.fit, samples)
def test_contamination_future_warning():
    """Fitting with the default contamination emits a FutureWarning."""
    samples = [[1, 1], [1, 2], [2, 1]]
    expected = ('default contamination parameter 0.1 will change '
                'in version 0.22 to "auto"')
    estimator = neighbors.LocalOutlierFactor()
    assert_warns_message(FutureWarning, expected, estimator.fit, samples)
def LocalOutlierFactorOutlier(data,
                              margin=0,
                              n_neighbors=20,
                              algorithm='auto',
                              leaf_size=30,
                              metric='minkowski',
                              p=2,
                              metric_params=None,
                              contamination='auto',
                              novelty=False,
                              n_jobs=None):
    """Return the data points whose LOF score falls outside the IQR range.

    A LocalOutlierFactor model is fitted on ``data``; the negated
    ``negative_outlier_factor_`` values are the scores. The bounds come
    from ``iqr_threshold_method(scores, margin)`` and every ``data[i]``
    whose score is below the lower or above the upper bound is returned.

    Parameters
    ----------
    data : indexable collection of samples
        Data to fit on. Elements are read back as ``data[i]``.

    margin : default=0
        Forwarded unchanged to ``iqr_threshold_method``.

    n_neighbors : int, default=20
        Number of neighbors to use by default for :meth:`kneighbors`
        queries. If n_neighbors is larger than the number of samples
        provided, all samples will be used.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of this
        parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
        affect the speed of the construction and query, as well as the
        memory required to store the tree. The optimal value depends on
        the nature of the problem.

    metric : str or callable, default='minkowski'
        Metric used for the distance computation. Any metric from
        scikit-learn or scipy.spatial.distance can be used.

        If metric is "precomputed", X is assumed to be a distance matrix
        and must be square. X may be a sparse matrix, in which case only
        "nonzero" elements may be considered neighbors.

        If metric is a callable function, it is called on each pair of
        instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating
        the distance between them. This works for Scipy's metrics, but is
        less efficient than passing the metric name as a string.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1',
          'l2', 'manhattan']
        - from scipy.spatial.distance: ['braycurtis', 'canberra',
          'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
          'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto',
          'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
          'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details on
        these metrics:
        https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    p : int, default=2
        Parameter for the Minkowski metric from
        :func:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1,
        this is equivalent to using manhattan_distance (l1), and
        euclidean_distance (l2) for p = 2. For arbitrary p,
        minkowski_distance (l_p) is used.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    contamination : 'auto' or float, default='auto'
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. When fitting this is used to define
        the threshold on the scores of the samples.

        - if 'auto', the threshold is determined as in the original paper,
        - if a float, the contamination should be in the range [0, 0.5].

        .. versionchanged:: 0.22
           The default value of ``contamination`` changed from 0.1 to
           ``'auto'``.

    novelty : bool, default=False
        By default, LocalOutlierFactor is only meant to be used for
        outlier detection (novelty=False). Set novelty to True if you want
        to use LocalOutlierFactor for novelty detection. In this case be
        aware that you should only use predict, decision_function and
        score_samples on new unseen data and not on the training set.

        .. versionadded:: 0.20

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
        context. ``-1`` means using all processors. See
        :term:`Glossary <n_jobs>` for more details.

    Returns
    -------
    list
        The elements ``data[i]`` flagged as outliers, in index order.
    """
    lof = neighbors.LocalOutlierFactor(n_neighbors=n_neighbors,
                                       algorithm=algorithm,
                                       leaf_size=leaf_size,
                                       metric=metric,
                                       p=p,
                                       metric_params=metric_params,
                                       contamination=contamination,
                                       novelty=novelty,
                                       n_jobs=n_jobs)
    lof.fit(data)

    # Negate so that larger scores mean more abnormal samples.
    scores = list(-lof.negative_outlier_factor_)
    lower_range, upper_range = iqr_threshold_method(scores, margin)

    # Index into data (rather than iterating it) so that the element
    # lookup is exactly data[i] for whatever container type was passed.
    return [
        data[i] for i, score in enumerate(scores)
        if score < lower_range or score > upper_range
    ]
def __init__(self, params, k):
    """Store the parameter grids and fold value; create a default model.

    ``params`` must provide the keys "distance_metrics" and 'n_neighbors'.
    ``k`` is stored as the fold value.
    """
    distance_metrics = params["distance_metrics"]
    neighbor_counts = params['n_neighbors']

    self.__distMetricsList = distance_metrics
    self.__n_neighbors_list = neighbor_counts
    # Default estimator, held until a new one is assigned.
    self.__model = neighbors.LocalOutlierFactor()
    self.__fold_val = k
# output vector y_redox = df.loc[:, out[0]].values y_pka = df.loc[:, out[1]].values y_lnk = df.loc[:, out[2]].values y_homo = df.loc[:, out[3]].values y_tot = df.loc[:, out].values y_extended = df.loc[:, out_extended].values # one-hot x.astype("float64") y_extended.astype("float64") full_table = np.concatenate((x, y_extended), axis=1).astype("float64") n_samples = 157 outliers_fraction = 0.1 clusters_separation = [0, 1, 2] clf = neighbors.LocalOutlierFactor(novelty=True) clf.fit(full_table) # Use this to compare with other descriptors we gen # -erate later. This table gives what may be outliers # We could compare with chemical distance via other # metrics print(clf.negative_outlier_factor_ > -1.5) # regress via multivariate linear, # bayesian, gd, huberregessor(applies linear loss to outliers) # knn regressor # NN
# Sweep the number of PCA components and score LOF labels against Yall.
# NOTE(review): the original indentation was lost and this fragment is cut
# off at the end; the nesting below is reconstructed and should be checked.
# NOTE(review): xrange and DataFrame.ix are Python 2 / legacy pandas APIs.
n_neighbors = 20
fscores = []
accs = []
for z in xrange(0, 1):
    # One CSV log per run; only the header row is written here.
    logfile = directory + "log-" + str(z) + ".csv"
    with open(logfile, "w") as file:
        file.write("test,PCALevel,acc,val_acc,f1\n")
    for x in xrange(1, 71):
        # Project every column of dftrain except the first onto x components.
        pca = PCA(n_components=x)
        Xall = pca.fit_transform(dftrain.ix[:, 1:dftrain.shape[1]].values)
        clf = neighbors.LocalOutlierFactor(n_neighbors=n_neighbors,
                                           algorithm='auto',
                                           leaf_size=30,
                                           metric='minkowski',
                                           p=2,
                                           metric_params=None,
                                           contamination=cont,
                                           n_jobs=4)
        testPred = clf.fit_predict(Xall)
        print(len(Xall))
        # Count predictions that agree with the labels: 1 with "Normal",
        # -1 with "Malicious".
        score = 0.0
        for i in xrange(0, len(Xall)):
            if (testPred[i] == 1
                    and Yall[i] == "Normal") or (testPred[i] == -1 and
                                                 Yall[i] == "Malicious"):
                score += 1
        testAcc = float(score) / len(Yall)
        preds = testPred