from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM


def outliers(X_train, X_test=None, features=None, name=''):
    """Add one outlier-flag column per detector (-1 = outlier, 1 = inlier)."""
    detectors = [IsolationForest, EllipticEnvelope, OneClassSVM,
                 LocalOutlierFactor]
    for i, detector_cls in enumerate(detectors):
        # Instantiate the detector for this iteration (the original code
        # re-instantiated IsolationForest on every pass, ignoring the loop).
        if detector_cls is OneClassSVM:
            model = OneClassSVM(nu=0.1)  # OneClassSVM takes nu, not contamination
        elif detector_cls is LocalOutlierFactor:
            # novelty=True is required for LocalOutlierFactor to predict on X_test
            model = LocalOutlierFactor(contamination=0.1,
                                       novelty=(X_test is not None))
        else:
            model = detector_cls(contamination=0.1)
        train_view = X_train[features] if features else X_train
        if isinstance(model, LocalOutlierFactor) and model.novelty:
            # fit_predict is unavailable in novelty mode, so fit then predict
            model.fit(train_view)
            pred = model.predict(train_view)
        else:
            pred = model.fit_predict(train_view)
        X_train[f"{name}_outlier_detection_{i}"] = pred
        if X_test is not None:
            test_view = X_test[features] if features else X_test
            X_test[f"{name}_outlier_detection_{i}"] = model.predict(test_view)
    if X_test is not None:
        return X_train, X_test
    return X_train
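# A minimal usage sketch for outliers() above, on synthetic data; the column
# names and shapes here are illustrative assumptions, not from the original.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_tr = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
X_te = pd.DataFrame(rng.normal(size=(20, 3)), columns=['a', 'b', 'c'])
X_tr, X_te = outliers(X_tr, X_te, features=['a', 'b'], name='demo')
# Each detector contributes a demo_outlier_detection_<i> column of +/-1 flags
print(X_tr.filter(like='demo_outlier_detection').head())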
def detect_outliers_IF(df, n_estimators=100):
    '''
    Returns the outlier scores using IsolationForest.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric feature matrix to score.
    '''
    clf = IsolationForest(n_estimators=n_estimators, contamination=0.1,
                          random_state=123)
    clf.fit(df)  # the labels from fit_predict were discarded, so fit suffices
    scores = clf.score_samples(df)
    # dec_func = clf.decision_function(df_imputed)
    return scores
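# Usage sketch for detect_outliers_IF, assuming random demo data: lower
# score_samples values mean more anomalous.
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

demo_df = pd.DataFrame(np.random.RandomState(42).normal(size=(200, 4)))
demo_scores = detect_outliers_IF(demo_df, n_estimators=50)
print('most anomalous row:', int(np.argmin(demo_scores)))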
def generateData(filename_train, filename_test):
    data = pd.read_csv(filename_train, header=None)
    test_data = pd.read_csv(filename_test, header=None)
    train_data = data.iloc[:, :-2]
    test_data = test_data.iloc[:, :-1]

    # Normalize both splits with the same scaler fitted on the training data
    # (the original fitted a separate StandardScaler on the test split, which
    # leaks test statistics and makes the two splits incomparable).
    scaler = MinMaxScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    rng = np.random.RandomState(10)
    clf = IsolationForest(n_estimators=200, max_samples=200, max_features=5,
                          random_state=rng, contamination='auto')
    clf.fit(train_data)  # fit on the training data
    # predict (not fit_predict), so the forest trained above is actually used
    pre_label = clf.predict(test_data)
    print(pre_label)

    count = 0
    index_numbers = []
    for index, i in enumerate(pre_label):
        if i == -1:
            count += 1
            index_numbers.append(index)
    print(index_numbers)
    print('Number of negative (outlier) samples:', count)
    return index_numbers
def calculateKNNgraphDistanceMatrixML(featureMatrix, distanceType='euclidean',
                                      k=10, param=None):
    r"""
    Thresholdgraph: KNN graph with machine-learning-based methods.

    IsolationForest:
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html#sklearn.ensemble.IsolationForest
    """
    distMat = distance.cdist(featureMatrix, featureMatrix, distanceType)
    edgeList = []

    # n_jobs=-1 uses all processors
    clf = IsolationForest(contamination='auto', n_jobs=-1)

    for i in np.arange(distMat.shape[0]):
        res = distMat[i, :].argsort()[:k + 1]
        # Refit the forest on each neighborhood; preds[j] == -1 marks an
        # outlier, 1 (inlier) is what we want, so outlier edges get weight 0
        preds = clf.fit_predict(featureMatrix[res, :])
        for j in np.arange(1, k + 1):
            weight = 0.0 if preds[j] == -1 else 1.0
            edgeList.append((i, res[j], weight))

    return edgeList
def variance_contrast(X, k=3, contamination=0.01):
    X = StandardScaler().fit_transform(X)
    pca = PCA(n_components=None, random_state=2018)
    pca.fit(X)
    # variance_original: the eigenvalue of each principal component, i.e. the
    # variance of the samples projected into principal-component space
    variance_original = pca.explained_variance_

    # Run anomaly detection with an isolation forest to obtain the indices of
    # the anomalous samples (anomaly_indices)
    iforest = IsolationForest(contamination=contamination, random_state=2018,
                              n_jobs=-1)
    anomaly_pred = iforest.fit_predict(X)
    anomaly_indices = np.argwhere(anomaly_pred == -1).ravel()

    # Drop the anomalous samples to obtain the matrix X_trimmed
    X_trimmed = X[np.isin(range(len(X)), anomaly_indices, invert=True)]

    # Run PCA on X_trimmed to obtain the revised eigenvalues variance_revised
    pca.fit(X_trimmed)
    variance_revised = pca.explained_variance_

    # Compare the eigenvalues before and after removing the anomalies
    delta_ratio = (variance_revised - variance_original) / variance_original
    # Consider only the negative entries of delta_ratio, i.e. the components
    # whose eigenvalues decreased; keep their positions in the full array so
    # they stay comparable to the component indices used below (the original
    # argsorted the compressed array, which mixed up the index spaces)
    negative_indices = np.where(delta_ratio < 0)[0]
    # k is a preset parameter: take the k components with the largest decrease
    # (slicing is safe even when fewer than k eigenvalues decreased)
    order = np.argsort(delta_ratio[negative_indices])
    indices_desc_topk = negative_indices[order[:k]]

    # min_max_idx holds the indices corresponding to the largest and smallest
    # eigenvalues (explained_variance_ is sorted in descending order)
    min_max_idx = [0, X.shape[1] - 1]
    # Check whether either index in min_max_idx appears in indices_desc_topk
    bool_result = any(np.isin(min_max_idx, indices_desc_topk))
    return indices_desc_topk, bool_result
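# A small sketch of variance_contrast on synthetic data, assuming numpy and
# the scikit-learn imports used by the function above; shapes are illustrative.
import numpy as np

X_demo = np.random.RandomState(7).normal(size=(300, 10))
topk_idx, touches_extremes = variance_contrast(X_demo, k=3, contamination=0.01)
# topk_idx: components whose eigenvalues dropped most after outlier removal;
# touches_extremes: whether the first or last component is among them
print(topk_idx, touches_extremes)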
def model_iF(data):
    from sklearn.ensemble import IsolationForest
    iF = IsolationForest(random_state=0)
    y_pred = iF.fit_predict(data["X_test"])
    return y_pred
class ISF(object):
    def __init__(self, file_name, config):
        self.dataset = config.dataset
        self.file_name = file_name
        self.x_dim = config.x_dim
        self.n_estimators = config.n_estimators
        self.max_samples = config.max_samples
        self.bootstrap = config.bootstrap
        self.max_features = config.max_features
        self.contamination = config.contamination
        self.pid = config.pid
        self.model = IsolationForest(n_estimators=self.n_estimators,
                                     max_samples=self.max_samples,
                                     bootstrap=self.bootstrap,
                                     max_features=self.max_features,
                                     contamination=self.contamination)

    def fit(self, train_input, train_label, test_input, test_label):
        y_pred = self.model.fit_predict(train_input)
        decision_function = self.model.decision_function(train_input)
        isf_output = ISFOutput(y_hat=y_pred,
                               decision_function=decision_function)
        return isf_output
def isolation_criterion(self):
    clf = IsolationForest(max_samples=100, random_state=1,
                          contamination="auto")
    preds = clf.fit_predict(self.p_table)
    return preds
def scan(leagues, positions, transfer_fee, wage, age):
    df = load_data()
    df = df[df["age"] <= age]
    if all_name not in leagues:
        df = df[df["league"].isin(leagues)]
    df = df[(df["Value"] <= transfer_fee) & (df["Wage"] <= wage)]
    df["filter_positions"] = df.apply(
        lambda row: filter_positions(row, positions), axis=1)
    search_space = df.loc[df["filter_positions"] == True]
    search_space.reset_index(drop=True, inplace=True)

    # Find outliers here: fit_predict returns -1 for outliers, 1 for inliers
    X = search_space[possible_columns_to_compare].to_numpy()
    clf = IsolationForest(random_state=42, n_jobs=-1)
    search_space["label"] = pd.Series(list(clf.fit_predict(X)))
    # The anomaly score of the input samples: the lower, the more anomalous
    search_space["score"] = pd.Series(list(clf.score_samples(X)))
    search_space.sort_values(by=["score"], inplace=True)
    return search_space
def addSeg(self, seg1, seg2, N):
    # Merge the two oversegments into a new segment labelled N
    self.overSeg[np.logical_or(self.overSeg == seg1,
                               self.overSeg == seg2)] = N
    temp = self.overSeg == N
    self.segL[N] = np.mean(self.lab[:, :, 0][temp])
    self.segA[N] = np.mean(self.lab[:, :, 1][temp])
    self.segB[N] = np.mean(self.lab[:, :, 2][temp])
    for cnnRatioIdx, cnnRes in enumerate(self.allRatioCnnRes):
        self.clfList[N][cnnRatioIdx] = []
        self.segVectors[N][cnnRatioIdx] = []
        self.segClustersL2[N][cnnRatioIdx] = []
        for layerNum, outLayer in enumerate(cnnRes):
            v = outLayer[temp]
            if self.ifFilter:
                # Filter out anomalous feature vectors before averaging
                clf = IsolationForest(
                    max_samples=max(4, v.shape[0] // self.clfSplit),
                    n_estimators=self.n_trees, random_state=0,
                    contamination='auto', n_jobs=1)
                v_new = v[clf.fit_predict(v) == 1, :]
                if v_new.shape[0] != 0:
                    v = v_new
            if v.shape[0] > self.maxSize:
                np.random.seed(0)
                v = v[np.random.randint(v.shape[0], size=self.maxSize)]
            self.segVectors[N][cnnRatioIdx].append(v)
            cl = np.mean(v, axis=0)
            self.segClustersL2[N][cnnRatioIdx].append(cl)
def main():
    data = pd.read_csv("data.csv")
    for idx, row in data.iterrows():
        data.at[idx, 'ML'] = wordToNumber(row['ML'])
        data.at[idx, 'DW'] = wordToNumber(row['DW'])
    mldw_data = data.loc[:, 'ML':].values

    clf = IsolationForest(max_samples=28, random_state=1,
                          contamination='auto')
    preds = clf.fit_predict(mldw_data)

    # Collect the (1-based) positions of the predicted outliers
    outliers_idx = []
    for i, predict in enumerate(preds, start=1):
        if predict == -1:
            outliers_idx.append(i)

    # Plot outliers in red (annotated with their names), inliers in black
    for i in range(len(preds)):
        if i + 1 in outliers_idx:
            plt.scatter(mldw_data[i][0], mldw_data[i][1], c="red")
            plt.annotate(data['NAMA'][i], (mldw_data[i][0], mldw_data[i][1]))
        else:
            plt.scatter(mldw_data[i][0], mldw_data[i][1], c="black")
    plt.show()
def remove_outliers(train_x, train_y):
    iso = IsolationForest(contamination='auto')
    yhat = iso.fit_predict(train_x)
    # yhat is +/-1, so count the -1 labels rather than summing the array
    print("Removed " + str((yhat == -1).sum()) + " outliers")
    mask = yhat != -1
    train_x, train_y = train_x[mask, :], train_y[mask]
    return train_x, train_y
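# Usage sketch for remove_outliers above: train_x must be a 2-D numpy array
# because of the train_x[mask, :] indexing; the demo data is illustrative.
import numpy as np
from sklearn.ensemble import IsolationForest

X_demo = np.vstack([np.random.RandomState(0).normal(size=(95, 2)),
                    np.full((5, 2), 8.0)])  # five obvious outliers
y_demo = np.arange(len(X_demo))
X_clean, y_clean = remove_outliers(X_demo, y_demo)
print(X_clean.shape, len(y_clean))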
def isof(x, y):
    """This will be our function used to resample our dataset."""
    print('Initiating Outlier detection')
    model = IsolationForest()
    y_pred = model.fit_predict(x)
    print('isof: Outliers removed', x[y_pred == -1].shape[0])
    return x[y_pred == 1], y[y_pred == 1]
def isolation_forest_outlier_removal(X, y, seed, n_estimators=150,
                                     max_samples=0.8, max_features=0.8,
                                     contamination="auto"):
    clf = IsolationForest(n_estimators=n_estimators, max_samples=max_samples,
                          contamination=contamination,
                          max_features=max_features, random_state=seed,
                          n_jobs=-1)
    results = clf.fit_predict(X.values)
    removing_indices = [i for i in range(len(results)) if results[i] == -1]
    X_train_new = X.drop(X.index[removing_indices])
    y_train_new = [y[yi] for yi in range(len(y)) if results[yi] == 1]
    return X_train_new, y_train_new
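# Hedged sketch: isolation_forest_outlier_removal expects X as a DataFrame
# (it calls X.values and X.drop) and y as an indexable sequence of labels.
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

X_demo = pd.DataFrame(np.random.RandomState(1).normal(size=(100, 3)))
y_demo = list(range(100))
X_clean, y_clean = isolation_forest_outlier_removal(X_demo, y_demo, seed=1)
print(len(X_demo) - len(X_clean), 'rows removed')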
def dixon():
    try:
        data = np.array(request.json["Data"])
        params = request.json['Params']
        n_estimators = 100
        max_samples = "auto"
        contamination = "auto"
        if "n_estimators" in params:
            n_estimators = params["n_estimators"]
        if "max_samples" in params:
            max_samples = params["max_samples"]
        if "contamination" in params:
            contamination = params["contamination"]
        clf = IsolationForest(n_estimators=n_estimators,
                              max_samples=max_samples,
                              contamination=contamination)
        indices = clf.fit_predict(data)
        # Map the +/-1 labels to 0 (inlier) / 1 (outlier)
        indices = [0 if x == 1 else 1 for x in indices.tolist()]
        # Use .get so a missing key doesn't raise inside the handler
        if params.get('ReturnDoubles'):
            indices = -clf.score_samples(data)
            indices = indices.tolist()
        return jsonify({"message": "OK", "data": indices})
    except Exception as e:
        return jsonify({"message": str(e)}), 400
def grid_search(self):
    print("============ Starting GaussianNB grid search ============")
    best_accuracy = 0
    best_var_smoothing = None
    best_contamination = None
    for var_smoothing_i in np.linspace(0.001, 1, 5):
        self.var_smoothing_i = var_smoothing_i
        # IsolationForest requires contamination in (0, 0.5], so the grid
        # must start above zero (the original started at 0, which raises)
        for contamination_i in np.linspace(0.01, 0.5, 20):
            iso = IsolationForest(contamination=contamination_i)
            yhat = iso.fit_predict(self.original_x_train)
            mask = yhat != -1
            self.x_train, self.t_train = (self.original_x_train[mask, :],
                                          self.original_t_train[mask])
            self.classifier = GaussianNB(var_smoothing=var_smoothing_i)
            mean_cross_validation_accuracy = self.cross_validation()
            if mean_cross_validation_accuracy == 100:
                print("All train data was correctly classified during "
                      "cross-validation!")
            if mean_cross_validation_accuracy > best_accuracy:
                best_accuracy = mean_cross_validation_accuracy
                best_var_smoothing = var_smoothing_i
                best_contamination = contamination_i
    print("Grid search final hyper-parameters:\n"
          "  best_var_smoothing=", best_var_smoothing, "\n"
          "  best_contamination=", best_contamination)
    return best_var_smoothing, best_contamination
def outlier_smoothing(X, contamination=0.15, smoothing_window=4, plot=True,
                      random_state=22, verbose=True):
    """
    Outlier identification by IForest and smoothing by rolling-window
    median value.
    """
    X_rolling_median = X.rolling(smoothing_window).median()
    X_rolling_mean = X.rolling(smoothing_window).mean()
    X_smoothing_ratio = X / X_rolling_median

    if plot:
        plt.figure(figsize=(10, 10))
        plt.plot(X.index, X, label='original')
        plt.plot(X.index, X_rolling_median, label='rolling median')
        plt.title("Original vs. Rolling Median")
        plt.legend()
        plt.show()

        plt.figure(figsize=(10, 10))
        plt.plot(X.index, X_smoothing_ratio, label="original:smoothing ratio")
        plt.title("Smoothing Ratio")
        plt.legend()
        plt.show()

    ## Find the outliers (the first window is skipped: its ratio is NaN)
    iso_forest = IsolationForest(contamination=contamination,
                                 random_state=random_state)
    peaks = np.where(iso_forest.fit_predict(
        X_smoothing_ratio[smoothing_window - 1:].values.reshape(-1, 1)) < 1)

    if verbose:
        print("Outliers found at ", X.index[peaks[0] + smoothing_window - 1])

    if plot:
        plt.figure(figsize=(10, 10))
        plt.plot(X.index, X, label='original')
        plt.plot(X.index.values[peaks[0] + smoothing_window - 1],
                 X.values[peaks[0] + smoothing_window - 1], 'x')
        plt.title("Outlier Finders")
        plt.legend()
        plt.show()

    ## Replace the outliers with the corresponding smoothed values
    X_smoothed = X.copy()
    for i in range(len(X)):
        if np.any(peaks[0] + smoothing_window - 1 == i):
            X_smoothed[i] = X_rolling_mean[i]

    if plot:
        plt.figure(figsize=(10, 10))
        plt.plot(X.index, X, label='original')
        plt.plot(X.index, X_smoothed, label='smoothed')
        plt.title("Original vs. smoothed")
        plt.legend()
        plt.show()

    return X_smoothed
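# Sketch of outlier_smoothing on a synthetic series (plotting disabled); the
# pandas Series with two injected spikes is an illustrative stand-in.
import numpy as np
import pandas as pd

ts = pd.Series(np.sin(np.linspace(0, 12, 120)) + 5.0)
ts.iloc[[30, 80]] = 50.0  # inject two spikes for the forest to find
smoothed = outlier_smoothing(ts, contamination=0.05, plot=False)
print(smoothed.iloc[[30, 80]])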
def remove_outliers(df: pd.DataFrame) -> pd.DataFrame:
    # Identify and remove outliers from the dataframe; the last column is
    # assumed to be the target and is excluded from fitting
    iso = IsolationForest(contamination=0.05, random_state=RANDOM_STATE)
    predict = iso.fit_predict(df.iloc[:, 0:-1])
    mask = predict != -1
    return df.iloc[mask]
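# Usage sketch, assuming RANDOM_STATE is defined at module level as in the
# snippet above (the constant below is a stand-in, not from the original).
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

RANDOM_STATE = 42  # stand-in for the module-level constant
df_demo = pd.DataFrame(np.random.RandomState(3).normal(size=(200, 4)),
                       columns=['f1', 'f2', 'f3', 'target'])
print(len(remove_outliers(df_demo)), 'rows kept of', len(df_demo))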
def if_model_v2(self, col, n_estimators):
    x = self.prepare_date(col)
    shape = self.data.shape[0]
    target_mapper = x.shape[0] / shape
    # Pick a contamination level from the share of rows that survived
    # preparation (cast to float: np.where returns a 0-d array)
    cont = float(np.where(
        target_mapper < 0.1, 0.1,
        np.where(target_mapper < 0.5, 0.07,
                 np.where(target_mapper < 0.7, 0.06, 0.05))))
    clf = IsolationForest(n_estimators=n_estimators, n_jobs=-1,
                          contamination=cont, max_samples=0.8)
    x = x.iloc[:, :1]
    x['t'] = np.array(range(x.shape[0])) + 1
    print(x.shape)
    # fit_predict both fits and predicts; the original separate fit() call
    # trained the forest twice for no benefit
    predicted = clf.fit_predict(x)
    x['outlier'] = np.where(predicted == -1, True, False)
    x.reset_index(inplace=True)
    x['t'] = x.reset_index().iloc[:, 0]
    x.rename(columns={col: 'value'}, inplace=True)
    x['Series'] = col
    print('if', cont)
    return x
def detect_outliers(self, features: pd.DataFrame) -> pd.DataFrame:
    """
    Find outliers in :param features: for bt_scooter.

    :param features: features before outlier removal
    :return: features after removing outliers
    """
    # Identify outliers in the training dataset of bt scooter
    print('    Find outliers in :param features: (based on '
          ':param features_columns:) for bt_scooter')
    stat_list = []
    # Iteratively identify outliers: each pass removes 50% of the
    # remaining records (contamination=0.5)
    for i in [1, 2, 3]:
        print(i)
        X_train = features[self.features_columns]
        stat_list.append(list(X_train.mean()))
        stat_list.append(list(X_train.std()))
        iso = IsolationForest(contamination=0.5)
        features['isoutlayer'] = iso.fit_predict(X_train)
        features = features[features['isoutlayer'] == 1]
        features = features.drop(columns='isoutlayer', axis=1)
    # '/' instead of a raw backslash keeps the path portable
    pd.DataFrame(stat_list, columns=self.features_columns).to_csv(
        self.output_folder + '/stat_outliers.csv')
    return features
def detectWithIsolationForest(self):
    '''
    Apply the Isolation Forest.
    '''
    # Find model hyperparameters
    hpMap = self.config['AnomalyDetector']['IsolationForestHyperparameters']

    # Get the thresholded response
    yStar, idxList = self.removeCommonData(hpMap['spreadStatistic'],
                                           hpMap['threshold'])
    yStar = [[elem] for elem in yStar]

    # Instantiate the Isolation Forest
    ISO = IsolationForest(n_estimators=hpMap['numEstimators'],
                          bootstrap=hpMap['bootstrap'])

    # Fit and predict with the Isolation Forest
    predictions = ISO.fit_predict(yStar)
    scores = ISO.decision_function(yStar)

    # Report the lon/lat points corresponding to the anomalies in order of
    # decreasing anomaly score (i.e., the most anomalous points are shown
    # first). Pair each anomaly's score with its index explicitly: the
    # original dict comprehension referenced a loop variable from a previous
    # comprehension, which is a NameError in Python 3.
    anomalyScoreIdxPairs = [(scores[i], idxList[i])
                            for i in range(len(yStar)) if predictions[i] == -1]
    anomalyLonLatMap = {score: (self.M['longitude'][idx],
                                self.M['latitude'][idx])
                        for score, idx in anomalyScoreIdxPairs}
    sortedScores = sorted(scores)
    anomaliesLonLatSorted = [anomalyLonLatMap[sortedScores[i]]
                             for i in range(len(sortedScores))
                             if sortedScores[i] in anomalyLonLatMap]
    return anomaliesLonLatSorted
def workIso(modelDir, inputDict):
    samples = float(inputDict['samples'])
    estimators = int(inputDict['estimators'])
    contamination = float(inputDict['contaminationIso'])
    if contamination == 0:
        contamination = 'auto'
    clf = IsolationForest(max_samples=samples, n_estimators=estimators,
                          contamination=contamination, random_state=42)

    # Load our csv to df
    f = StringIO(inputDict["file"])
    df = pd.read_csv(f)
    datapoints = df.to_numpy()

    # Scale each column by its maximum value
    maxVals = np.max(datapoints, axis=0)
    maxVals = np.tile(maxVals, (datapoints.shape[0], 1))
    normalizedDatapoints = np.divide(datapoints, maxVals)

    labels = clf.fit_predict(normalizedDatapoints)
    scores = clf.score_samples(normalizedDatapoints)

    plotData = []
    x = np.arange(0, datapoints.shape[0])
    data = go.Scatter(x=x, y=datapoints[:, 0], name='data',
                      mode='lines+markers')
    plotData.append(data)
    outliers = go.Scatter(x=x[labels != 1], y=datapoints[labels != 1, 0],
                          name='outliers', mode='markers')
    plotData.append(outliers)
    return plotData
def remove_outliers(self):
    clf = IsolationForest(n_estimators=20, contamination=0.1)
    pred = clf.fit_predict(self.df.loc[:, self.xcols + self.ycols])
    self.df = self.df[pred != -1]
    self.df = self.df.reset_index(drop=True)
def check_outlier_iforest(
        x: pd.DataFrame,
        cols=['col1', 'col2', 'col3', 'col4']) -> pd.DataFrame:
    """
    Check outliers with isolation forest, not including nulls.
    """
    from sklearn.ensemble import IsolationForest
    intersection_cols = list(x.columns.intersection(cols))
    df = pd.DataFrame(columns=['outlier_percent'])
    for col in intersection_cols:
        temp = x[[col]].dropna()
        clf = IsolationForest()
        preds = clf.fit_predict(temp[col].values.reshape(-1, 1))
        percent = len(preds[preds < 0]) / len(preds)
        df.loc[col, 'outlier_percent'] = percent
    if len(df) == 0:
        print('Outlier warning: none of the requested columns are present.')
    return df.sort_values(ascending=False, by=['outlier_percent'])
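# Usage sketch for check_outlier_iforest: only columns that actually exist in
# the frame are scored, so the hypothetical 'colX' below is silently skipped.
import numpy as np
import pandas as pd

frame = pd.DataFrame({'col1': np.random.RandomState(5).normal(size=500),
                      'col2': np.random.RandomState(6).normal(size=500)})
report = check_outlier_iforest(frame, cols=['col1', 'col2', 'colX'])
print(report)  # per-column share of rows flagged as outliers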
def IFdrop1(df):
    IForest = IsolationForest(random_state=0, n_jobs=-1, verbose=0,
                              contamination=0.03)
    # Assumes df flattens into a two-column feature matrix
    y_pred = IForest.fit_predict(df.values.reshape(-1, 2))
    return pd.DataFrame(y_pred)
def get_outliers_isolation_forest(
    df,
    n_estimators=100,
    contamination="auto",
    n_jobs=-1,
):
    print("\nGet outliers with Isolation Forest...")
    # Identify outliers in the training dataset
    iso = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        bootstrap=True,
        n_jobs=n_jobs,
        verbose=1,
        random_state=rnd_state,
    )
    # yhat = iso.fit_predict(df_X_train_processed)
    yhat = iso.fit_predict(df)
    print(f"Found {sum(yhat == -1)} outliers.")
    # Get mask for all TRAIN rows that are not outliers
    outliers_mask_train_iforest = yhat != -1
    joblib.dump(
        outliers_mask_train_iforest,
        data_processed_dir / "outliers_mask_train_iforest.joblib",
    )
    print(f"Outlier mask from Isolation Forest saved to {data_processed_dir}.")
    return outliers_mask_train_iforest
def main():
    '''
    The procedure contains two simple steps:

    - Scale the data to a standard distribution with zero mean and unit
      variance. This might be too simplistic.
    - Apply the isolation forest. The contamination level is set manually.
    '''
    domains = []
    raw = []

    with open(sys.argv[1]) as fhandle:
        for line in fhandle:
            record = json.loads(line.strip())
            for analyser in record['analysers']:
                if analyser['analyser'] == 'FeaturesGenerator':
                    raw.extend(analyser['output'])
                if analyser['analyser'] == 'WordSegmentation':
                    domains.extend(analyser['output'].keys())
            if len(raw) != len(domains):
                print(record)
                sys.exit(1)  # a mismatched record is a fatal error

    x_samples = scale(np.array(raw))
    engine = IsolationForest(contamination=0.015)
    y_samples = engine.fit_predict(x_samples)
    for index, y_sample in enumerate(y_samples):
        if y_sample == -1:
            print(domains[index])
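# An illustrative input record for main() above, with its structure inferred
# from the parsing code; all field values here are made-up placeholders.
example_line = (
    '{"analysers": ['
    '{"analyser": "FeaturesGenerator", "output": [[0.1, 0.9, 3.0]]}, '
    '{"analyser": "WordSegmentation", "output": {"example.com": ["example"]}}'
    ']}'
)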
def outlierElim(ids, data, cont=0.05):
    od = IsolationForest(contamination=cont)
    outlierIds = []
    for x in data:
        darr = data[x]
        f_outliers = od.fit_predict(darr)
        drop_o = np.nonzero(np.where(f_outliers == -1, 1, 0))[0]
        outlierIds.append(ids[drop_o])
    # Keep only ids flagged as outliers in more than 3 of the feature sets
    common = np.hstack(outlierIds)
    u, count_o = np.unique(common, return_counts=True)
    outlier = u[count_o > 3]
    print(outlier)
    _, _, outlier_ind = np.intersect1d(outlier, ids, return_indices=True)
    np.savetxt(output_fld + 'ids_outlier.csv', ids[outlier_ind],
               delimiter=",", fmt='%12.5f')
    ids = np.delete(ids, outlier_ind)
    np.savetxt(output_fld + 'ids_outlierDropped.csv', ids,
               delimiter=",", fmt='%12.5f')
    for x in data:
        data[x] = np.delete(data[x], outlier_ind, axis=0)
        np.savetxt(output_fld + x + '_outlierDropped.csv', data[x],
                   delimiter=",", fmt='%12.5f')
    return ids, data
def remove_outliers(model_name, X, y, **add_params):
    """
    For given X and y, removes detected outliers using either Isolation
    Forest or Local Outlier Factor.

    :param model_name: str - 'isf' (for Isolation Forest), 'lof' (for Local Outlier Factor)
    :param X: numpy array
    :param y: numpy array
    :param add_params: additional params for the Isolation Forest / Local Outlier Factor model
    :return: X, y with outliers removed
    """
    model_name = model_name.lower()
    if model_name == 'isf':
        # e.g. n_estimators=150, max_samples=0.8, max_features=0.8, contamination="auto"
        clf = IsolationForest(random_state=RANDOM_STATE, n_jobs=-1,
                              **add_params)
    elif model_name == 'lof':
        clf = LocalOutlierFactor(n_jobs=-1, **add_params)
    else:
        raise Exception("Choose one of predefined models ('isf' or 'lof')")
    results = clf.fit_predict(X)
    outliers = len(list(filter(lambda x: x == -1, results)))
    # Report the model that was actually used (the original always said
    # "Isolation forest", even for LOF)
    print("{} found {} outliers".format(model_name, outliers))
    removing_indices = [i for i in range(len(results)) if results[i] == -1]
    X_new = np.delete(X, removing_indices, axis=0)
    y_new = [y[yi] for yi in range(len(y)) if results[yi] == 1]
    return X_new, y_new
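# Sketch of remove_outliers with both backends; RANDOM_STATE is assumed to be
# a module-level constant as in the function body (the value below is a stand-in).
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

RANDOM_STATE = 0  # stand-in for the module-level constant
X_demo = np.random.RandomState(9).normal(size=(150, 4))
y_demo = np.zeros(150)
X_isf, y_isf = remove_outliers('isf', X_demo, y_demo, contamination=0.05)
X_lof, y_lof = remove_outliers('lof', X_demo, y_demo, n_neighbors=10)
print(X_isf.shape, X_lof.shape)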
def detect_outliers(train_data):
    outliers_list = []
    # Run the (randomized) forest many times and vote on the outliers;
    # the outer loop variable is renamed so it no longer shadows the inner
    # per-sample index, and the inner loop no longer skips index 0
    for run in range(1, 1000):
        clf = IsolationForest(contamination='auto')
        outliers_predict = clf.fit_predict(train_data)
        print("------------------- Isolation Forest ", run)
        outliers_id = []
        for i in range(len(outliers_predict)):
            if outliers_predict[i] == -1:
                outliers_id.append(i)
        outliers_list.append(outliers_id)

    my_dict = {}
    results = []
    for i in range(train_data.shape[0]):
        my_dict[i] = 0
    for l in outliers_list:
        for i in l:
            my_dict[i] += 1
    my_dict_s = sorted(my_dict.items(), key=lambda kv: kv[1])
    print(my_dict_s)
    # Keep the samples flagged as outliers in more than 900 of the 999 runs
    for i in my_dict.keys():
        if my_dict[i] > 900:
            results.append(i)
    return results
def outlier_removal(df, col, method, params):
    # Build the requested detector; the original if/if/else chain let
    # 'Isolation Forest' fall into the else branch (a no-op comparison),
    # so use a proper elif/else instead
    if method == 'Isolation Forest':
        do_outlier_removal = IsolationForest(**params)
    elif method == 'Local Outlier Factor':
        do_outlier_removal = LocalOutlierFactor(**params)
    else:
        raise ValueError(
            "method must be 'Isolation Forest' or 'Local Outlier Factor'")
    if method == 'Isolation Forest':
        do_outlier_removal.fit(np.array(df[col]))
        outlier_scores = do_outlier_removal.decision_function(
            np.array(df[col]))
        df[('meta', 'Outlier Scores - ' + method + str(params))] = \
            outlier_scores
        is_outlier = do_outlier_removal.predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
    if method == 'Local Outlier Factor':
        # LOF (novelty=False) only supports fit_predict, not fit + predict
        is_outlier = do_outlier_removal.fit_predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
        df[('meta', 'Outlier Factor - ' + method + str(params))] = \
            do_outlier_removal.negative_outlier_factor_
    return df, do_outlier_removal
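# Usage sketch for outlier_removal: the ('meta', ...) assignments imply the
# frame uses two-level MultiIndex columns, so the demo builds one; all names
# and values here are illustrative assumptions.
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

cols = pd.MultiIndex.from_tuples([('data', 'x'), ('data', 'y')])
df_demo = pd.DataFrame(np.random.RandomState(2).normal(size=(100, 2)),
                       columns=cols)
df_out, fitted = outlier_removal(df_demo,
                                 col=[('data', 'x'), ('data', 'y')],
                                 method='Isolation Forest',
                                 params={'contamination': 0.1})
print([c for c in df_out.columns if c[0] == 'meta'])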