def iforest(X_train, X_test, Y_train, Y_test):
    from pyod.models.iforest import IForest

    model = IForest(random_state=0)
    model.fit(X_train)  # unsupervised fit: Y_train is not used
    pred = model.predict(X_test)  # binary labels (0: inlier, 1: outlier)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return acc * 100
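# A minimal usage sketch for iforest() above (an assumption, not part of
# the original snippet): generate_data() from pyod.utils.data synthesises a
# labelled train/test split with a known outlier fraction, and np is
# assumed to be numpy.
import numpy as np
from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=0)
acc_percent = iforest(X_train, X_test, y_train, y_test)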
def outlier_detection(x_raw, y_raw):
    """
    Filter all outlier points

    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all outlier detection method candidates:
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised Representation Learning':
            XGBOD(contamination=outliers_fraction),
    }
    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outlier and 0 means inlier
    # (for sklearn, -1 means outlier and 1 means inlier)
    idx_y_pred = [i for i in range(x_raw.shape[0]) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert x_clean.shape[0] == y_clean.shape[0]
    return x_clean, y_clean
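# outlier_detection() calls a del_rowsorcolumns() helper defined elsewhere.
# A plausible implementation (an assumption, not the original code) is a
# thin wrapper around np.delete:
def del_rowsorcolumns(arr, idx, axis):
    """Remove the rows (axis=0) or columns (axis=1) listed in idx."""
    return np.delete(arr, idx, axis=axis)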
def detect_outliers(stocks: list, all_stocks_cip: pd.DataFrame, rules=None):
    """
    Returns a list of stocks identified as outliers, based on the provided rules.
    """
    if rules is None:
        rules = default_point_score_rules()
    str_rules = {str(r): r for r in rules}
    rows = []
    stocks_by_sector_df = stocks_by_sector()  # NB: ETFs in watchlist will have no sector
    stocks_by_sector_df.index = stocks_by_sector_df['asx_code']
    for stock in stocks:
        # print("Processing stock: ", stock)
        try:
            sector = stocks_by_sector_df.at[stock, 'sector_name']
            sector_companies = list(stocks_by_sector_df.loc[
                stocks_by_sector_df['sector_name'] == sector].asx_code)
            # day_low_high() may raise KeyError when data is currently
            # being fetched, so it appears here...
            day_low_high_df = day_low_high(stock, all_stocks_cip.columns)
        except KeyError:
            warning(None, "Unable to locate watchlist entry: {} - continuing without it".format(stock))
            continue
        state = {
            'day_low_high_df': day_low_high_df,  # never changes each day, so we init it here
            'all_stocks_change_in_percent_df': all_stocks_cip,
            'stock': stock,
            'daily_range_threshold': 0.20,  # 20% at either end of the daily range gets a point
        }
        points_by_rule = defaultdict(int)
        for date in all_stocks_cip.columns:
            market_avg = all_stocks_cip[date].mean()
            sector_avg = all_stocks_cip[date].filter(items=sector_companies).mean()
            stock_move = all_stocks_cip.at[stock, date]
            state.update({
                'market_avg': market_avg,
                'sector_avg': sector_avg,
                'stock_move': stock_move,
                'date': date
            })
            for rule_name, rule in str_rules.items():
                points_by_rule[rule_name] += rule(state)
        d = {'stock': stock}
        d.update(points_by_rule)
        rows.append(d)
    df = pd.DataFrame.from_records(rows)
    df = df.set_index('stock')
    print(df)
    from pyod.models.iforest import IForest
    clf = IForest()
    clf.fit(df)
    scores = clf.predict(df)
    # keep stocks (index values) flagged as outliers (score 1)
    results = [row[0] for row, value in zip(df.iterrows(), scores) if value > 0]
    # print(results)
    print("Found {} outlier stocks".format(len(results)))
    return results
def transform(self, df2: pd.DataFrame) -> pd.DataFrame:
    """Apply the transforms to the dataframe."""
    le = LabelEncoder()
    df2['mm'] = df2['make'] + ' ' + df2['model']
    # keep only make/model combinations with more than 100 listings
    # (after groupby().count(), every column holds the group size)
    g_mm_count = df2.groupby(['mm']).count().reset_index()
    mm_more_than_100 = g_mm_count[g_mm_count['make'] > 100]['mm']
    df2 = df2[df2['mm'].isin(mm_more_than_100)]
    dfn3 = df2.copy()
    g1 = dfn3.groupby('mm')
    clf1 = IForest(contamination=0.01)
    # flag outliers within each make/model group, then drop them
    dff1 = pd.DataFrame(columns=[
        'idv_id', 'kms_run', 'owners', 'age', 'Popularity Index',
        'quoted_price', 'outlier', 'dep_percentage'
    ])
    for idv_id, idv_id_df in g1:
        idv_id_df1 = idv_id_df[[
            'kms_run', 'owners', 'age', 'quoted_price', 'dep_percentage'
        ]]
        clf1.fit(idv_id_df1)
        y_pred = clf1.predict(idv_id_df1)
        idv_id_df['outlier'] = y_pred.tolist()
        dff1 = pd.concat([dff1, idv_id_df])
    outlier_idv_if_dff1 = set(dff1[dff1['outlier'] == 1].index)
    df2 = df2.drop(outlier_idv_if_dff1)
    df = df2.copy()
    X = df[[
        'make', 'model', 'city', 'variant', 'owners', 'kms_run', 'age',
        'Popularity Index', 'ex_showroom_price', 'fuel_type',
        'transmission', 'color'
    ]]
    # label-encode categorical columns and remember each mapping
    categorical_feature_mask = X.dtypes == object
    categorical_cols = X.columns[categorical_feature_mask].tolist()
    self.dic = {}
    for i in categorical_cols:
        X[i] = le.fit_transform(X[i])
        self.dic[i] = dict(zip(le.classes_, le.transform(le.classes_)))
    y = df[['dep_percentage']]
    aa = pd.concat([X, y], axis=1)
    return aa
def add_other_class(num, size, pad):
    res = pd.read_csv("data/train.txt", header=None).values
    tif_data = []
    for r in tqdm(range(res.shape[0])):
        img = get_cell(res[r][1], res[r][2], size)
        if img is None:
            print("img does NOT exist.", res[r])
            continue
        img = img.reshape(-1).tolist()
        tif_data.append([labels_key[res[r][0]]] + img)
    tif_data = np.array(tif_data)
    print(tif_data.shape)
    np.random.shuffle(tif_data)
    clf = IForest()
    clf.fit(tif_data[:, 1:])
    i = 0
    pos = []
    false_num = 0
    while True:
        ix = np.random.randint(pad, dataset.RasterXSize - pad)
        iy = np.random.randint(pad, dataset.RasterYSize - pad)
        t = get_cell(ix, iy, size)
        if t is None:
            continue
        t = t.reshape(1, -1)
        y_test_pred = clf.predict(t)[0]  # outlier labels (0 or 1)
        if y_test_pred == 1:
            i += 1
            pos.append(["其他"] + [ix, iy])  # "其他" = the "other" class label
            print("{}/{} added.".format(i, num))
        else:
            false_num += 1
            print("{}/{}: ({}, {}) not an outlier, skipped. false_num: {}".format(
                i, num, ix, iy, false_num))
        if i == num:
            break
    pos = np.concatenate((res, np.array(pos)), axis=0)
    print(Counter(pos[:, 0]))
    pd.DataFrame(pos).to_csv("data/train_enhance.txt", index=None, header=None)
    pos[:, 2] = -1 * (pos[:, 2].astype(int))  # np.int is deprecated; use int
    pd.DataFrame(pos).to_csv("data/train_enhance_view.txt", index=None,
                             header=None)
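# add_other_class() assumes a module-level GDAL `dataset` plus get_cell()
# and labels_key defined elsewhere. A hypothetical sketch of get_cell(),
# consistent with how it is called above (returns None when the window
# cannot be read):
def get_cell(ix, iy, size):
    half = size // 2
    if ix - half < 0 or iy - half < 0:
        return None
    if ix + half > dataset.RasterXSize or iy + half > dataset.RasterYSize:
        return None
    # ReadAsArray(xoff, yoff, xsize, ysize) returns the pixel window
    return dataset.ReadAsArray(int(ix - half), int(iy - half), size, size)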
class IForestWrapper:
    def __init__(self, **kwargs):
        self._model = IForest(**kwargs)

    def fit(self, X, T):
        # unsupervised learning: the targets T are not used
        self._model.fit(X)
        return self

    def predict(self, X):
        Y = self._model.predict(X)
        return Y

    def predict_proba(self, X):
        probs = self._model.predict_proba(X)
        return probs
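# A short usage sketch for IForestWrapper (the synthetic data below is an
# assumption for illustration). In pyod, predict_proba() returns one row
# per sample with [inlier, outlier] probabilities.
from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)
wrapper = IForestWrapper(contamination=0.1, random_state=42)
wrapper.fit(X_train, y_train)          # y_train is ignored internally
labels = wrapper.predict(X_test)       # 0 = inlier, 1 = outlier
probs = wrapper.predict_proba(X_test)  # shape (n_samples, 2)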
def main():
    dataset, label = pre_data()

    from numpy import nan as NA
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=NA, strategy="mean")
    dataset = imputer.fit_transform(dataset)

    x_train, x_test, y_train, y_label = train_test_split(dataset, label,
                                                         test_size=0.3,
                                                         random_state=44)
    # x_train, x_test, y_train, y_label = [], [], [], []
    # for i in range(1000):
    #     x_train.append(dataset[i])
    #     y_train.append(label[i])
    # for i in range(6000, 10000):
    #     x_train.append(dataset[i])
    #     y_train.append(label[i])
    # x_test = dataset[1000:6000]
    # y_label = label[1000:6000]

    from sklearn.metrics import accuracy_score, precision_score, recall_score

    for i in range(3):  # IForest is stochastic, so repeat a few runs
        clf_name = 'IForest'
        clf = IForest()
        clf.fit(x_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        print(accuracy_score(y_train, y_train_pred))
        print(precision_score(y_train, y_train_pred))
        print(recall_score(y_train, y_train_pred))

        # get the prediction on the test data
        y_test_pred = clf.predict(x_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(x_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)

        print("\nOn Test Data:")
        print(accuracy_score(y_label, y_test_pred))
        # the original repeated the training-set metrics here; the test
        # predictions are what should be scored:
        print(precision_score(y_label, y_test_pred))
        print(recall_score(y_label, y_test_pred))
        evaluate_print(clf_name, y_label, y_test_scores)
def get_IF_scores(dataframe, cols, outliers_fraction=0.01, standardize=True):
    '''Takes df, a list of selected column names, outliers_fraction = 0.01 default
    Returns: df with Isolation Forest (IF) scores added '''
    if standardize:
        # standardize selected variables
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # convert the dataframe to a numpy array for the algorithm
    arrays = []
    for col in cols:
        arrays.append(dataframe[col].values.reshape(-1, 1))
    X = np.concatenate(arrays, axis=1)

    # fit
    clf = IForest(contamination=outliers_fraction, random_state=0)
    clf.fit(X)

    # predict raw anomaly score
    scores_pred = clf.decision_function(X) * -1

    # prediction of a datapoint category: outlier or inlier
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    CheckOutliers.df3 = dataframe
    CheckOutliers.df3['outlier'] = y_pred.tolist()
    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, 'found with IForest')
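# A hedged usage sketch for get_IF_scores(), assuming pandas is imported
# as pd and the CheckOutliers holder class exists in this module; the toy
# columns are made up for illustration.
df = pd.DataFrame({
    'sales': [10.0, 12.0, 11.0, 13.0, 400.0],   # 400 is an obvious outlier
    'profit': [1.0, 2.0, 1.0, 2.0, 90.0],
})
get_IF_scores(df, ['sales', 'profit'], outliers_fraction=0.2)
print(CheckOutliers.df3['outlier'].tolist())  # e.g. [0, 0, 0, 0, 1]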
contamination = 0.1  # percentage of outliers
n_train = 200  # number of training points
n_test = 100  # number of testing points

X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train, n_test=n_test, contamination=contamination)

# train IForest detector
clf_name = 'IForest'
clf = IForest()
clf.fit(X_train)

# get the prediction labels and decision_scores_ on the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# visualize the results
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
          y_test_pred, show_figure=True, save_figure=False)
# print(data['s'])
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

# fit the data with pyod's IForest algorithm
clf_name = 'IForest'
clf = IForest()
clf.fit(X_train)

# the prediction is an array of 0s and 1s: 1 marks an outlier, 0 an inlier
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores of the training data

# predict whether each test sample is an outlier (array of 0s and 1s)
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(
    X_test)  # outlier scores: the anomaly score of the input samples

# use sklearn's roc_auc_score to get the AUC, i.e. the area under the ROC curve
try:
    sumAuc_train += sklearn.metrics.roc_auc_score(y_train, y_train_scores,
                                                  average='macro')
    sumAuc_test += sklearn.metrics.roc_auc_score(y_test, y_test_scores,
                                                 average='macro')
    # s = precision_score(y_train, y_train_scores, average='macro')
    i += 1
    print(sumAuc_train, sumAuc_test)
except ValueError:
    pass  # AUC is undefined when only one class is present in y
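# Assuming the fragment above sits inside a cross-validation loop that
# accumulates AUC over i successful iterations, the averages would be
# reported after the loop, e.g.:
print('mean train AUC: {:.4f}'.format(sumAuc_train / i))
print('mean test AUC:  {:.4f}'.format(sumAuc_test / i))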
class TestIForest(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'estimators_') and
                self.clf.estimators_ is not None)
        assert (hasattr(self.clf, 'estimators_samples_') and
                self.clf.estimators_samples_ is not None)
        assert (hasattr(self.clf, 'max_samples_') and
                self.clf.max_samples_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        clone_clf = clone(self.clf)  # cloning must not raise

    def tearDown(self):
        pass
class TestIForest(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'estimators_') and
                    self.clf.estimators_ is not None)
        assert_true(hasattr(self.clf, 'estimators_samples_') and
                    self.clf.estimators_samples_ is not None)
        assert_true(hasattr(self.clf, 'max_samples_') and
                    self.clf.max_samples_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
#############################################
# And the conversion.

if IForest is not None:
    onx = to_onnx(model1, initial_types=initial_type, target_opset=14)

###############################################
# Checking discrepancies
# ++++++++++++++++++++++

if IForest is not None:
    data = sc_data.astype(np.float32)
    expected_labels = model1.predict(data)
    expected_proba = model1.predict_proba(data)

    sess = InferenceSession(onx.SerializeToString())
    res = sess.run(None, {'float_input': data})
    onx_labels = res[0]
    onx_proba = res[1]

    diff_labels = np.abs(onx_labels.ravel() - expected_labels.ravel()).max()
    diff_proba = np.abs(onx_proba.ravel() - expected_proba.ravel()).max()

    print("discrepancies:", diff_labels, diff_proba)
    print("ONNX labels", onx_labels)
    print("ONNX probabilities", onx_proba)
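# The conversion above assumes model1 (a fitted detector), sc_data (a
# float feature array) and initial_type were prepared earlier. A plausible
# setup (an assumption, not the original code):
import numpy as np
from skl2onnx.common.data_types import FloatTensorType

sc_data = np.random.randn(100, 3).astype(np.float32)
model1 = IForest(n_estimators=10)
model1.fit(sc_data)
initial_type = [('float_input', FloatTensorType([None, sc_data.shape[1]]))]
# note: converting a pyod model this way also requires a custom converter
# to be registered with skl2onnx beforehand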
def train_model(request):
    global clf
    if request.method == 'POST':
        try:
            json_data = json.loads(request.body)
            print(json_data)
            file = json_data['file']
            data = pd.read_csv(file)
            data = data.fillna(0)

            # clean 'Birth year': drop zeros, strip slashes, cast to int
            s = data["Birth year"]
            data["Birth year"] = s[s != 0].str.replace("/", "").astype(int)
            data = data.fillna(0)
            data['Uid'] = data['Uid'].astype(str).str.replace(
                ' ', '').astype(float)

            X1 = data['Birth year'].values.reshape(-1, 1)
            X2 = data['Uid'].values.reshape(-1, 1)
            X = np.concatenate((X1, X2), axis=1)

            outliers_fraction = 0.01
            # leftover plotting setup, unused below
            xx, yy = np.meshgrid(np.linspace(0, 1, 100),
                                 np.linspace(0, 1, 100))

            clf = IForest(contamination=outliers_fraction, random_state=0)
            clf.fit(X)

            # predict raw anomaly score
            scores_pred = clf.decision_function(X) * -1

            # prediction of a datapoint category: outlier or inlier
            y_pred = clf.predict(X)
            n_inliers = len(y_pred) - np.count_nonzero(y_pred)
            n_outliers = np.count_nonzero(y_pred == 1)
            plt.figure(figsize=(8, 8))

            data1 = data  # alias of the dataframe (not a copy)
            data['outlier'] = y_pred.tolist()

            # Uid - inlier feature 1, Birth year - inlier feature 2
            inliers_Uid = np.array(
                data['Uid'][data['outlier'] == 0]).reshape(-1, 1)
            inliers_Birth_year = np.array(
                data['Birth year'][data['outlier'] == 0]).reshape(-1, 1)

            # Uid - outlier feature 1, Birth year - outlier feature 2
            outliers_Uid = data1['Uid'][
                data1['outlier'] == 1].values.reshape(-1, 1)
            outliers_Birth_year = data1['Birth year'][
                data1['outlier'] == 1].values.reshape(-1, 1)

            print('OUTLIERS: ', n_outliers, 'INLIERS: ', n_inliers)
            output = {'OUTLIERS ': n_outliers, 'INLIERS ': n_inliers}
            return JsonResponse(output)
        except Exception as e:
            # report the error instead of passing the Exception class itself
            return JsonResponse({'error': str(e)}, safe=False)
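# A hedged example of calling the view above, assuming it is routed at
# /train_model/ (hypothetical URL and CSV path):
import requests

resp = requests.post('http://localhost:8000/train_model/',
                     json={'file': 'data/users.csv'})
print(resp.json())  # e.g. {'OUTLIERS ': 3, 'INLIERS ': 297}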
contamination = 0.1  # percentage of outliers
n_train = 200  # number of training points
n_test = 100  # number of testing points

# Generate sample data
X_train, y_train, X_test, y_test = \
    generate_data(n_train=n_train,
                  n_test=n_test,
                  n_features=2,
                  contamination=contamination,
                  random_state=42)

# train IForest detector
clf_name = 'IForest'
clf = IForest()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# visualize the results
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
          y_test_pred, show_figure=True, save_figure=False)
class TestIForest(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = IForest(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # fail when a fitted attribute is missing or unset
        if not hasattr(self.clf, 'decision_scores_') or \
                self.clf.decision_scores_ is None:
            self.fail('decision_scores_ is not set')
        if not hasattr(self.clf, 'labels_') or self.clf.labels_ is None:
            self.fail('labels_ is not set')
        if not hasattr(self.clf, 'threshold_') or self.clf.threshold_ is None:
            self.fail('threshold_ is not set')
        if not hasattr(self.clf, 'estimators_') or self.clf.estimators_ is None:
            self.fail('estimators_ is not set')
        if not hasattr(self.clf, 'estimators_samples_') or \
                self.clf.estimators_samples_ is None:
            self.fail('estimators_samples_ is not set')
        if not hasattr(self.clf, 'max_samples_') or self.clf.max_samples_ is None:
            self.fail('max_samples_ is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
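# These test classes are typically executed with the standard unittest
# runner:
if __name__ == '__main__':
    unittest.main()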
def detect_outliers(stocks: list, all_stocks_cip: pd.DataFrame, rules=None):
    """
    Returns a list of stocks identified as outliers, based on the provided
    rules. all_stocks_cip is the "change in percent" dataframe for at least
    the stocks present in the specified list.
    """
    if rules is None:
        rules = default_point_score_rules()
    str_rules = {str(r): r for r in rules}
    rows = []
    stocks_by_sector_df = (
        stocks_by_sector()
    )  # NB: ETFs in watchlist will have no sector
    stocks_by_sector_df.index = stocks_by_sector_df["asx_code"]
    for stock in stocks:
        # print("Processing stock: ", stock)
        try:
            sector = stocks_by_sector_df.at[stock, "sector_name"]
            sector_companies = list(stocks_by_sector_df.loc[
                stocks_by_sector_df["sector_name"] == sector].asx_code)
            # day_low_high() may raise KeyError when data is currently
            # being fetched, so it appears here...
            day_low_high_df = day_low_high(stock, all_stocks_cip.columns)
        except KeyError:
            warning(
                None,
                "Unable to locate watchlist entry: {} - continuing without it".format(stock),
            )
            continue
        state = {
            "day_low_high_df": day_low_high_df,  # never changes each day, so we init it here
            "all_stocks_change_in_percent_df": all_stocks_cip,
            "stock": stock,
            "daily_range_threshold": 0.20,  # 20% at either end of the daily range gets a point
        }
        points_by_rule = defaultdict(int)
        for date in all_stocks_cip.columns:
            market_avg = all_stocks_cip[date].mean()
            sector_avg = all_stocks_cip[date].filter(
                items=sector_companies).mean()
            stock_move = all_stocks_cip.at[stock, date]
            state.update({
                "market_avg": market_avg,
                "sector_avg": sector_avg,
                "stock_move": stock_move,
                "date": date,
            })
            for rule_name, rule in str_rules.items():
                try:
                    points_by_rule[rule_name] += rule(state)
                except TypeError:  # handle nan's in dataset safely
                    pass
        d = {"stock": stock}
        d.update(points_by_rule)
        rows.append(d)
    df = pd.DataFrame.from_records(rows)
    df = df.set_index("stock")
    # print(df)
    clf = IForest()
    clf.fit(df)
    scores = clf.predict(df)
    # keep stocks (index values) flagged as outliers (score 1)
    results = [
        row[0] for row, value in zip(df.iterrows(), scores) if value > 0
    ]
    # print(results)
    print("Found {} outlier stocks".format(len(results)))
    return results
class Remove_Outliers(BaseEstimator, TransformerMixin):
    def __init__(self, target, contamination=.20, random_state=42,
                 methods=['knn', 'iso', 'mcd']):
        self.target = target
        self.contamination = contamination
        self.random_state = random_state
        self.methods = methods

    def fit(self, data, y=None):
        return self  # sklearn convention: fit returns the estimator

    def transform(self, data, y=None):
        return data

    def fit_transform(self, dataset, y=None):
        data = dataset.copy()

        if 'iso' in self.methods:
            self.iso_forest = IForest(contamination=self.contamination,
                                      random_state=self.random_state,
                                      behaviour='new')
            self.iso_forest.fit(data.drop(self.target, axis=1))
            iso_predict = self.iso_forest.predict(
                data.drop(self.target, axis=1))
            data['iso'] = iso_predict

        if 'knn' in self.methods:
            self.knn_out = KNN(contamination=self.contamination)
            self.knn_out.fit(data.drop(self.target, axis=1))
            knn_predict = self.knn_out.predict(data.drop(self.target, axis=1))
            data['knn'] = knn_predict

        if 'pca' in self.methods:
            self.out_pca = PCA_RO(contamination=self.contamination,
                                  random_state=self.random_state)
            self.out_pca.fit(data.drop(self.target, axis=1))
            pca_predict = self.out_pca.predict(data.drop(self.target, axis=1))
            data['pca'] = pca_predict

        # use for those features which are gaussian distributed
        if 'mcd' in self.methods:
            self.mcd = EllipticEnvelope(contamination=0.01)
            self.mcd.fit(data.drop(self.target, axis=1))
            mcd_predict = self.mcd.predict(data.drop(self.target, axis=1))
            data['mcd'] = mcd_predict

        # a row is an outlier only if every chosen method flags it
        data['vote_outlier'] = 0
        for i in self.methods:
            data['vote_outlier'] = data['vote_outlier'] + data[i]
        self.outliers = data[data['vote_outlier'] == len(self.methods)]

        return dataset[~dataset.index.isin(self.outliers.index)]
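# A brief usage sketch for Remove_Outliers on a made-up frame (column
# names are assumptions): rows flagged by every chosen method are dropped.
import pandas as pd

df = pd.DataFrame({
    'feat_a': [1.0, 1.1, 0.9, 1.2, 1.0, 25.0],
    'feat_b': [3.0, 2.9, 3.1, 3.2, 3.0, -40.0],
    'price': [10.0, 11.0, 10.0, 12.0, 11.0, 10.0],
})
remover = Remove_Outliers(target='price', contamination=0.2,
                          methods=['iso', 'knn'])
clean_df = remover.fit_transform(df)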