import warnings

import joblib
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor

# `SQLServer` is a project-internal helper for reading/writing SQL Server tables
# (its import path is not shown in this extract).


class LocalOutlierFactorFilter:
    """
    Local Outlier Factor based noise filter.

    Training and prediction happen in one step; there are no separate
    train/test interfaces.

    Key parameters:
        n_neighbors : int, optional (default=20)
            Number of points involved in each prediction; there is no obvious
            rule of thumb for choosing it.
        contamination : float
            Reflects the filtering strength; larger values filter more aggressively.
    """

    def __init__(self, name="Local Outlier Factor"):
        self._model = LocalOutlierFactor()
        self.name = name

    def get_params(self, deep=True):
        """Return the model parameters."""
        return self._model.get_params(deep=deep)

    def _get_valid_params(self):
        """
        Return the valid parameter names.

        :return: list
        """
        return list(self.get_params().keys())

    def set_params(self, **new_params):
        """
        Set model parameters.

        Only hyper-parameters the model actually exposes are passed on.

        :param new_params: parameter name/value pairs
        """
        valid_params = self._get_valid_params()
        for k in new_params:
            if k not in valid_params:
                raise ValueError("Received a parameter the model does not support: %s" % k)
        feed_dict = {k: v for k, v in new_params.items() if k in valid_params}
        if len(feed_dict) == 0:
            warnings.warn("No model parameters were modified")
        self._model.set_params(**feed_dict)

    def fit_predict(self, x):
        """
        Fit the detector and label the data in one step.

        :param x: training data
        :return: inlier/outlier labels (1 = inlier, -1 = outlier)
        """
        return self._model.fit_predict(x)

    def _connect_SQL(self, **json_file):
        """
        Connect to the SQL server described in the request.

        :param json_file: request parameters
        :return: None
        """
        json_dict = json_file
        self._SQL = SQLServer(host=json_dict['dbinfo']['ip'],
                              port=json_dict['dbinfo']['port'],
                              user=json_dict['dbinfo']['username'],
                              pwd=json_dict['dbinfo']['password'],
                              db=json_dict['dbinfo']['databasename'])

    def get_data_label(self, **json_file):
        """
        Read the label column of the dataset from the database.

        :param json_file: request parameters
        :return: pd.DataFrame containing only the label column
        """
        json_dict = json_file
        data_label = self._SQL.df_read_sqlserver(
            table=json_dict['dbinfo']['inputtable'],
            cols=json_dict['label_columns'])
        if data_label.shape[1] != 1:
            raise ValueError("Error: the label selection does not contain exactly one column")
        return data_label

    def get_data_features(self, **json_file):
        """
        Read the feature columns of the dataset from the database.

        :param json_file: request parameters (JSON)
        :return: pd.DataFrame containing only the feature columns
        """
        json_dict = json_file
        data_features = self._SQL.df_read_sqlserver(
            table=json_dict['dbinfo']['inputtable'],
            cols=json_dict['data_columns'])
        return data_features

    def train_predict_from_sql(self, **json_file):
        """
        Fit the detector on data read from SQL Server, save the model,
        and write the predictions back.

        :param json_file: request parameters (JSON)
        :return: success / failure info
        """
        try:
            self._connect_SQL(**json_file)
            self.set_params(**json_file["model_params"])
            features = self.get_data_features(**json_file)
            pre = pd.DataFrame(self.fit_predict(features))
            self._model.columns = features.columns.values.tolist()
            self.save_model(json_file["model_path"])  # save for now
            pre.columns = ["label"]
            pre.to_csv(json_file["save_path"], index=False)
            write = self._SQL.df_write_sqlserver(
                table=json_file['dbinfo']['outputtable'],
                df=pre,
                cols=json_file['data_columns'])
            return {"info": write}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def train_predict_from_csv(self, **json):
        try:
            features = pd.read_csv(json["path"], usecols=json['data_columns'])
            self.set_params(**json["model_params"])
            pre = pd.DataFrame(self.fit_predict(features))
            self._model.columns = json['data_columns']
            self.save_model(json["model_path"])  # save for now
            pre.columns = ["label"]
            pre.to_csv(json["save_path"], index=False)
            return {"info": "success"}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def train_predict_from_xls(self, **json):
        try:
            features = pd.read_excel(json["path"], usecols=json['data_columns'])
            self.set_params(**json["model_params"])
            pre = pd.DataFrame(self.fit_predict(features))
            self._model.columns = json['data_columns']
            self.save_model(json["model_path"])  # save for now
            pre.columns = ["label"]
            pre.to_csv(json["save_path"], index=False)
            return {"info": "success"}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def save_model(self, model_path):
        """
        Persist the model to disk.

        :param model_path: path to save the model to
        :return: failure info on error
        """
        try:
            joblib.dump(self._model, model_path)
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def get_model(self):
        """
        Return the underlying model.

        :return: the model
        """
        try:
            return self._model
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def load_model(self, **json):
        model_path = json['model_path']
        self._model = joblib.load(model_path)
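# Usage sketch for the filter above. The request dict mirrors the JSON payload that the
# train_predict_* methods expect; every path, column name, and parameter value here is a
# hypothetical placeholder for illustration, not something defined by this project.
def _demo_lof_filter_from_csv():
    request = {
        "path": "data/train.csv",                                    # hypothetical input CSV
        "data_columns": ["f1", "f2", "f3"],                          # hypothetical feature columns
        "model_params": {"n_neighbors": 20, "contamination": 0.1},   # valid LocalOutlierFactor params
        "model_path": "models/lof_filter.pkl",                       # hypothetical model dump path
        "save_path": "output/lof_labels.csv",                        # hypothetical prediction output
    }
    lof_filter = LocalOutlierFactorFilter()
    # Returns {"info": "success"} on success; the saved labels are 1 (inlier) / -1 (outlier).
    return lof_filter.train_predict_from_csv(**request)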
import os
import time
from collections import Counter
from os.path import join

import lightgbm as lgb
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor

# Project-internal helpers used below (imported elsewhere in the repo):
# add_noise_to_majority, update_by_outlier_prediction, outlier_detection,
# file_log, load_santander, generate_detectors, list2str, tool, simple_plot.


def active_modify_label_only_training_set(param, X, y, detector_name, detector,
                                          noise_true_ratio=0.1, threshold=0.5, repeat=10,
                                          random_state=0, log_identifier="", verbose=False,
                                          id=0, dataset='toy'):
    """
    Inject label noise into the training split, run the given outlier detector on the
    (noised) minority class to clean it, train a LightGBM classifier on the cleaned
    labels, and log detection and classification metrics averaged over `repeat` splits.
    """
    all_confusion_matrix = []
    total_time = 0
    detection_errors = 0
    n_instances, n_features = X.shape
    outlier_detection_confusion_matrix = []
    counter = Counter(y)
    minority_class = min(counter, key=counter.get)
    roc = []
    noise_proportition = 0
    accs = []
    added_noise_count = 0
    for i in range(repeat):
        model = lgb.LGBMClassifier(
            boosting_type="gbdt",
            learning_rate=param["learning_rate"],
            n_estimators=param["n_estimators"],
            max_depth=param["max_depth"],
            num_leaves=param["num_leaves"],
            objective="binary",
            seed=random_state
        )
        # model = RandomForestClassifier(
        #     n_estimators=param["n_estimators"],
        #     max_depth=param["max_depth"],
        #     random_state=random_state
        # )
        train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=i)
        true_minority_indices = np.array(train_y == minority_class)
        current_noise_count = int(noise_true_ratio * true_minority_indices.sum())
        noised_train_y, groundtruth = add_noise_to_majority(train_y, current_noise_count,
                                                            random_state=i, verbose=verbose)
        added_noise_count += Counter(groundtruth)[-1]
        noised_minority_indices = noised_train_y == minority_class
        noise_proportition = 1 - true_minority_indices.sum() / noised_minority_indices.sum()
        current_noise_true_ratio = round(noise_proportition / (1 - noise_proportition), 3)
        if detector.__class__ == LocalOutlierFactor:
            detector = LocalOutlierFactor(n_neighbors=5, contamination=min(noise_proportition, 0.5))
            detector.set_params(contamination=noise_proportition)
        noised_minority_X = train_X[noised_minority_indices]
        if noise_proportition > 0:
            start = time.time()
            if verbose:
                print(detector_name, "start detecting")
            # predict and update
            outlier_prediction = outlier_detection.omni_detector_detect(detector, noised_minority_X)
            expanded_outlier_prediction = np.ones(shape=(len(train_y)))
            expanded_outlier_prediction[noised_minority_indices] = outlier_prediction
            updated_y = update_by_outlier_prediction(noised_train_y, expanded_outlier_prediction)
            # collect stats info
            n_errors = (outlier_prediction != groundtruth[noised_minority_indices]).sum()
            detection_cf = confusion_matrix(groundtruth[noised_minority_indices], outlier_prediction)
            outlier_detection_confusion_matrix.append(np.ravel(detection_cf))
            total_time = time.time() - start
            detection_errors += n_errors
            if verbose:
                print(detector_name, "finish detecting", time.time() - start, n_errors)
        else:
            updated_y = train_y
            expanded_outlier_prediction = np.ones_like(train_y)
            detection_cf = confusion_matrix(groundtruth[noised_minority_indices],
                                            groundtruth[noised_minority_indices])
            outlier_detection_confusion_matrix.append([0, 0, 0, detection_cf[0][0]])
        trial_id = "{}-{}-{:.3f}".format(detector_name, id, noise_true_ratio)
        # train model
        lgb_model = model.fit(train_X, updated_y)
        # predict training set
        prediction_training = lgb_model.predict(train_X)
        # # save plot: based on updated_y
        # training_predicted_result = updated_y + prediction_training * 2
        # colors, _ = tool.category2color(training_predicted_result, {
        #     0: "#5079a5",
        #     1: "#dd565c",
        #     2: "#79b7b2",
        #     3: "#ef8e3b"
        # })
        # trial_id = "{}-{}-{:.4f}".format(detector_name, id, noise_true_ratio)
        # # save plots
        # path = join("figures", "gaussian-using-noised-label", trial_id + ".png")
        # simple_plot.save_plot2png(train_X[:, 0], train_X[:, 1], colors, path, noise_true_ratio)
        # # save plot: based on true labels
        # training_predicted_result = train_y + prediction_training * 2
        # colors, _ = tool.category2color(training_predicted_result, {
        #     0: "#5079a5",
        #     1: "#dd565c",
        #     2: "#79b7b2",
        #     3: "#ef8e3b"
        # })
        # # save plots
        # folder = join("figures", "gaussian-using-true-label", detector_name)
        # if not os.path.exists(folder):
        #     os.mkdir(folder)
        # path = join(folder, trial_id + ".png")
        # simple_plot.save_plot2png(train_X[:, 0], train_X[:, 1], colors, path, noise_true_ratio)

        # save detection and classification result
        all_info = np.concatenate((train_X,
                                   train_y[:, np.newaxis],
                                   groundtruth[:, np.newaxis],
                                   expanded_outlier_prediction[:, np.newaxis],
                                   prediction_training[:, np.newaxis]
                                   ), axis=1)
        result_root = os.path.join("outlier-result", log_identifier)
        if not os.path.exists(result_root):
            os.mkdir(result_root)
        np.savetxt(os.path.join(result_root, trial_id + ".csv"), all_info, delimiter=',', fmt='%d')
        # predict testing set
        predicted_proba = lgb_model.predict_proba(test_X)
        prediction_proba = predicted_proba[:, 1]
        prediction = np.where(prediction_proba > threshold, [1], [0])
        # metrics
        auc = roc_auc_score(test_y, prediction_proba)
        acc = accuracy_score(test_y, prediction)
        accs.append(acc)
        roc.append(auc)
        conf_mat = confusion_matrix(test_y, prediction)
        conf_mat = np.ravel(conf_mat)
        all_confusion_matrix.append(conf_mat)
    # aggregate metric results
    aggregated_conf_mat = np.array(all_confusion_matrix).mean(axis=0)
    aggregated_detection_conf_mat = np.array(outlier_detection_confusion_matrix).mean(axis=0)
    # current_noise_count = noise_proportition / (1 - noise_proportition)
    current_noise_true_ratio = noise_true_ratio
    roc_mean = np.array(roc).mean()
    acc_mean = np.array(accs).mean()
    print("average noise count", added_noise_count / repeat)
    majority_class = 1 - minority_class
    classification_conf_mat = np.array(aggregated_conf_mat).ravel().tolist()
    detection_conf_mat = np.array(aggregated_detection_conf_mat).ravel().tolist()
    metric_bundle = [current_noise_true_ratio, detector_name, roc_mean, acc_mean] \
        + classification_conf_mat + detection_conf_mat
    file_log(log_identifier, *metric_bundle)
    return metric_bundle
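# A minimal sketch (an assumption, not the project's actual implementation) of the
# label-update step used above. It assumes binary 0/1 class labels and the scikit-learn
# convention that detectors return -1 for outliers and 1 for inliers; the real
# `update_by_outlier_prediction` helper lives elsewhere in this repo and may differ.
def _update_by_outlier_prediction_sketch(labels, outlier_prediction):
    labels = np.asarray(labels).copy()
    flagged = np.asarray(outlier_prediction) == -1   # samples the detector marked as noise
    labels[flagged] = 1 - labels[flagged]            # flip the label of flagged samples (0 <-> 1)
    return labels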
    conf_mat = confusion_matrix(groundtruth, outlier_prediction)
    n_errors = conf_mat[0][1] + conf_mat[1][0]
    print(detector_name, list2str(np.ravel(conf_mat)), n_errors)
    # write the cleaned minority-class labels back into a full-length copy of y
    minority_updated_y = update_by_outlier_prediction(y[y == minority_class], outlier_prediction)
    updated_y = np.array(y)
    updated_y[y == minority_class] = minority_updated_y
    return updated_y


if __name__ == '__main__':
    d = LocalOutlierFactor(n_neighbors=35, contamination=0.05)
    d.set_params(contamination=0.2)
    noise_ratio = 0.2
    seed = 40
    X, y = load_santander()
    n_samples, n_features = X.shape
    detectors = generate_detectors(n_samples, n_features, noise_ratio, random_state=seed)
    param = {
        'learning_rate': 0.1,
        'max_depth': 5,
        'n_estimators': 100,
        'num_leaves': 30
    }
    model = lgb.LGBMClassifier(
        boosting_type="gbdt",
        learning_rate=param["learning_rate"],