Example #1
import warnings

import joblib
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor

# SQLServer is a project-local helper wrapping SQL Server reads and writes
# (df_read_sqlserver / df_write_sqlserver); import it from the surrounding
# project.

class LocalOutlierFactorFilter:
    """
    训练与预测一体,没有单独的train和test接口
    关键参数:n_neighbors : int, optional (default=20):参与预测的点的数量,无明显规律
    contamination": 可以反映过滤强度, 越大过滤强度越大
    """
    def __init__(self, name="Local Outlier Factor"):
        self._model = LocalOutlierFactor()
        self.name = name

    def get_params(self, deep=True):
        """
        Get the model parameters.
        """
        return self._model.get_params(deep=deep)

    def _get_valid_params(self):
        """
        获取有效参数
        :return: List
        """
        param = self.get_params()
        return [i for i in param.keys()]

    def set_params(self, **new_params):
        """
        设置模型参数
        :param new_params: 模型参数键值
        只将模型参数包含的超参赋值给模型
        :return:
        """
        for k in new_params.keys():
            if k not in self._get_valid_params():
                raise ValueError("传入参数含有模型中不包含的参数")
                break
        feed_dict = {
            k: v
            for k, v in new_params.items() if k in self._get_valid_params()
        }
        if len(feed_dict) == 0:
            warnings.warn("模型参数未被修改")
        self._model.set_params(**feed_dict)

    def fit_predict(self, x):
        """
        Fit the model and predict in a single step.
        :param x: training data
        :return: predicted labels (1 for inliers, -1 for outliers)
        """
        return self._model.fit_predict(x)

    def _connect_SQL(self, **json_file):
        """
        连接到SQL
        :param json_file: 入参
        :return:None
        """
        json_dict = json_file
        self._SQL = SQLServer(host=json_dict['dbinfo']['ip'],
                              port=json_dict['dbinfo']['port'],
                              user=json_dict['dbinfo']['username'],
                              pwd=json_dict['dbinfo']['password'],
                              db=json_dict['dbinfo']['databasename'])

    def get_data_label(self, **json_file):
        """
        从数据库调取数据集的标签
        :param json_file:
        :return: 仅含有标签的数据集 pd.dataFrame
        """
        json_dict = json_file
        data_label = self._SQL.df_read_sqlserver(
            table=json_dict['dbinfo']['inputtable'],
            cols=json_dict['label_columns'])
        if data_label.shape[1] != 1:
            raise ValueError("Error: the number of label columns is not 1")
        return data_label

    def get_data_features(self, **json_file):
        """
        从数据库调取数据集
        :param json_file:入参, json
        :return: 仅含有特征变量的数据集 pd.dataFrame
        """
        json_dict = json_file
        data_features = self._SQL.df_read_sqlserver(
            table=json_dict['dbinfo']['inputtable'],
            cols=json_dict['data_columns'])
        return data_features

    def train_predict_from_sql(self, **json_file):
        """
        训练模型并将模型保存
        :param json_file: 入参,json
        :return:是否成功
        """
        try:
            self._connect_SQL(**json_file)
            self.set_params(**json_file["model_params"])
            features = self.get_data_features(**json_file)
            pre = pd.DataFrame(self.fit_predict(features))
            self._model.columns = features.columns.values.tolist()
            self.save_model(json_file["model_path"])  # save the model for now
            pre.columns = ["label"]
            pre.to_csv(json_file["save_path"], index=False)
            write = self._SQL.df_write_sqlserver(
                table=json_file['dbinfo']['outputtable'],
                df=pre,
                cols=pre.columns.tolist())
            return {"info": write}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def train_predict_from_csv(self, **json):
        """
        Train on a CSV file and write the predictions to save_path.
        :param json: input parameters, JSON
        :return: whether the operation succeeded
        """
        try:
            features = pd.read_csv(json["path"], usecols=json['data_columns'])
            self.set_params(**json["model_params"])
            pre = pd.DataFrame(self.fit_predict(features))
            self._model.columns = json['data_columns']
            self.save_model(json["model_path"])  # 暂时保存
            pre.columns = ["label"]
            pre.to_csv(json["save_path"], index=False)
            return {"info": "success"}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def train_predict_from_xls(self, **json):
        """
        Train on an Excel file and write the predictions to save_path.
        :param json: input parameters, JSON
        :return: whether the operation succeeded
        """
        try:
            features = pd.read_excel(json["path"],
                                     usecols=json['data_columns'])
            self.set_params(**json["model_params"])
            pre = pd.DataFrame(self.fit_predict(features))
            self._model.columns = json['data_columns']
            self.save_model(json["model_path"])  # save the model for now
            pre.columns = ["label"]
            pre.to_csv(json["save_path"], index=False)
            return {"info": "success"}
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def save_model(self, model_path):
        """
        保存模型
        :param model_path: 模型保存路径
        :return:是否成功
        """
        try:
            joblib.dump(self._model, model_path)
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def get_model(self):
        """
        调用模型
        :return:模型
        """
        try:
            return self._model
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def load_model(self, **json):
        """
        Load a previously saved model from disk.
        :param json: must contain 'model_path'
        """
        model_path = json['model_path']
        self._model = joblib.load(model_path)
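
A minimal usage sketch (not part of the original source): exercising the filter on a small in-memory DataFrame. fit_predict follows sklearn's convention of 1 for inliers and -1 for outliers; the column names and values here are illustrative only.

import pandas as pd

lof_filter = LocalOutlierFactorFilter()
lof_filter.set_params(n_neighbors=3, contamination=0.2)
data = pd.DataFrame({
    "x1": [0.0, 0.1, 0.2, 0.1, 10.0],
    "x2": [0.0, 0.1, 0.1, 0.2, 9.5],
})
labels = lof_filter.fit_predict(data)  # e.g. array([ 1,  1,  1,  1, -1])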
Example #2
import os
import time
from collections import Counter

import lightgbm as lgb
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor

# Project-local helpers used below (add_noise_to_majority,
# update_by_outlier_prediction, outlier_detection, file_log, load_santander,
# generate_detectors) come from the surrounding repository and are not shown.

def active_modify_label_only_training_set(param, X, y, detector_name, detector,
                                          noise_true_ratio=0.1, threshold=0.5,
                                          repeat=10, random_state=0,
                                          log_identifier="", verbose=False,
                                          id=0, dataset='toy'):
    all_confusion_matrix = []
    total_time = 0
    detection_errors = 0
    n_instances, n_features = X.shape
    outlier_detection_confusion_matrix = []
    counter = Counter(y)
    minority_class = min(counter, key=counter.get)
    roc = []
    noise_proportion = 0
    accs = []
    added_noise_count = 0
    for i in range(repeat):
        model = lgb.LGBMClassifier(
            boosting_type="gbdt",
            learning_rate=param["learning_rate"],
            n_estimators=param["n_estimators"],
            max_depth=param["max_depth"],
            num_leaves=param["num_leaves"],
            objective="binary",
            seed=random_state
        )

        # model = RandomForestClassifier(
        #     n_estimators=param["n_estimators"],
        #     max_depth=param["max_depth"],
        #     random_state=random_state
        # )

        train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2,
                                                            random_state=i)
        true_minority_indices = np.array(train_y == minority_class)
        current_noise_count = int(noise_true_ratio * true_minority_indices.sum())
        noised_train_y, groundtruth = add_noise_to_majority(train_y, current_noise_count, random_state=i, verbose=verbose)

        added_noise_count += Counter(groundtruth)[-1]

        noised_minority_indices = noised_train_y == minority_class
        noise_proportion = 1 - true_minority_indices.sum() / noised_minority_indices.sum()
        current_noise_true_ratio = round(noise_proportion / (1 - noise_proportion), 3)
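        # Example: with 100 true minority samples and 10 majority samples
        # flipped into the minority class, noise_proportion = 1 - 100/110
        # ~= 0.091 and current_noise_true_ratio ~= 0.091 / 0.909 ~= 0.1,
        # recovering the requested noise_true_ratio.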

        if detector.__class__ == LocalOutlierFactor:
            detector = LocalOutlierFactor(n_neighbors=5, contamination=min(noise_proportion, 0.5))
        else:
            detector.set_params(contamination=noise_proportion)
        noised_minority_X = train_X[noised_minority_indices]
        if noise_proportion > 0:
            start = time.time()
            if verbose:
                print(detector_name, "start detecting")

            # predict and update
            outlier_prediction = outlier_detection.omni_detector_detect(detector, noised_minority_X)

            expanded_outlier_prediction = np.ones(shape=(len(train_y)))
            expanded_outlier_prediction[noised_minority_indices] = outlier_prediction
            updated_y = update_by_outlier_prediction(noised_train_y, expanded_outlier_prediction)

            # collect stats info
            n_errors = (outlier_prediction != groundtruth[noised_minority_indices]).sum()
            detection_cf = confusion_matrix(groundtruth[noised_minority_indices], outlier_prediction)
            outlier_detection_confusion_matrix.append(np.ravel(detection_cf))
            total_time += time.time() - start
            detection_errors += n_errors
            if verbose:
                print(detector_name, "finish detecting", time.time() - start, n_errors)
        else:
            updated_y = train_y
            expanded_outlier_prediction = np.ones_like(train_y)
            detection_cf = confusion_matrix(groundtruth[noised_minority_indices], groundtruth[noised_minority_indices])
            outlier_detection_confusion_matrix.append([0, 0, 0, detection_cf[0][0]])

        trial_id = "{}-{}-{:.3f}".format(detector_name, id, noise_true_ratio)

        # train model
        lgb_model = model.fit(train_X, updated_y)

        # predict training set
        prediction_training = lgb_model.predict(train_X)

        # # save plot: based on updated_y
        # training_predicted_result = updated_y + prediction_training * 2
        # colors, _ = tool.category2color(training_predicted_result, {
        #     0: "#5079a5",
        #     1: "#dd565c",
        #     2: "#79b7b2",
        #     3: "#ef8e3b"
        # })
        # trial_id = "{}-{}-{:.4f}".format(detector_name, id, noise_true_ratio)
        # # save plots
        # path = join("figures", "gaussian-using-noised-label", trial_id + ".png")
        # simple_plot.save_plot2png(train_X[:, 0], train_X[:, 1], colors, path, noise_true_ratio)

        # # save plot: based on train_y
        # training_predicted_result = train_y + prediction_training * 2
        # colors, _ = tool.category2color(training_predicted_result, {
        #     0: "#5079a5",
        #     1: "#dd565c",
        #     2: "#79b7b2",
        #     3: "#ef8e3b"
        # })
        # # save plots
        # folder = join("figures", "gaussian-using-true-label", detector_name)
        # if not os.path.exists(folder):
        #     os.mkdir(folder)
        # path = join(folder, trial_id + ".png")
        # simple_plot.save_plot2png(train_X[:, 0], train_X[:, 1], colors, path, noise_true_ratio)

        # save detection and classification result
        all_info = np.concatenate((train_X,
                train_y[:, np.newaxis],
                groundtruth[:, np.newaxis],
                expanded_outlier_prediction[:, np.newaxis],
                prediction_training[:, np.newaxis]
            ),
            axis=1
        )
        result_root = os.path.join("outlier-result", log_identifier)
        if not os.path.exists(result_root):
            os.mkdir(result_root)
        np.savetxt(os.path.join(result_root, trial_id + ".csv"), all_info, delimiter=',', fmt='%d')

        # predict testing set
        predicted_proba = lgb_model.predict_proba(test_X)
        prediction_proba = predicted_proba[:, 1]
        prediction = np.where(prediction_proba > threshold, 1, 0)

        # metrics
        auc = roc_auc_score(test_y, prediction_proba)
        acc = accuracy_score(test_y, prediction)
        accs.append(acc)
        roc.append(auc)
        conf_mat = confusion_matrix(test_y, prediction)
        conf_mat = np.ravel(conf_mat)
        all_confusion_matrix.append(conf_mat)

    # aggregate metric results
    aggregated_conf_mat = np.array(all_confusion_matrix).mean(axis=0)
    aggregated_detection_conf_mat = np.array(outlier_detection_confusion_matrix).mean(axis=0)

    # current_noise_count = noise_proportion / (1 - noise_proportion)
    current_noise_true_ratio = noise_true_ratio
    roc_mean = np.array(roc).mean()
    acc_mean = np.array(accs).mean()
    print("average noise count", added_noise_count / repeat)
    majority_class = 1 - minority_class
    classification_conf_mat = np.array(aggregated_conf_mat).ravel().tolist()
    detection_conf_mat = np.array(aggregated_detection_conf_mat).ravel().tolist()
    metric_bundle = [current_noise_true_ratio, detector_name, roc_mean, acc_mean] + classification_conf_mat + detection_conf_mat

    file_log(log_identifier, *metric_bundle)
    return metric_bundle
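
The helpers add_noise_to_majority and update_by_outlier_prediction are defined elsewhere in the repository. Below is a minimal sketch of their assumed semantics, inferred only from how they are called above (groundtruth uses 1 for clean samples and -1 for injected noise, matching sklearn's outlier convention); the real implementations may differ.

def add_noise_to_majority(y, noise_count, random_state=0, verbose=False):
    # Assumed behaviour: flip `noise_count` randomly chosen majority-class
    # labels to the minority class, returning the noised labels plus a
    # ground-truth vector (1 = clean sample, -1 = injected noise).
    rng = np.random.RandomState(random_state)
    y = np.asarray(y)
    counter = Counter(y)
    minority_class = min(counter, key=counter.get)
    majority_class = max(counter, key=counter.get)
    flipped = rng.choice(np.where(y == majority_class)[0],
                         size=noise_count, replace=False)
    noised_y = y.copy()
    noised_y[flipped] = minority_class
    groundtruth = np.ones_like(y)
    groundtruth[flipped] = -1
    if verbose:
        print("injected", noise_count, "noisy labels")
    return noised_y, groundtruth


def update_by_outlier_prediction(noised_y, outlier_prediction):
    # Assumed behaviour: samples flagged as outliers (-1) are reverted to
    # the majority class; inliers (prediction 1) keep their current label.
    noised_y = np.asarray(noised_y)
    counter = Counter(noised_y)
    majority_class = max(counter, key=counter.get)
    updated_y = noised_y.copy()
    updated_y[np.asarray(outlier_prediction) == -1] = majority_class
    return updated_y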


if __name__ == '__main__':
    d = LocalOutlierFactor(
            n_neighbors=35,
            contamination=0.05)
    d.set_params(contamination=0.2)
    noise_ratio = 0.2
    seed = 40

    X, y = load_santander()
    n_samples, n_features = X.shape
    detectors = generate_detectors(n_samples, n_features, noise_ratio, random_state=seed)
    param = {
        'learning_rate': 0.1,
        'max_depth': 5,
        'n_estimators': 100,
        'num_leaves': 30
    }
    model = lgb.LGBMClassifier(
        boosting_type="gbdt",
        learning_rate=param["learning_rate"],