Exemplo n.º 1
0
def local_outlier_detection(training_vectors, test_vectors_clean,
                            test_vectors_anomalous):
    """Fit a novelty-mode Local Outlier Factor model and label three data sets.

    The model is fitted on ``training_vectors`` only and then used to
    predict labels for the training data itself and for both test sets.
    ``predict`` returns 1 for an inlier and -1 for an outlier.

    Args:
        training_vectors: Samples the model is fitted on.
        test_vectors_clean: Test samples expected to be normal.
        test_vectors_anomalous: Test samples expected to be anomalous.

    Returns:
        A tuple ``(result_clean, result_anomalous, result_training)`` holding
        the predicted labels for the clean test set, the anomalous test set
        and the training set, in that order.
    """
    print("Starting Local Outlier Fitting...")

    # novelty=True is what makes predict() available on unseen data.
    lof_settings = {
        'novelty': True,
        'contamination': 'auto',
        'algorithm': 'auto',
        'n_neighbors': 20,
        'n_jobs': -1,
    }
    detector = LocalOutlierFactor(**lof_settings)
    print("Fitting with Parameters: ", detector.get_params())
    detector.fit(training_vectors)
    # NOTE(review): these are novelty-mode predictions on the very samples
    # the model was fitted on - confirm that is the intended training score.
    result_training = detector.predict(training_vectors)

    print("Fitting successful!")
    print("Starting Prediction...")
    # Clean set first, then anomalous set - same order as the return tuple.
    result_clean, result_anomalous = [
        detector.predict(batch)
        for batch in (test_vectors_clean, test_vectors_anomalous)
    ]

    print("Predicting successful!")
    print("**************************")

    return result_clean, result_anomalous, result_training
Exemplo n.º 2
0
class Outlier(Intent):
    """Intent that flags outlying rows using a Local Outlier Factor model."""

    def __init__(self,
                 n_neighbors: int = 20,
                 contamination: Union[float, str] = 'auto') -> None:
        """Create the underlying ``LocalOutlierFactor`` classifier.

        Args:
            n_neighbors: Forwarded to ``LocalOutlierFactor``.
            contamination: Forwarded to ``LocalOutlierFactor``.
        """
        super().__init__()
        self.clf = LocalOutlierFactor(n_neighbors=n_neighbors,
                                      contamination=contamination)

    def to_string(self) -> str:
        """Return the label used as the name of the result column."""
        return 'Outlier'

    def compute(self, df: pd.DataFrame) -> pd.DataFrame:
        """Flag the outlying rows of ``df``.

        Only numeric columns are considered, and rows containing NaN in any
        of them are left out of the model. The remaining values are min-max
        scaled before being passed to the classifier.

        Args:
            df: Input data.

        Returns:
            A single-column frame (column name from ``to_string``) indexed
            like ``df``: 1 for rows predicted as outliers, 0 for every other
            row, including rows that were dropped because of NaN values.
        """
        column = self.to_string()
        numeric = df.select_dtypes(include=['number']).dropna()

        # Without any numeric column or any complete row there is nothing to
        # fit (the scaler would raise), so no row can be flagged.
        if numeric.empty:
            return pd.DataFrame(0, index=df.index, columns=[column])

        scaled = preprocessing.MinMaxScaler().fit_transform(numeric.values)

        # fit_predict labels outliers as -1; expose them as 1 and the rest
        # as 0.
        labels = self.clf.fit_predict(scaled)
        flags = pd.DataFrame(data=(labels == -1).astype(int),
                             index=numeric.index,
                             columns=[column])

        # Keep only the flagged rows, then expand back to the full input
        # index so that every unflagged or dropped row reads 0.
        return flags.loc[flags[column] == 1].reindex(index=df.index,
                                                     fill_value=0)

    def info(self) -> Optional[Dict[str, Any]]:
        """Return the intent type label and the classifier's parameters."""
        return {
            # NOTE(review): "Factory" looks like a typo for "Factor"; the
            # string is kept unchanged because consumers may match on it.
            "type": "Local Outlier Factory",
            "params": self.clf.get_params()
        }
Exemplo n.º 3
0
class LocalOutlierFactorFilter:
    """Outlier filter wrapping a ``LocalOutlierFactor`` model.

    Training and prediction happen in one step (``fit_predict``); there are
    no separate train and test interfaces.

    Key model parameters (set through ``set_params``):

    * ``n_neighbors`` (int, optional, default 20): number of points taking
      part in the prediction; no obvious tuning rule.
    * ``contamination``: reflects the filtering strength - the larger the
      value, the stronger the filtering.
    """

    def __init__(self, name="局部异常因子"):
        """Create the filter with a default-configured model.

        :param name: display name of the filter
        """
        self._model = LocalOutlierFactor()
        self.name = name

    def get_params(self, deep=True):
        """Return the parameters of the wrapped model.

        :param deep: forwarded to the model's ``get_params``
        :return: whatever the model's ``get_params`` returns
        """
        return self._model.get_params(deep=deep)

    def _get_valid_params(self):
        """Return the names of the parameters the model accepts.

        :return: list of parameter names
        """
        return list(self.get_params())

    def set_params(self, **new_params):
        """Set parameters on the wrapped model.

        Emits a warning when no parameters are passed.

        :param new_params: parameter names and values
        :raises ValueError: if any name is not a parameter of the model
        :return: None
        """
        # Look the valid names up once instead of once per argument.
        valid = set(self._get_valid_params())
        if any(key not in valid for key in new_params):
            raise ValueError("传入参数含有模型中不包含的参数")
        if not new_params:
            warnings.warn("模型参数未被修改")
        self._model.set_params(**new_params)

    def fit_predict(self, x):
        """Fit the model on ``x`` and predict a label for every sample.

        :param x: data to fit and label
        :return: the result of the model's ``fit_predict``
        """
        return self._model.fit_predict(x)

    def _connect_SQL(self, **json_file):
        """Open the SQL connection described by ``json_file['dbinfo']``.

        Reads the ``ip``, ``port``, ``username``, ``password`` and
        ``databasename`` entries and stores the connection on ``self._SQL``.

        :param json_file: job configuration
        :return: None
        """
        db_info = json_file['dbinfo']
        self._SQL = SQLServer(host=db_info['ip'],
                              port=db_info['port'],
                              user=db_info['username'],
                              pwd=db_info['password'],
                              db=db_info['databasename'])

    def get_data_label(self, **json_file):
        """Read the label column of the data set from the database.

        Uses ``self._SQL``, which is set by ``_connect_SQL``.

        :param json_file: job configuration (``dbinfo.inputtable`` and
            ``label_columns`` are read)
        :raises ValueError: if the result does not have exactly one column
        :return: the label data returned by the database reader
        """
        data_label = self._SQL.df_read_sqlserver(
            table=json_file['dbinfo']['inputtable'],
            cols=json_file['label_columns'])
        if data_label.shape[1] != 1:
            raise ValueError("错误:标签列数不为1")
        return data_label

    def get_data_features(self, **json_file):
        """Read the feature columns of the data set from the database.

        Uses ``self._SQL``, which is set by ``_connect_SQL``.

        :param json_file: job configuration (``dbinfo.inputtable`` and
            ``data_columns`` are read)
        :return: the feature data returned by the database reader
        """
        return self._SQL.df_read_sqlserver(
            table=json_file['dbinfo']['inputtable'],
            cols=json_file['data_columns'])

    def _fit_and_export(self, features, columns, config):
        """Fit/predict on ``features``, save the model and write the labels.

        Shared by every ``train_predict_from_*`` entry point.

        :param features: data to fit and label
        :param columns: feature column names recorded on the model
        :param config: job configuration (``model_path`` and ``save_path``
            are read)
        :return: single-column ``label`` DataFrame of predictions
        """
        # fit_predict returns a bare array; wrap it so it can be written
        # out as a CSV with a named column.
        pre = pd.DataFrame(self.fit_predict(features), columns=["label"])
        self._model.columns = columns
        # Temporary save. NOTE(review): save_model reports failure through
        # its return value, which is ignored here as in the original flow.
        self.save_model(config["model_path"])
        pre.to_csv(config["save_path"], index=False)
        return pre

    def train_predict_from_sql(self, **json_file):
        """Fit and predict on data read from SQL, then store the results.

        The model is saved, the labels are written to ``save_path`` as CSV
        and also written to the ``dbinfo.outputtable`` table.

        :param json_file: job configuration
        :return: ``{"info": <database write result>}`` on success, or a
            ``'failed,<error>'`` string on any error
        """
        try:
            self._connect_SQL(**json_file)
            self.set_params(**json_file["model_params"])
            features = self.get_data_features(**json_file)
            pre = self._fit_and_export(features,
                                       features.columns.values.tolist(),
                                       json_file)
            # NOTE(review): ``pre`` only holds a "label" column while
            # ``cols`` names the feature columns - confirm this is what
            # df_write_sqlserver expects.
            write = self._SQL.df_write_sqlserver(
                table=json_file['dbinfo']['outputtable'],
                df=pre,
                cols=json_file['data_columns'])
            return {"info": write}
        except Exception as e:
            # Boundary handler: failures are reported as a string.
            print(e)
            return 'failed,{e}'.format(e=e)

    def train_predict_from_csv(self, **json_file):
        """Fit and predict on data read from a CSV file.

        The model is saved and the labels are written to ``save_path``.

        :param json_file: job configuration (``path``, ``data_columns``,
            ``model_params``, ``model_path`` and ``save_path`` are read)
        :return: ``{"info": "success"}`` on success, or a
            ``'failed,<error>'`` string on any error
        """
        try:
            features = pd.read_csv(json_file["path"],
                                   usecols=json_file['data_columns'])
            self.set_params(**json_file["model_params"])
            self._fit_and_export(features, json_file['data_columns'],
                                 json_file)
            return {"info": "success"}
        except Exception as e:
            # Boundary handler: failures are reported as a string.
            print(e)
            return 'failed,{e}'.format(e=e)

    def train_predict_from_xls(self, **json_file):
        """Fit and predict on data read from an Excel file.

        The model is saved and the labels are written to ``save_path`` in
        CSV format.

        :param json_file: job configuration (``path``, ``data_columns``,
            ``model_params``, ``model_path`` and ``save_path`` are read)
        :return: ``{"info": "success"}`` on success, or a
            ``'failed,<error>'`` string on any error
        """
        try:
            features = pd.read_excel(json_file["path"],
                                     usecols=json_file['data_columns'])
            self.set_params(**json_file["model_params"])
            self._fit_and_export(features, json_file['data_columns'],
                                 json_file)
            return {"info": "success"}
        except Exception as e:
            # Boundary handler: failures are reported as a string.
            print(e)
            return 'failed,{e}'.format(e=e)

    def save_model(self, model_path):
        """Save the wrapped model to ``model_path``.

        :param model_path: path the model is written to
        :return: None on success, or a ``'failed,<error>'`` string on error
        """
        try:
            joblib.dump(self._model, model_path)
        except Exception as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def get_model(self):
        """Return the wrapped model.

        :return: the model, or a ``'failed,<error>'`` string if no model
            has been set on this instance
        """
        try:
            return self._model
        except AttributeError as e:
            print(e)
            return 'failed,{e}'.format(e=e)

    def load_model(self, **json_file):
        """Replace the wrapped model with one loaded from disk.

        :param json_file: job configuration (``model_path`` is read)
        :return: None
        """
        model_path = json_file['model_path']
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code - only load model files from trusted sources.
        self._model = joblib.load(model_path)