Exemplo n.º 1
0
def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    """Fit a TF-IDF + SelectKBest + classifier pipeline on the sentiment
    dataset, then store the pipeline as a pickle and its predictions as CSV.

    Parameters:
    classifier: estimator fitted as the final pipeline step.
    name: basename for the stored "<name>.pkl" and "<name>.csv" artifacts.
    with_proba: when truthy, append class probability columns to the CSV.
    **pmml_options: PMML conversion options forwarded to pipeline.configure().
    """
    pipeline = PMMLPipeline([
        ("tf-idf",
         TfidfVectorizer(
             analyzer="word",
             preprocessor=None,
             strip_accents=None,
             lowercase=True,
             token_pattern=None,
             tokenizer=Splitter(),
             stop_words="english",
             ngram_range=(1, 2),
             norm=None,
             # Random forests are trained on float32 features to halve memory use
             dtype=(numpy.float32 if isinstance(
                 classifier, RandomForestClassifier) else numpy.float64))),
        ("selector", SelectKBest(f_classif, k=500)), ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba:  # idiomatic truthiness check instead of "== True"
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X),
                                columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
Exemplo n.º 2
0
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit a missing-value-aware mapper + regressor pipeline on the auto-NA
	dataset and store the pipeline plus its "mpg" predictions.

	Parameters:
	regressor: estimator fitted as the final pipeline step; for a
		DecisionTreeRegressor, per-node impurities and leaf node ids are
		additionally exported.
	name: basename passed to store_pkl / store_csv.
	predict_transformer, apply_transformer: optional post-processing
		transformers forwarded to the PMMLPipeline constructor.
	**pmml_options: PMML conversion options forwarded to pipeline.configure().
	"""
	# Per-column preprocessing; -1 (categoricals) and NaN (continuous) mark missing values
	mapper = DataFrameMapper(
		[([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
		[(["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()])] +
		# Bin acceleration first, then impute the binned category
		[(["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()])] +
		# Impute displacement first, then bin into labeled size classes
		[(["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
		# Clamp outliers to the declared extreme values before median imputation
		[(["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")])] +
		[(["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	], predict_transformer = predict_transformer, apply_transformer = apply_transformer)
	pipeline.fit(auto_na_X, auto_na_y)
	if isinstance(regressor, DecisionTreeRegressor):
		# Export non-zero node impurities as a PMML extension keyed by the split criterion
		tree = regressor.tree_
		node_impurity = {node_idx : tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
		pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
	pipeline.configure(**pmml_options)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	if isinstance(regressor, DecisionTreeRegressor):
		# Also record which leaf node each sample lands in
		Xt = pipeline_transform(pipeline, auto_na_X)
		mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
		mpg = pandas.concat((mpg, mpg_apply), axis = 1)
	store_csv(mpg, name)
Exemplo n.º 3
0
def build_auto(regressor, name, **pmml_options):
	"""Fit a feature-mapper + SelectUnique + regressor pipeline on the auto
	dataset, verify it on a 5% sample, and store the pickle plus predictions."""
	# Joint (cylinders, origin) level -> merged category label
	cyl_origin_levels = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(6, 3) : "6/3",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cyl_origin_levels, default_value = "other"), LabelBinarizer()]),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	sample = auto_X.sample(frac = 0.05, random_state = 13)
	if isinstance(regressor, XGBRegressor):
		# XGBoost predicts in float32, so relax the verification tolerances
		pipeline.verify(sample, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(sample)
	store_pkl(pipeline, name)
	store_csv(DataFrame(pipeline.predict(auto_X), columns = ["mpg"]), name)
Exemplo n.º 4
0
    def __init__(self):
        """Train a TF-IDF + logistic-regression query classifier and export it to PMML."""
        # Load benign and malicious query samples
        good_query_list = self.get_query_list('goodqueries.txt')
        bad_query_list = self.get_query_list('badqueries.txt')

        # Label the data: 0 = benign, 1 = malicious
        good_y = [0] * len(good_query_list)
        bad_y = [1] * len(bad_query_list)

        queries = good_query_list + bad_query_list
        y = good_y + bad_y

        # Vectorize raw query strings into TF-IDF n-gram features
        self.vectorizer = TfidfVectorizer(tokenizer=self.get_ngrams)
        X = self.vectorizer.fit_transform(queries)

        # Hold out data for accuracy testing.
        # NOTE(review): test_size=20 is an absolute count of 20 samples,
        # not a 20% fraction -- confirm this is intentional
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=20,
                                                            random_state=42)

        # Logistic regression wrapped in a PMML-exportable pipeline
        self.lgs = PMMLPipeline([('LogisticModer',
                                  LogisticRegression(solver='liblinear'))])
        self.lgs.fit(X_train, y_train)

        # Report held-out accuracy
        print('模型准确度:{}'.format(self.lgs.score(X_test, y_test)))

        # Raw string keeps the identical path bytes while fixing the
        # invalid "\l" escape-sequence warning of the original literal
        sklearn2pmml(self.lgs, r'.\lgs.pmml', with_repr=True)
Exemplo n.º 5
0
def submodel_evaluation(train_data, valid_data, model_list,
                        category_feature, numeric_feature):
    """Fit each candidate model in a mapper + classifier pipeline and report
    KS, AUC and accuracy on the validation split.

    Returns a dict mapping each model's class name to [ks, auc, accuracy].
    """
    feature_columns = category_feature + numeric_feature
    X_train, y_train = train_data[feature_columns], train_data['user_type']
    X_valid, y_valid = valid_data[feature_columns], valid_data['user_type']

    mapper = feature_union(category_feature, numeric_feature)
    results = {}
    for candidate in model_list:
        candidate_name = candidate.__class__.__name__
        print('model %s evaluation' % candidate_name)

        pipeline = PMMLPipeline([
            ('mapper', mapper),
            ('classifier', candidate)
        ])
        pipeline.fit(X_train, y_train)
        # Positive-class probability for KS/AUC, hard labels for accuracy
        proba = pipeline.predict_proba(X_valid)[:, 1]
        labels = pipeline.predict(X_valid)
        ks = plot_ks_curve(proba, valid_data['user_type'])
        auc = roc_auc_score(y_valid, proba)
        accuracy = metrics.accuracy_score(y_valid, labels)
        results[candidate_name] = [ks, auc, accuracy]
    return results
Exemplo n.º 6
0
def main(input_data, output_data, model_dest):
    """Train a random-forest approval classifier, dump it, and export a PMML
    version declaring the input feature columns and the "APPROVED" target."""
    logger = logging.getLogger(__name__)
    logger.info("Loading input and output data")
    inputs = pd.read_csv(input_data)
    outputs = pd.read_csv(output_data)
    # 60/40 train/test split with a fixed seed for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(
        inputs, outputs, test_size=0.4, random_state=23
    )
    model = RandomForestClassifier(verbose=True, max_depth=6, n_jobs=-1)
    logger.info("Fitting model")
    model.fit(X_train, y_train)
    logger.info("Saving model")
    dump(model, model_dest)

    pipeline = PMMLPipeline([("classifier", model)])

    feature_columns = [
        "FLAG_OWN_CAR",
        "FLAG_OWN_REALTY",
        "CNT_CHILDREN",
        "AMT_INCOME_TOTAL",
        "AGE",
        "DAYS_EMPLOYED",
        "FLAG_WORK_PHONE",
    ]
    logger.info("Saving PMML model")
    skl_to_pmml(pipeline, feature_columns, "APPROVED", model_dest + ".pmml")
Exemplo n.º 7
0
def build_auto(regressor, name, **pmml_options):
	"""Fit a mapper + regressor pipeline on the auto dataset and store the
	PMML model along with predictions (or outlier flags for IsolationForest)."""
	cat_columns = ["cylinders", "model_year", "origin"]
	cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
	# LightGBM consumes label-encoded categoricals; other estimators need binarized ones
	if isinstance(regressor, LGBMRegressor):
		cat_transform = label_encoder
	else:
		cat_transform = label_binarizer
	cat_mappings = [([column], [cat_domain(name), cat_transform(name)]) for column in cat_columns]
	cont_mappings = [([column], [cont_domain(name)]) for column in cont_columns]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
		("regressor", regressor)
	])
	if isinstance(regressor, LGBMRegressor):
		pipeline.fit(auto_X, auto_y, regressor__categorical_feature = [0, 1, 2])
	elif isinstance(regressor, IsolationForest):
		# Unsupervised estimator: fit without a target
		pipeline.fit(auto_X)
	else:
		pipeline.fit(auto_X, auto_y)
	sample = auto_X.sample(n = 3, random_state = 13)
	if isinstance(regressor, XGBRegressor):
		# XGBoost predicts in float32, so relax the verification tolerances
		pipeline.verify(sample, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(sample)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	if isinstance(regressor, IsolationForest):
		decision_function = DataFrame(pipeline.decision_function(auto_X), columns = ["decisionFunction"])
		outlier = DataFrame(pipeline.predict(auto_X), columns = ["outlier"])
		# Map the -1/+1 prediction to "true"/"false" outlier flags
		outlier['outlier'] = outlier['outlier'].apply(lambda value: str(bool(value == -1)).lower())
		store_csv(pandas.concat((decision_function, outlier), axis = 1), name)
	else:
		store_csv(DataFrame(pipeline.predict(auto_X), columns = ["mpg"]), name)
  def test_sklearn2pmml_multiclass_ovr(self):
    """Round-trip an OvR logistic regression through PMML and check that the
    re-imported model reproduces the reference probabilities."""
    data = load_iris(as_frame=True)
    X, y = data.data, data.target
    y.name = "Class"

    ref = LogisticRegression(multi_class='ovr')
    ref.fit(X, y)

    # Export the fitted classifier to PMML
    pipeline = PMMLPipeline([("classifier", ref)])
    pipeline.fit(X, y)
    sklearn2pmml(pipeline, "lmc-sklearn2pmml.pmml", with_repr=True)

    try:
      # Re-import the PMML document as a scikit-learn compatible model
      model = PMMLLogisticRegression(pmml='lmc-sklearn2pmml.pmml')
      assert np.allclose(
        ref.predict_proba(X),
        model.predict_proba(X)
      )
    finally:
      # Always remove the exported file, even on assertion failure
      remove("lmc-sklearn2pmml.pmml")
Exemplo n.º 9
0
def build_auto_h2o(regressor, name):
    """Fit an H2O regressor on the auto dataset via a ColumnTransformer +
    H2OFrameCreator pipeline, then store the MOJO, pickle and predictions.

    Parameters:
    regressor: H2O estimator; its MOJO is stored as "<name>.zip".
    name: basename for the ".zip", ".pkl" and ".csv" artifacts.
    """
    # Declare per-column domains: categorical for cylinders/model_year/origin,
    # continuous for the four numeric measurements
    transformer = ColumnTransformer(
        [(column, CategoricalDomain(), [column])
         for column in ["cylinders", "model_year", "origin"]] +
        [(column, ContinuousDomain(), [column]) for column in
         ["displacement", "horsepower", "weight", "acceleration"]])
    # H2OFrameCreator turns the transformed matrix into an H2OFrame; the
    # name/type lists must match the transformer's output column order above
    pipeline = PMMLPipeline([("transformer", transformer),
                             ("uploader",
                              H2OFrameCreator(column_names=[
                                  "cylinders", "model_year", "origin",
                                  "displacement", "horsepower", "weight",
                                  "acceleration"
                              ],
                                              column_types=[
                                                  "enum", "enum", "enum",
                                                  "numeric", "numeric",
                                                  "numeric", "numeric"
                                              ])), ("regressor", regressor)])
    # The target must itself be wrapped in an H2OFrame
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    regressor = pipeline._final_estimator
    store_mojo(regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")
    # predict returns an H2OFrame; rename its single column before export
    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
Exemplo n.º 10
0
def build_iris(classifier, name, **pmml_options):
    """Fit a ContinuousDomain mapper + classifier pipeline on the iris dataset,
    verify it on three rows, and store the PMML model plus predictions."""
    feature_columns = [
        "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"
    ]
    mapper = DataFrameMapper([([column], ContinuousDomain())
                              for column in feature_columns])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    sample = iris_X.sample(n=3, random_state=13)
    if isinstance(classifier, XGBClassifier):
        # XGBoost predicts in float32, so relax the verification tolerances
        pipeline.verify(sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    proba_columns = [
        "probability(setosa)",
        "probability(versicolor)",
        "probability(virginica)"
    ]
    species_proba = DataFrame(pipeline.predict_proba(iris_X),
                              columns=proba_columns)
    store_csv(pandas.concat((species, species_proba), axis=1), name)
Exemplo n.º 11
0
def main(input_data, output_data, model_dest):
    """Train a random-forest regressor on CSV data and save it in joblib and
    PMML formats.

    Parameters:
    input_data: path to the CSV of input features.
    output_data: path to the CSV of target values.
    model_dest: basename for the ".joblib" and ".pmml" artifacts.
    """
    logger = logging.getLogger(__name__)
    logger.info("Loading input and output data")
    inputs = pd.read_csv(input_data)
    outputs = pd.read_csv(output_data)
    X_train, X_test, y_train, y_test = train_test_split(inputs,
                                                        outputs,
                                                        test_size=0.4,
                                                        random_state=23)

    model = RandomForestRegressor(verbose=True, n_jobs=-1)

    logger.info("Fitting model")
    # The original fitted the model twice back-to-back; one fit is sufficient
    model.fit(X_train, y_train)

    logger.info("Saving joblib model")
    dump(model, model_dest + ".joblib")

    # NOTE(review): the step is named "classifier" although the estimator is a
    # regressor; the name is kept since it is embedded in existing PMML output
    pipeline = PMMLPipeline([("classifier",
                              RandomForestRegressor(verbose=True, n_jobs=-1))])
    pipeline.fit(X_train, y_train)
    # NOTE(review): verify() samples without a random_state, so the embedded
    # verification data differs between runs -- confirm this is acceptable
    pipeline.verify(X_test.sample(n=10))

    logger.info("Saving PMML model")
    sklearn2pmml(pipeline, model_dest + ".pmml")
Exemplo n.º 12
0
def build_audit(classifier, name, **pmml_options):
	"""Fit a mapper + classifier pipeline on the audit dataset and store the
	PMML model plus predictions and class probabilities.

	Parameters:
	classifier: estimator fitted as the final pipeline step; LGBMClassifier
		and XGBClassifier receive estimator-specific handling.
	name: basename passed to store_pmml / store_csv and to the domain helpers.
	**pmml_options: PMML conversion options forwarded to pipeline.configure().
	"""
	# Single isinstance branch decides both the column split and the
	# categorical transformer (the original checked LGBMClassifier twice)
	if isinstance(classifier, LGBMClassifier):
		# LightGBM treats "Age" as categorical and consumes label-encoded values
		cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Income", "Hours"]
		cat_mappings = [([cat_column], [cat_domain(name), label_encoder(name)]) for cat_column in cat_columns]
	else:
		cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Age", "Income", "Hours"]
		cat_mappings = [([cat_column], [cat_domain(name), label_binarizer(name)]) for cat_column in cat_columns]
	cont_mappings = [([cont_column], cont_domain(name)) for cont_column in cont_columns]
	mapper = DataFrameMapper(cat_mappings + cont_mappings)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	if isinstance(classifier, LGBMClassifier):
		pipeline.fit(audit_X, audit_y, classifier__categorical_feature = [0, 1, 2, 3, 4, 5])
	else:
		if isinstance(classifier, XGBClassifier) and name == "XGBoostAuditNA":
			# XGBoost requires a numeric "Age" column for this dataset variant
			audit_X["Age"] = audit_X["Age"].astype(float)
		pipeline.fit(audit_X, audit_y)
	if isinstance(classifier, XGBClassifier):
		# XGBoost predicts in float32, so relax the verification tolerances
		pipeline.verify(audit_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(n = 3, random_state = 13))
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
Exemplo n.º 13
0
def build_auto(regressor, name, fit_params = {}, predict_params = {}, **pmml_options):
	"""Fit a heavily feature-engineered mapper + SelectUnique + regressor
	pipeline on the auto dataset and store the pickle plus "mpg" predictions.

	Parameters:
	regressor: estimator fitted as the final pipeline step.
	name: basename passed to store_pkl / store_csv.
	fit_params: extra keyword arguments forwarded to pipeline.fit().
	predict_params: extra keyword arguments forwarded to predict/verify.
	**pmml_options: PMML conversion options forwarded to pipeline.configure().

	NOTE(review): fit_params/predict_params use mutable dict defaults; they
	are only read here, but callers should not rely on mutating them.
	"""
	# Joint (cylinders, origin) level -> merged category label; unseen pairs map to "other"
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		# Flag odd cylinder counts via a modulo expression
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		# Build a "19XX-01-01" date from the two-digit model year, count days since 1977, then binarize
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		# One-hot the concatenated year/origin pair, then prune weak dummies by RF importance
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		# Scale only horsepower and acceleration domains; the other two pass through
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		# XGBoost predicts in float32, so relax the verification tolerances
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
Exemplo n.º 14
0
def make_pipeline_model(numeric_feature,
                        category_feature,
                        estimator,
                        X=None,
                        y=None):
    """Build a PMML pipeline from the given numeric/categorical feature lists
    and estimator; fit it when both X and y are supplied.

    numeric_feature: list of numeric column names
    category_feature: list of categorical column names
    estimator: scikit-learn compatible classifier (final pipeline step)
    X: optional pandas.DataFrame of features
    y: optional pandas.Series of targets

    return: the (optionally fitted) PMMLPipeline
    """
    # Categorical columns: domain declaration, imputation, then binarization
    categorical_defs = gen_features(
        columns=category_feature,
        classes=[CategoricalDomain, CategoricalImputer, LabelBinarizer])
    # Numeric columns: domain declaration, mean imputation, then scaling
    numeric_mapper = DataFrameMapper([(numeric_feature, [
        ContinuousDomain(),
        SimpleImputer(strategy='mean'),
        StandardScaler()
    ])])
    categorical_mapper = DataFrameMapper(categorical_defs)
    mapper = FeatureUnion([('mapper_numerical', numeric_mapper),
                           ('mapper_category', categorical_mapper)])
    pipeline_model = PMMLPipeline([('mapper', mapper),
                                   ('classifier', estimator)])
    if X is not None and y is not None:
        pipeline_model.fit(X, y)
    return pipeline_model
Exemplo n.º 15
0
def build_iris_vec(classifier, name):
	"""Fit a bare classifier pipeline on the iris dataset and store the PMML
	model together with predicted species and per-class probabilities."""
	pipeline = PMMLPipeline([("classifier", classifier)])
	pipeline.fit(iris_X, iris_y)
	store_pmml(pipeline, name)
	predictions = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	proba_columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"]
	probabilities = DataFrame(pipeline.predict_proba(iris_X), columns = proba_columns)
	store_csv(pandas.concat((predictions, probabilities), axis = 1), name)
Exemplo n.º 16
0
def build_auto_isotonic(regressor, auto_isotonic_X, name):
	"""Fit a single-step regressor pipeline on the isotonic auto features,
	verify it on a 5% sample, and store the pickle plus "mpg" predictions."""
	pipeline = PMMLPipeline([("regressor", regressor)])
	pipeline.fit(auto_isotonic_X, auto_y)
	pipeline.verify(auto_isotonic_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	store_csv(DataFrame(pipeline.predict(auto_isotonic_X), columns = ["mpg"]), name)
Exemplo n.º 17
0
def model_to_pmml(model, X_train, y_train, pmml_path="LogisticRegressionPowerPV.pmml"):
    """Fit *model* inside a single-step PMML pipeline and export it.

    Parameters:
    model: scikit-learn compatible estimator to fit and export.
    X_train, y_train: training features and targets.
    pmml_path: destination PMML file; defaults to the previously hard-coded
        "LogisticRegressionPowerPV.pmml" for backward compatibility.
    """
    from sklearn2pmml.pipeline import PMMLPipeline
    power_pipeline = PMMLPipeline([("classifier", model)])

    power_pipeline.fit(X_train, y_train)
    from sklearn2pmml import sklearn2pmml
    sklearn2pmml(power_pipeline, pmml_path, with_repr=True)
Exemplo n.º 18
0
def build_audit(mapper, classifier, name, **pmml_options):
    """Fit the given mapper + classifier pipeline on the audit dataset and
    store the pickle plus predictions with class probabilities."""
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    predictions = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    probabilities = DataFrame(pipeline.predict_proba(audit_X),
                              columns=["probability(0)", "probability(1)"])
    store_csv(pandas.concat((predictions, probabilities), axis=1), name)
Exemplo n.º 19
0
    def auto_build(self):
        """
        Populates the output_dir path with model artifacts and evaluation charts.

        End-to-end flow: prepare data, optional EDA, build the categorical
        transformer, optional hyperparameter tuning, SHAP plots, fit a
        LightGBM pipeline, evaluate it against a base-rate benchmark, suggest
        features to remove, and save the model.
        """
        self._gen_model_dir()

        self._process_csv()

        # Populates self.X_train / self.X_test / self.y_train / self.y_test
        self._prepare_X_y()

        if self.eda_flag:
            logger.info("EDA")
            utils.dataset_eda(data=self.X_train,
                              output_dir=self.output_dir_path)

        # Builds self.mapper used by the pipeline below
        self._create_categorical_transformer()

        if self.tune_flag:
            # Updates self.lgb_params -- presumably via hyperparameter search;
            # confirm against _tune()
            self._tune()

        self._generate_shap_plots()

        logger.info("creating pipeline")
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        self.pipeline = PMMLPipeline([("mapper", self.mapper),
                                      ("classifier", classifier)])

        self.pipeline.fit(self.X_train, self.y_train)

        logger.info("Assessing model")

        # Positive-class probabilities vs. a constant base-rate benchmark
        y_pred = self.pipeline.predict_proba(self.X_test)[:, 1]
        y_bm = np.repeat(self.y_train.mean(), self.y_test.shape)
        utils.evaluate_model(self.y_test, y_pred, y_bm, self.output_dir_path,
                             "Model")

        logger.info("suggeting features to remove")
        # Low-importance or highly-correlated features are removal candidates
        self.cols_to_remove = utils.find_features_to_remove(
            importance=self.feature_importance,
            X=self.X_train,
            importance_cutoff=self.importance_cutoff,
            corr_threshold=self.corr_cutoff,
        )
        logger.info(f"candidates to remove - {self.cols_to_remove}")

        logger.info(f"saving model \n{self.output_dir_path}")

        self._save_model()
        # Smoke-test: score the first held-out row and log inputs + output
        test_input = dict(self.X_test.iloc[0])
        test_score = self.pipeline.predict_proba(self.X_test.head(1))
        logger.info(
            f"test-case model inputs \n{ test_input } \n model score \n {test_score}"
        )

        logger.info("done!")
Exemplo n.º 20
0
def xgboost_auto():
    """Fit an XGBoost regressor on the auto dataset and export a non-compact PMML file."""
    mapper = make_xgboost_dataframe_mapper(auto_X.dtypes,
                                           missing_value_aware=False)
    regressor = XGBRegressor(n_estimators=31, max_depth=3, random_state=13)
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    # Keep the full (non-compacted) tree structure in the PMML document
    pipeline.configure(compact=False)
    sklearn2pmml(pipeline, "pmml/XGBoostAuto.pmml", with_repr=True)
Exemplo n.º 21
0
def xgboost_audit():
    """Fit an XGBoost classifier on the audit dataset and export a compact PMML file."""
    mapper = make_xgboost_dataframe_mapper(audit_X.dtypes,
                                           missing_value_aware=False)
    classifier = XGBClassifier(n_estimators=71, max_depth=5, random_state=13)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    # Compact the tree structure in the exported PMML document
    pipeline.configure(compact=True)
    sklearn2pmml(pipeline, "pmml/XGBoostAudit.pmml", with_repr=True)
Exemplo n.º 22
0
	def test_predict_proba_transform(self):
		"""predict_proba_transform must append the log-transformed probabilities
		to the raw ones returned by predict_proba."""
		pipeline = PMMLPipeline([("estimator", DummyClassifier(strategy = "prior"))], predict_proba_transformer = FunctionTransformer(numpy.log))
		X = DataFrame([1.0, 1.0, 1.0, 1.0, 1.0, 1.0], columns = ["x"])
		y = Series(["green", "red", "yellow", "green", "red", "green"], name = "y")
		pipeline.fit(X, y)
		self.assertEqual(["green", "red", "yellow"], pipeline._final_estimator.classes_.tolist())
		# Prior class frequencies: green 3/6, red 2/6, yellow 1/6
		priors = [3 / 6.0, 2 / 6.0, 1 / 6.0]
		log_priors = [numpy.log(p) for p in priors]
		self.assertEqual([priors] * 6, pipeline.predict_proba(X).tolist())
		self.assertEqual([priors + log_priors] * 6, pipeline.predict_proba_transform(X).tolist())
Exemplo n.º 23
0
 def test_configure(self):
     """configure() without arguments must leave the estimator untouched;
     keyword arguments must be recorded in its pmml_options_ dict."""
     regressor = DecisionTreeRegressor()
     pipeline = PMMLPipeline([("regressor", regressor)])
     self.assertFalse(hasattr(regressor, "pmml_options_"))
     # A no-op configure must not create the options attribute
     pipeline.configure()
     self.assertFalse(hasattr(regressor, "pmml_options_"))
     pipeline.configure(compact=True, flat=True)
     self.assertTrue(hasattr(regressor, "pmml_options_"))
     options = regressor.pmml_options_
     self.assertEqual(True, options["compact"])
     self.assertEqual(True, options["flat"])
Exemplo n.º 24
0
def build_apollo(mapper, name):
    """Fit a mapper + decision-tree pipeline predicting the "success" column
    of the shared df, then store the pickle and predictions with probabilities."""
    pipeline = PMMLPipeline([("mapper", mapper),
                             ("classifier", DecisionTreeClassifier())])
    pipeline.fit(df, df["success"])
    store_pkl(pipeline, name)
    predictions = DataFrame(pipeline.predict(df), columns=["success"])
    probabilities = DataFrame(
        pipeline.predict_proba(df),
        columns=["probability(false)", "probability(true)"])
    store_csv(pandas.concat((predictions, probabilities), axis=1), name)
Exemplo n.º 25
0
def build_auto_opt(regressor, name, fit_params = None, **pmml_options):
	"""Fit a single-step regressor pipeline on the training subset of the auto
	dataset, verify it on a 5% sample, and store the pickle plus full-dataset
	"mpg" predictions.

	Parameters:
	regressor: estimator fitted as the only pipeline step.
	name: basename passed to store_pkl / store_csv.
	fit_params: optional keyword arguments forwarded to pipeline.fit();
		defaults to None instead of a shared mutable dict.
	**pmml_options: accepted for signature compatibility.
		NOTE(review): never applied (no pipeline.configure call) -- confirm
		whether that omission is intentional.
	"""
	fit_params = {} if fit_params is None else fit_params
	pipeline = PMMLPipeline([
		("regressor", regressor)
	])
	pipeline.fit(auto_X[auto_train_mask], auto_y[auto_train_mask], **fit_params)
	if isinstance(regressor, XGBRegressor):
		# XGBoost predicts in float32, so relax the verification tolerances
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)
Exemplo n.º 26
0
def build_audit_dict(classifier, name, with_proba = True):
	"""Fit a DictVectorizer + classifier pipeline on the dict-encoded audit
	dataset and store the pickle plus predictions.

	Parameters:
	classifier: estimator fitted on the vectorized features.
	name: basename passed to store_pkl / store_csv.
	with_proba: when truthy, append class probability columns to the CSV.
	"""
	pipeline = PMMLPipeline([
		("dict-transformer", DictVectorizer()),
		("classifier", classifier)
	])
	pipeline.fit(audit_dict_X, audit_y)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_dict_X), columns = ["Adjusted"])
	if with_proba:  # idiomatic truthiness check instead of "== True"
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_dict_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Exemplo n.º 27
0
def make_fit_lgbmlr(gbdt, lr):
    """Fit a GBDT + LR cascade on the shared dataframe and return the pipeline.

    Categorical columns are label-encoded and flagged as categorical for the
    GBDT stage; continuous columns pass through a ContinuousDomain.
    """
    cat_mappings = [([column], [CategoricalDomain(), LabelEncoder()])
                    for column in cat_columns]
    mapper = DataFrameMapper(cat_mappings + [(cont_columns, ContinuousDomain())])
    classifier = GBDTLRClassifier(gbdt, lr)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    # The first len(cat_columns) mapped columns are the categorical ones
    pipeline.fit(df[cat_columns + cont_columns],
                 df[label_column],
                 classifier__gbdt__categorical_feature=range(0, len(cat_columns)))
    return pipeline
Exemplo n.º 28
0
def lightgbm_auto():
    """Fit a LightGBM regressor on the auto dataset and export a compact PMML file."""
    mapper, categorical_feature = make_lightgbm_dataframe_mapper(
        auto_X.dtypes, missing_value_aware=False)
    regressor = LGBMRegressor(n_estimators=31, max_depth=5, random_state=13)
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    # Tell LightGBM which mapped columns are categorical
    pipeline.fit(auto_X,
                 auto_y,
                 regressor__categorical_feature=categorical_feature)
    pipeline.configure(compact=True)
    sklearn2pmml(pipeline, "pmml/LightGBMAuto.pmml", with_repr=False)
Exemplo n.º 29
0
	def test_predict_transform(self):
		"""predict_transform must append the transformer outputs (identity and
		log10 of the prediction) to the raw predict column."""
		transformer = FeatureUnion([
			("identity", FunctionTransformer(None)),
			("log10", FunctionTransformer(numpy.log10))
		])
		pipeline = PMMLPipeline([("estimator", DummyRegressor())], predict_transformer = transformer)
		X = DataFrame([[1, 0], [2, 0], [3, 0]], columns = ["X1", "X2"])
		y = Series([0.5, 1.0, 1.5], name = "y")
		pipeline.fit(X, y)
		# DummyRegressor predicts the mean target (1.0) for every row
		mean_pred = [1.0, 1.0, 1.0]
		expected_row = [1.0, 1.0, numpy.log10(1.0)]
		self.assertEqual(mean_pred, pipeline.predict(X).tolist())
		self.assertEqual([expected_row for i in range(0, 3)], pipeline.predict_transform(X).tolist())
Exemplo n.º 30
0
def lightgbm_audit():
    """Fit a LightGBM classifier on the audit dataset and export a compact PMML file."""
    mapper, categorical_feature = make_lightgbm_dataframe_mapper(
        audit_X.dtypes, missing_value_aware=False)
    classifier = LGBMClassifier(n_estimators=71, max_depth=7, random_state=13)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    # Tell LightGBM which mapped columns are categorical
    pipeline.fit(audit_X,
                 audit_y,
                 classifier__categorical_feature=categorical_feature)
    pipeline.configure(compact=True)
    sklearn2pmml(pipeline, "pmml/LightGBMAudit.pmml", with_repr=False)