Example #1
def build_sentiment(classifier, name, with_proba=True):
    pipeline = PMMLPipeline([
        ("tf-idf",
         TfidfVectorizer(
             analyzer="word",
             preprocessor=None,
             strip_accents=None,
             lowercase=True,
             token_pattern=None,
             tokenizer=Splitter(),
             stop_words="english",
             ngram_range=(1, 2),
             norm=None,
             dtype=(numpy.float32 if isinstance(
                 classifier, RandomForestClassifier) else numpy.float64))),
        ("selector", SelectorProxy(SelectPercentile(chi2, percentile=10))),
        ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba:
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X),
                                columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
Example #2
def build_audit(classifier, name, with_proba=True):
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(EstimatorProxy(
                DecisionTreeClassifier(random_state=13)),
                            threshold="1.25 * mean")
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(
                SelectFromModel(EstimatorProxy(
                    RandomForestClassifier(random_state=13, n_estimators=3)),
                                threshold="median"))
        ]), ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(),
                        SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()), ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()), ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example #3
def build_iris(classifier, name, with_proba=True):
    pipeline = PMMLPipeline([
        ("union",
         FeatureUnion([("normal_scale",
                        DataFrameMapper([
                            (iris_X.columns.values, ContinuousDomain()),
                        ])),
                       ("log_scale",
                        DataFrameMapper([(iris_X.columns.values,
                                          FunctionTransformer(numpy.log10))]))
                       ])), ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(iris_X),
                                  columns=[
                                      "probability(setosa)",
                                      "probability(versicolor)",
                                      "probability(virginica)"
                                  ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
Example #4
def build_audit(classifier, name, with_proba = True, **kwargs):
	continuous_mapper = DataFrameMapper([
		("Age", ContinuousDomain()),
		("Income", ContinuousDomain()),
		("Hours", ContinuousDomain())
	])
	categorical_mapper = DataFrameMapper([
		("Employment", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state = 13))))]),
		("Education", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state = 13, n_estimators = 3)), threshold = "1.25 * mean"))]),
		("Marital", [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectKBest(k = 3))]),
		("Gender", [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
		("Deductions", [CategoricalDomain(), LabelEncoder()]),
	])
	pipeline = PMMLPipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y)
	customize(classifier, **kwargs)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name + ".csv")
Example #5
def build_audit_dict(classifier, name, with_proba=True):
    pipeline = PMMLPipeline([("dict-transformer", DictVectorizer()),
                             ("classifier", classifier)])
    pipeline.fit(audit_dict_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_dict_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_dict_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example #6
class XgbModel(object):
    def __init__(self, train, train_label, test, test_label):
        self.__train = train
        self.__train_label = train_label
        self.__test = test
        self.__test_label = test_label
        self.__bst = None
        self.__feat_imp = None
        self.__test_preds = None
        self.__test_predictions = None
        self.__output = None

    def train(self):
        self.__bst = XGBClassifier(objective="binary:logistic")
        self.__bst = PMMLPipeline([("estimator", self.__bst)])
        self.__bst.fit(self.__train,
                       self.__train_label,
                       estimator__eval_metric="auc")

    def predict(self):
        self.__test_preds = self.__bst.predict_proba(self.__test)[:, 1]
        self.__test_predictions = self.__bst.predict(self.__test)

    def feature_importances(self):
        # The pipeline itself exposes no feature_importances_; read it from
        # the wrapped XGBClassifier step.
        self.__feat_imp = (pd.Series(
            self.__bst.named_steps["estimator"].feature_importances_,
            ["gbc", "rf", "ab", "lr"]).sort_values(ascending=False))
        self.__feat_imp.plot(kind="bar", title="Feature Importances")
        plt.ylabel("Feature Importance Score")
        plt.show()

    def evaluate(self):
        print("auc : %.4f" %
              roc_auc_score(self.__test_label, self.__test_preds))
        print("accuracy score : %.4f" %
              accuracy_score(self.__test_label, self.__test_predictions))

    def evaluate_output(self):
        self.__output = np.hstack(
            (self.__test, self.__test_label.reshape(
                (-1, 1)), self.__test_preds.reshape((-1, 1))))
        pd.DataFrame(
            self.__output).to_csv("C:\\Users\\Dell\\Desktop\\output.csv")

    def xgbmodel_output(self):
        joblib.dump(self.__bst,
                    "C:\\Users\\Dell\\Desktop\\bstML.pkl.z",
                    compress=True)
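
# Smoke-test sketch (hedged): synthetic data only. Assumes an xgboost version
# whose fit() still accepts eval_metric (as train() above requires) and that
# the surrounding script imports pd, roc_auc_score and accuracy_score.
import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(200, 4)
y = (X[:, 0] > 0.5).astype(int)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=13)
model = XgbModel(X_tr, y_tr, X_te, y_te)
model.train()
model.predict()
model.evaluate()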
Example #7
def build_versicolor(classifier, name, with_proba=True):
    mapper = DataFrameMapper([
        (versicolor_columns[:-1], [ContinuousDomain(), RobustScaler()])
    ])
    pipeline = PMMLPipeline([("mapper", mapper),
                             ("transformer", PolynomialFeatures(degree=3)),
                             ("selector", SelectKBest(k="all")),
                             ("classifier", classifier)])
    pipeline.fit(versicolor_X, versicolor_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(versicolor_X),
                                  columns=["probability_0", "probability_1"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
Example #8
def build_audit_na(classifier, name, with_proba = True):
	mapper = DataFrameMapper(
		[([column], [ContinuousDomain(missing_values = None), Imputer()]) for column in ["Age", "Income", "Hours"]] +
		[([column], [CategoricalDomain(missing_values = None), CategoricalImputer(), PMMLLabelBinarizer()]) for column in ["Employment", "Education", "Marital", "Occupation", "Gender"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_na_X, audit_na_y)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name + ".csv")
Example #9
def build_iris(classifier, name, with_proba=True):
    mapper = DataFrameMapper([
        (iris_X.columns.values, ContinuousDomain()),
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("scaler", RobustScaler()),
                             ("pca", IncrementalPCA(n_components=3,
                                                    whiten=True)),
                             ("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(iris_X),
                                  columns=[
                                      "probability_setosa",
                                      "probability_versicolor",
                                      "probability_virginica"
                                  ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
Example #10
def build_audit_na(classifier, name, with_proba=True):
    employment_mapping = {
        "Consultant": "Private",
        "PSFederal": "Public",
        "PSLocal": "Public",
        "PSState": "Public",
        "SelfEmp": "Private",
        "Private": "Private"
    }
    gender_mapping = {"Female": 0, "Male": 1}
    mapper = DataFrameMapper(
        [([column], [ContinuousDomain(missing_values=None), Imputer()])
         for column in ["Age", "Income", "Hours"]] +
        [("Employment", [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            LookupTransformer(employment_mapping, "Other"),
            PMMLLabelBinarizer()
        ])] +
        [([column], [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            PMMLLabelBinarizer()
        ]) for column in ["Education", "Marital", "Occupation"]] +
        [("Gender", [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            LookupTransformer(gender_mapping, None)
        ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example #11
def PMML_creation(train_path, test_path, pmml_predictions, pmml_path, path, acct_id):
    logging.info('PMML creation Started.')
    data = pd.read_csv(train_path)
    data2 = pd.read_csv(test_path)

    features = ['avg_delay_categorical',
                'variance_categorical',
                'LMH_cumulative',
                'avg_of_invoices_closed',
                'avg_of_all_delays',
                'payment_count_quarter_q1', 'payment_count_quarter_q2',
                'payment_count_quarter_q3', 'payment_count_quarter_q4',
                'invoice_count_quarter_q1', 'invoice_count_quarter_q2',
                'invoice_count_quarter_q3', 'invoice_count_quarter_q4',
                'number_invoices_closed']

    # A number of account-specific classifier configurations (RandomForest,
    # XGBoost and LGBM variants tuned per account, e.g. duracell, gettyimages,
    # milliken, graybar) used to be hard-coded here; the classifier is now
    # rebuilt from the persisted model's parameters below.
    # final_report = pd.read_csv(path + '/account_' + acct_id + '/summary.csv')
    model = joblib.load(path + '/account_' + acct_id + '/trained_model/model.pkl')
    # Rebuild an unfitted classifier of the same type with the same
    # hyperparameters (the manual equivalent of sklearn.base.clone).
    params = model.get_params()
    rf = type(model)().set_params(**params)
    print("-" * 100)
    print(rf)
    print(rf.get_params())
    print("-" * 100)
    print(model.get_params())

    


    mapper = DataFrameMapper([(feature, None) for feature in features])

    labels = data.loc[:, 'output']
    labels.name = 'output'

    data = data[features].astype('double')
    print(data.dtypes)

    pipeline = PMMLPipeline([("mapper", mapper), ("estimator", rf)])
    pickle_pipeline = Pipeline([("mapper", mapper), ("model", rf)])

    pipeline.fit(data, labels)
    pickle_pipeline.fit(data, labels)

    predictions = pipeline.predict(data2[features])
    predictions_prob = pipeline.predict_proba(data2[features])
    data2['PMML_predictions'] = predictions
    data2['PMML_pred_proba_0'] = predictions_prob[:, 0]
    data2['PMML_pred_proba_1'] = predictions_prob[:, 1]

    data2.to_csv(pmml_predictions, index=False)
    sklearn2pmml(pipeline, r"" + pmml_path + '_PIPELINED' + ".pmml", debug=True)
    joblib.dump(pickle_pipeline, r"" + pmml_path + "_PIPELINED.pkl")
    logging.info('PMML created of size ' + str(file_size(r"" + pmml_path + ".pmml")))
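
# Invocation sketch (hedged, illustrative paths only): the function expects a
# previously trained model at <path>/account_<acct_id>/trained_model/model.pkl.
# PMML_creation("train.csv", "test.csv", "pmml_predictions.csv",
#               "model_export", "/data", "12345")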
Example #12
test = test.rename(
    columns={
        'ZZ_CLAIMDATE_SIMP_DT': 'customer_claim_date',
        'CREATE_TIME': 'deduction_created_date',
        'ZZ_XREF3': 'product_category',
        'KUNWE': 'ship_to',
        'FIN_ORIGINAL_AMT': 'original_dispute_amount',
        'FIN_KUNNR': 'payer',
        'FIN_PAID_AMT': 'paid_amount'
    })

test['ship_to'] = test['ship_to'].astype('str').str.split('.').str[0]

# test_transformations = pd.DataFrame(mapper.fit_transform(test), columns=['create_minus_claim_date', 'category_history', 'cal_cust_history', 'ZZ_CLAIMDATE_SIMP_DT_month', 'ship_to_history', 'original_with_avg_dispute', 'rank_xref_in_kunnr', 'b_value', 'rank_kunwe_in_kunnr'])
# test_transformations.to_csv('test_transformations.csv')

test_result = pd.DataFrame()
test_result['output'] = pipeline.predict(test)
# Compute the probabilities once and keep both class columns.
proba = pipeline.predict_proba(test)
test_result['predict_proba1'] = proba[:, 0]
test_result['predict_proba2'] = proba[:, 1]

test_result['actual_result'] = test['labels']

from sklearn.metrics import classification_report

print(
    classification_report(test_result['actual_result'], test_result['output']))

from sklearn2pmml import sklearn2pmml

# sklearn2pmml(pipeline, "only_b_value.pmml", user_classpath=[r"D:\jesus\sap\sklearn2pmml-plugin-1.0-SNAPSHOT.jar"], debug=True)
# NOTE: output file name assumed from the commented call above.
sklearn2pmml(pipeline, "only_b_value.pmml", debug=True)

pipeline.fit(train, train['labels'])

# test = pd.read_csv(r'Data/UDM_DISPUTE_20171231-20180202.csv')

test = pd.read_csv('validation.csv')

test['main_output'] = (test['FIN_PAID_AMT'] > (0.01 * test['FIN_ORIGINAL_AMT']))
test['labels'] = test['main_output'].map({True: -1, False: 1})

test = test.rename(
    columns={
        'ZZ_CLAIMDATE_SIMP_DT': 'customer_claim_date',
        'CREATE_TIME': 'deduction_created_date',
        'ZZ_XREF3': 'product_category',
        'KUNWE': 'ship_to',
        'FIN_ORIGINAL_AMT': 'original_dispute_amount',
        'FIN_KUNNR': 'payer',
        'FIN_PAID_AMT': 'paid_amount'
    })
test_result = pd.DataFrame()
test_result['output'] = pipeline.predict(test.head(1790))
proba = pipeline.predict_proba(test.head(1790))
test_result['predict_proba1'] = proba[:, 0]
test_result['predict_proba2'] = proba[:, 1]

test_result['actual_result'] = test['labels'].head(1790)

from sklearn.metrics import classification_report

print(classification_report(test_result['actual_result'], test_result['output']))


# pipeline.predict()
# test_real = pd.read_csv('test_real3.csv', encoding='latin')
# test_results = pd.DataFrame(columns=['result', 'prob1', 'prob2'])