y = np.ravel(ipo_train["underpriced"])
sk_log_reg = LogisticRegression()
sk_log_model = sk_log_reg.fit(X, y)
probArray = sk_log_model.predict_proba(ipo_test[features.values.tolist()])[:, 1]
fpr, tpr, thresholds = roc_curve(ipo_test["underpriced"], probArray)
aucResult = auc(fpr, tpr)
print("scikit logistic AUC: " + str(aucResult))

# Define the formula used by the Microsoft (revoscalepy) algorithms
formula = "underpriced ~ " + " + ".join(features)
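# For illustration only: if features held, say, ['daysToOpen', 'priceRange']
# (hypothetical names), the resulting patsy-style string would be
# "underpriced ~ daysToOpen + priceRange"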

# revoscalepy rx_logit
rx_logit_model = rx_logit(formula=formula, data=ipo_train)
pred_frame = rv.rx_predict(rx_logit_model, data=ipo_test)
probArray = pred_frame["underpriced_Pred"].values
fpr, tpr, thresholds = roc_curve(ipo_test["underpriced"], probArray)
aucResult = auc(fpr, tpr)
print("rx-logit AUC: " + str(aucResult))

# Plot ROC
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC Curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()

# SQL Server data
# Importing data from SQL Server using pyodbc
# Using the revoscalepy library
import numpy as np
import pandas as pd
import pyodbc
import matplotlib.pyplot as plt
from revoscalepy import rx_lin_mod, rx_predict, rx_summary

# Connecting and reading the data
con = pyodbc.connect('DSN=AWDW;UID=RUser;PWD=Pa$$w0rd')
query = """SELECT CustomerKey, Age,
             YearlyIncome, TotalChildren,
             NumberCarsOwned
           FROM dbo.vTargetMail;"""
TM = pd.read_sql(query, con)
print(TM.head(5))
print(TM.shape)

# Check the summary of the NumberCarsOwned variable
summary = rx_summary("NumberCarsOwned", TM)
print(summary)

# Create a linear model
linmod = rx_lin_mod("NumberCarsOwned ~ YearlyIncome + Age + TotalChildren",
                    data=TM)
predmod = rx_predict(linmod, data=TM, output_data=TM)
print(predmod.head(10))
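
# Quick goodness-of-fit check (illustrative addition, not in the original
# listing): root mean squared error between observed and predicted values
rmse = np.sqrt(np.mean(
    (predmod["NumberCarsOwned"] - predmod["NumberCarsOwned_Pred"]) ** 2))
print("Linear model RMSE: " + str(rmse))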

# End of script

# The following snippet assumes TM has been re-read with the full set of
# vTargetMail columns (Education, Occupation, CommuteDistance,
# NumberChildrenAtHome, HouseOwnerFlag, BikeBuyer, TrainTest, ...) and that
# the ordinal columns are stored as pandas categoricals.

# Make integers from the ordinal (categorical) columns
TM['EducationInt'] = TM['Education'].cat.codes
TM['CommuteDistanceInt'] = TM['CommuteDistance'].cat.codes
TM['OccupationInt'] = TM['Occupation'].cat.codes
# Check the distribution
# TM['OccupationInt'].value_counts().sort_index()
# TM['Occupation'].value_counts().sort_index()
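
# Note: .cat.codes follows the declared category order, so genuinely ordinal
# columns should be given an explicit order first. A minimal sketch (the
# Education labels below are the usual vTargetMail values; verify against
# your data):
# from pandas.api.types import CategoricalDtype
# edu_type = CategoricalDtype(categories=["Partial High School", "High School",
#                                         "Partial College", "Bachelors",
#                                         "Graduate Degree"], ordered=True)
# TM['Education'] = TM['Education'].astype(edu_type)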

# Create a linear model
from revoscalepy import rx_lin_mod, rx_predict
linmod = rx_lin_mod(
    """NumberCarsOwned ~ TotalChildren + OccupationInt + NumberChildrenAtHome +
    EducationInt + CommuteDistanceInt + BikeBuyer""",
    data=TM)
TMPredict = rx_predict(linmod, data=TM, output_data=TM)
print(TMPredict[["NumberCarsOwned", "NumberCarsOwned_Pred"]].head(5))
TMPredict[["NumberCarsOwned",
           "NumberCarsOwned_Pred"]].head(20).plot(kind="area",
                                                  color=('green', 'orange'))
plt.show()

# Naive Bayes
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Arrange the data - feature matrix and target vector
# Split the data with the TrainTest flag (assumption: 1 marks training rows,
# 2 marks test rows, and BikeBuyer is the target variable)
feature_cols = ['TotalChildren', 'NumberChildrenAtHome', 'HouseOwnerFlag',
                'NumberCarsOwned', 'EducationInt', 'OccupationInt',
                'CommuteDistanceInt']
Xtrain = TM.loc[TM.TrainTest == 1, feature_cols]
ytrain = TM.loc[TM.TrainTest == 1, 'BikeBuyer']
Xtest = TM.loc[TM.TrainTest == 2, feature_cols]
ytest = TM.loc[TM.TrainTest == 2, 'BikeBuyer']

# Fit Gaussian Naive Bayes and score the held-out rows
gnb = GaussianNB().fit(Xtrain, ytrain)
print("Naive Bayes accuracy: ", accuracy_score(ytest, gnb.predict(Xtest)))
#write_rts_model(NN_model, "NN", connection_string)

##########################################################################################################################################

##	Random Forest Scoring

##########################################################################################################################################

# Make Predictions, then import them into Python.
forest_prediction_sql = RxSqlServerData(table="Forest_Prediction",
                                        string_as_factors=True,
                                        connection_string=connection_string)
rx_predict(forest_model,
           data=LoS_Test,
           output_data=forest_prediction_sql,
           type="response",
           extra_vars_to_write=["lengthofstay", "eid"],
           overwrite=True)

# Compute the performance metrics of the model.
forest_prediction = rx_import(input_data=forest_prediction_sql)
forest_metrics = evaluate_model(
    observed=forest_prediction['lengthofstay'],
    predicted=forest_prediction['lengthofstay_Pred'],
    model="RF")

##########################################################################################################################################

##	Boosted Trees Scoring

##########################################################################################################################################
def main(tablename, inputdf, overwrite=False):
    """Imports a data source into SQL Server table and, operating
       in-database, uses logistic regression to create a
       predictive model.

       A comparison to an out-of-database, in memory method
       is performed (TODO)
       
       Parameters
       ----------

       tablename : str
           The new or previosly create table name in database.

       inputdf : RxTextData object
           The data source

       overwrite : bool, optional (default=False)
           Whether or not to overwrite the table.  
           Set to True if this is a new table, otherwise False.
       """

    ####################################################################
    # Set the compute context to SQL SERVER
    ####################################################################

    # NB: not needed here, but it would be good to document what this call
    # actually does
    # RxComputeContext(LOCAL, '9.1')

    compute_context = RxInSqlServer(connection_string=CONNECTION_STRING)
    # Optional RxInSqlServer arguments:
    #   num_tasks=1,
    #   auto_cleanup=False,
    #   console_output=True

    rx_set_compute_context(compute_context)

    # The overwrite argument is passed through to rx_data_step below

    ####################################################################
    # Create table in SQL server
    ####################################################################

    print("Creating tables...")
    data_source = RxSqlServerData(table=tablename,
                                  connection_string=CONNECTION_STRING)

    ####################################################################
    # Read data into the SQL server table that was just created
    ####################################################################

    print("Reading data into tables...")

    rx_data_step(input_data=inputdf, output_file=data_source, overwrite=overwrite)

    #####################################################################
    # Set up a query on table for train and test data (and ensure factor levels)
    #####################################################################
    print("Setting up query and datasources for train and test sets...")

    # Train data
    data_source_train = RxSqlServerData(
        sql_query="SELECT TOP 10000 * FROM Lead_Demography_Tbl \
                    ORDER BY Lead_Id",
        connection_string=CONNECTION_STRING,
        verbose=True)

    # Import training data RxImport style from new query source
    X_y_train = rx_import(data_source_train)
    # X_y_train = rx_data_step(input_data=data_source_train, overwrite=True)

    print("Test data...")
    # Test data (let's pick ~30% size of training dataset)
    data_source_test = RxSqlServerData(
        sql_query="SELECT * FROM Lead_Demography_Tbl \
                    ORDER BY Lead_Id \
                    OFFSET 10000 ROWS \
                    FETCH FIRST 3000 ROW ONLY",
        connection_string=CONNECTION_STRING)

    # Import data RxImport style from new query source
    X_y_test = rx_import(data_source_test)
    # X_y_test = rx_data_step(input_data=data_source_test, overwrite=True)

    #####################################################################
    # Run revoscalepy logistic regression on training data (in-database)
    #####################################################################

    print('Fitting a logistic regression model...')

    mod = rx_logit(formula="Annual_Income_Bucket_gt120k ~ \
                                F(Highest_Education_Graduate_School)",
                   data=X_y_train,
                   compute_context=compute_context,
                   verbose=2)
    assert mod is not None
    assert mod._results is not None
    pprint(mod._results)

    #####################################################################
    # Summary on training data (in-database)
    #####################################################################

    # Note: for "data", use the data source object, not the rx_import'ed DataFrame

    print('\nSummary: \n')

    summary = rx_summary("Annual_Income_Bucket_gt120k ~ \
                                F(Highest_Education_Graduate_School)",
                         data=data_source_train,
                         compute_context=compute_context,
                         cube=True,
                         verbose=2)

    #####################################################################
    # Predict on test data (in-database)
    #####################################################################

    print("\nPredict on test: \n")

    pred = rx_predict(mod, data=X_y_test, verbose=2, write_model_vars=True)

    #####################################################################
    # Metrics for prediction based on ground truth (with scikit-learn tools)
    #####################################################################

    pred_results = pred._results['Annual_Income_Bucket_gt120k_Pred']
    # rx_logit predictions are probabilities rather than 0/1 labels, so
    # threshold them at the midpoint of the predicted range to binarize
    y_pred = binarize(pred_results, threshold=(min(pred_results) + \
                        max(pred_results))/2).reshape(-1, 1)
    y_true = pred._data['Annual_Income_Bucket_gt120k']

    print("Model prediction results:", y_pred)
    print("Actual values: ", y_true)

    print("Accuracy score: ", accuracy_score(y_true=y_true, y_pred=y_pred))