y = np.ravel(ipo_train["underpriced"]) sk_log_reg = LogisticRegression() sk_log_model = sk_log_reg.fit(X, y) probList = [] probList = sk_log_model.predict_proba(ipo_test[features.values.tolist()])[:, 1] probArray = np.asarray(probList) fpr, tpr, thresholds = roc_curve(ipo_test["underpriced"], probArray) aucResult = auc(fpr, tpr) print("scikit logistic AUC: " + str(aucResult)) # Define Formula for all Microsoft algorithms formula = "underpriced ~ " + "+".join(features) # Revoscalepy rx-logit rx_logit_model = rx_logit(formula=formula, data=ipo_train) probArray = rv.rx_predict(rx_logit_model, data=ipo_test) probList = probArray["underpriced_Pred"].values probArray = np.asarray(probList) fpr, tpr, thresholds = roc_curve(ipo_test["underpriced"], probArray) aucResult = auc(fpr, tpr) print("rx-logit AUC: " + str(aucResult)) # Plot ROC plt.figure() lw = 2 plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC Curve') plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel("False Positive Rate") plt.ylabel("True Positive Rate")
# SQL Server data # Importing data from SQL Server using pyodbc # Using the revoscalepy library import numpy as np import pandas as pd import pyodbc from revoscalepy import rx_lin_mod, rx_predict, rx_summary # Connecting and reading the data con = pyodbc.connect('DSN=AWDW;UID=RUser;PWD=Pa$$w0rd') query = """SELECT CustomerKey, Age, YearlyIncome, TotalChildren, NumberCarsOwned FROM dbo.vTargetMail;""" TM = pd.read_sql(query, con) TM.head(5) TM.shape # Check the summary of the NumberCarsOwned summary = rx_summary("NumberCarsOwned", TM) print(summary) # Create a linear model linmod = rx_lin_mod("NumberCarsOwned ~ YearlyIncome + Age + TotalChildren", data=TM) predmod = rx_predict(linmod, data=TM, output_data=TM) predmod.head(10) # End of script
# Make integers from ordinals TM['EducationInt'] = TM['Education'].cat.codes TM['CommuteDistanceInt'] = TM['CommuteDistance'].cat.codes TM['OccupationInt'] = TM['Occupation'].cat.codes # Check the distribution # TM['OccupationInt'].value_counts().sort_index() # TM['Occupation'].value_counts().sort_index() # Create a linear model from revoscalepy import rx_lin_mod, rx_predict linmod = rx_lin_mod( """NumberCarsOwned ~ TotalChildren + OccupationInt + NumberChildrenAtHome + EducationInt + CommuteDistanceInt + BikeBuyer""", data=TM) TMPredict = rx_predict(linmod, data=TM, output_data=TM) TMPredict[["NumberCarsOwned", "NumberCarsOwned_Pred"]].head(5) TMPredict[["NumberCarsOwned", "NumberCarsOwned_Pred"]].head(20).plot(kind="area", color=('green', 'orange')) plt.show() # Naive Bayes from sklearn.metrics import accuracy_score from sklearn.naive_bayes import GaussianNB # Arrange the data - feature matrix and target vector # Split the data Xtrain = TM.loc[TM.TrainTest == 1, [ 'TotalChildren', 'NumberChildrenAtHome', 'HouseOwnerFlag', 'NumberCarsOwned', 'EducationInt', 'OccupationInt', 'CommuteDistanceInt'
#write_rts_model(NN_model, "NN", connection_string) ########################################################################################################################################## ## Random Forest Scoring ########################################################################################################################################## # Make Predictions, then import them into Python. forest_prediction_sql = RxSqlServerData(table="Forest_Prediction", string_as_factors=True, connection_string=connection_string) rx_predict(forest_model, data=LoS_Test, output_data=forest_prediction_sql, type="response", extra_vars_to_write=["lengthofstay", "eid"], overwrite=True) # Compute the performance metrics of the model. forest_prediction = rx_import(input_data=forest_prediction_sql) forest_metrics = evaluate_model( observed=forest_prediction['lengthofstay'], predicted=forest_prediction['lengthofstay_Pred'], model="RF") ########################################################################################################################################## ## Boosted Trees Scoring ##########################################################################################################################################
def main(tablename, inputdf, overwrite=False): """Imports a data source into SQL Server table and, operating in-database, uses logistic regression to create a predictive model. A comparison to an out-of-database, in memory method is performed (TODO) Parameters ---------- tablename : str The new or previosly create table name in database. inputdf : RxTextData object The data source overwrite : bool, optional (default=False) Whether or not to overwrite the table. Set to True if this is a new table, otherwise False. """ #################################################################### # Set the compute context to SQL SERVER #################################################################### # NB: don't need, but would be good to know what this actually does here # RxComputeContext(LOCAL, '9.1') compute_context = RxInSqlServer(connection_string=CONNECTION_STRING) # , # num_tasks = 1, # auto_cleanup = False, # console_output=True # ) rx_set_compute_context(compute_context) # if overwrite: #################################################################### # Create table in SQL server #################################################################### print("Creating tables...") data_source = RxSqlServerData(table=tablename, connection_string=CONNECTION_STRING) #################################################################### # Read data into the SQL server table that was just created #################################################################### print("Reading data into tables...") rx_data_step(input_data=inputdf, output_file=data_source, overwrite=True) ##################################################################### # Set up a query on table for train and test data (and ensure factor levels) ##################################################################### print("Setting up query and datasources for train and test sets...") # Train data data_source_train = RxSqlServerData( sql_query="SELECT TOP 10000 * FROM Lead_Demography_Tbl \ ORDER BY Lead_Id", connection_string=CONNECTION_STRING, verbose=True) # Import training data RxImport style from new query source X_y_train = rx_import(data_source_train) # X_y_train = rx_data_step(input_data=data_source_train, overwrite=True) print("Test data...") # Test data (let's pick ~30% size of training dataset) data_source_test = RxSqlServerData( sql_query="SELECT * FROM Lead_Demography_Tbl \ ORDER BY Lead_Id \ OFFSET 10000 ROWS \ FETCH FIRST 3000 ROW ONLY", connection_string=CONNECTION_STRING) # Import data RxImport style from new query source X_y_test = rx_import(data_source_test) # X_y_test = rx_data_step(input_data=data_source_test, overwrite=True) ##################################################################### # Run revoscalepy logistic regression on training data (in-database) ##################################################################### print('Fitting a logistic regression model...') mod = rx_logit(formula="Annual_Income_Bucket_gt120k ~ \ F(Highest_Education_Graduate_School)", data=X_y_train, compute_context=compute_context, verbose=2) assert mod is not None assert mod._results is not None pprint(mod._results) ##################################################################### # Summary on training data (in-database) ##################################################################### # Note: for "data" use data source and not the rximport'ed data print('\nSummary: \n') summary = rx_summary("Annual_Income_Bucket_gt120k ~ \ F(Highest_Education_Graduate_School)", data=data_source_train, compute_context=compute_context, cube=True, verbose=2) ##################################################################### # Predict on test data (in-database) ##################################################################### print("\nPredict on test: \n") pred = rx_predict(mod, data=X_y_test, verbose=2, write_model_vars=True) ##################################################################### # Metrics for predition based on groundtruth (with scikit-learn tools) ##################################################################### pred_results = pred._results['Annual_Income_Bucket_gt120k_Pred'] # For some reason the prediction results are not in a binary [0,1] format y_pred = binarize(pred_results, threshold=(min(pred_results) + \ max(pred_results))/2).reshape(-1, 1) y_true = pred._data['Annual_Income_Bucket_gt120k'] print("Model prediction results:", y_pred) print("Actual values: ", y_true) print("Accuracy score: ", accuracy_score(y_true=y_true, y_pred=y_pred))