Exemplo n.º 1
0
# Train a rental-count model on pre-2015 rows of dbo.rental_data, dump the
# serialized model to disk and store it in the rental_models table.
# Supported values of model_type: "linear" (rx_lin_mod) or "dtree" (rx_dtree).
model_type = "linear"
conn_str = 'Driver=SQL Server;Server=<Server Name>;Database=MLDB;Uid=<User Name>;Pwd=<Password>;'
inputsql = 'select "RentalCount", "Year", "Month", "Day", "WeekDay", "Snow", "Holiday", "FWeekDay" from dbo.rental_data where Year < 2015'

cnxn = pyodbc.connect(conn_str)
try:
    rental_train_data = pd.read_sql(inputsql, cnxn)

    # Used to sample data
    # train = rental_train_data.sample(frac=0.8, random_state=1)
    # test = rental_train_data.loc[~ rental_train_data.index.isin(train.index)]
    # print("Train {0} / test {1}".format(len(train), len(test)))

    # Treat the discrete columns as categorical variables in the formula.
    for column in ("Holiday", "Snow", "WeekDay"):
        rental_train_data[column] = rental_train_data[column].astype("category")

    # The branches are mutually exclusive; an unknown model type is rejected
    # explicitly instead of surfacing later as a NameError on trained_model.
    if model_type == "linear":
        linmod_model = rx_lin_mod("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data = rental_train_data)
        trained_model = rx_serialize_model(linmod_model, realtime_scoring_only = True)
    elif model_type == "dtree":
        dtree_model = rx_dtree("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data = rental_train_data)
        trained_model = rx_serialize_model(dtree_model, realtime_scoring_only = True)
    else:
        raise ValueError("Unsupported model_type: {0!r}".format(model_type))

    print(rx_summary("~ Month + Day + WeekDay + Snow + Holiday", rental_train_data))

    # Dump learned model to file (the bytes returned by rx_serialize_model
    # are written as-is).
    with open(r'c:\temp\trained_model.pickle', mode='wb') as f:
        f.write(trained_model)

    # Store the serialized model using a parameterized INSERT.
    cursor = cnxn.cursor()
    cursor.execute("INSERT INTO rental_models(model_name, lang, native_model) VALUES(?, ?, ?)", (model_type + "_model", "Python", trained_model))
    cnxn.commit()
finally:
    # Always release the database connection, even if training or the
    # insert fails.
    cnxn.close()
# Put the Occupation categories into the intended order. The result is
# assigned back because reorder_categories returns a new Series; the
# inplace=True form was deprecated in pandas 1.3 and removed in pandas 2.0.
TM['Occupation'] = TM['Occupation'].cat.reorder_categories(
    ["Manual", "Clerical", "Skilled Manual", "Professional", "Management"])

# Make integers from ordinals (category codes follow the category order)
TM['EducationInt'] = TM['Education'].cat.codes
TM['CommuteDistanceInt'] = TM['CommuteDistance'].cat.codes
TM['OccupationInt'] = TM['Occupation'].cat.codes
# Check the distribution
# TM['OccupationInt'].value_counts().sort_index()
# TM['Occupation'].value_counts().sort_index()

# Create a linear model
from revoscalepy import rx_lin_mod, rx_predict
linmod = rx_lin_mod(
    """NumberCarsOwned ~ TotalChildren + OccupationInt + NumberChildrenAtHome +
    EducationInt + CommuteDistanceInt + BikeBuyer""",
    data=TM)
# Score the same data frame; the columns selected below show that the
# prediction lands in NumberCarsOwned_Pred.
TMPredict = rx_predict(linmod, data=TM, output_data=TM)
TMPredict[["NumberCarsOwned", "NumberCarsOwned_Pred"]].head(5)
# Plot actual vs. predicted values for the first 20 rows as an area chart.
TMPredict[["NumberCarsOwned",
           "NumberCarsOwned_Pred"]].head(20).plot(kind="area",
                                                  color=('green', 'orange'))
plt.show()

# Naive Bayes
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Arrange the data - feature matrix and target vector
# Split the data
Xtrain = TM.loc[TM.TrainTest == 1, [
# SQL Server data
# Importing data from SQL Server using pyodbc
# Using the revoscalepy library
import numpy as np
import pandas as pd
import pyodbc
from revoscalepy import rx_lin_mod, rx_predict, rx_summary

# Connecting and reading the data
# NOTE(review): the credentials are hard-coded in the DSN string; consider
# loading them from configuration outside the script.
query = """SELECT CustomerKey, Age,
             YearlyIncome, TotalChildren,
             NumberCarsOwned
           FROM dbo.vTargetMail;"""
con = pyodbc.connect('DSN=AWDW;UID=RUser;PWD=Pa$$w0rd')
try:
    TM = pd.read_sql(query, con)
finally:
    # The connection is only needed for the read; close it so it is not
    # leaked for the rest of the session.
    con.close()
# Bare expressions: they display output only when run interactively.
TM.head(5)
TM.shape

# Check the summary of the NumberCarsOwned
summary = rx_summary("NumberCarsOwned", TM)
print(summary)

# Create a linear model
linmod = rx_lin_mod("NumberCarsOwned ~ YearlyIncome + Age + TotalChildren",
                    data=TM)
# Score the training data itself with the fitted model.
predmod = rx_predict(linmod, data=TM, output_data=TM)
predmod.head(10)

# End of script
         "Month" : { "type" : "integer" },
         "Day" : { "type" : "integer" },
         "RentalCount" : { "type" : "integer" },
         "WeekDay" : {
             "type" : "factor",
             "levels" : ["1", "2", "3", "4", "5", "6", "7"]
         },
         "Holiday" : {
             "type" : "factor",
             "levels" : ["1", "0"]
         },
         "Snow" : {
             "type" : "factor",
             "levels" : ["1", "0"]
         }
     }
# Describe the training rows as a SQL Server data source so the model is
# trained directly from the query result, using the column_info type mapping.
data_source = RxSqlServerData(sql_query=inputsql, connection_string=conn_str, column_info=column_info)

# NOTE(review): revoscalepy documents this keyword as `compute_context`;
# confirm that `computeContext` is honoured and not silently ignored.
linmod_model = rx_lin_mod("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data = data_source, computeContext = cc)
trained_model = rx_serialize_model(linmod_model, realtime_scoring_only = True)

# Dump the serialized model bytes to a local file.
with open(r'c:\temp\trained_model.pickle', mode='wb') as f:
    f.write(trained_model)

print(rx_summary("RentalCount ~ Month + Day + WeekDay + Snow + Holiday", data_source))


# Store the serialized model in the rental_models table.
cnxn = pyodbc.connect(conn_str)
try:
    cursor = cnxn.cursor()
    cursor.execute("INSERT INTO rental_models(model_name, lang, native_model) VALUES(?, ?, ?)", ("linear_model", "Python", trained_model))
    cnxn.commit()
finally:
    # Release the connection whether or not the insert succeeded.
    cnxn.close()