# Train a rental-count model (linear regression or decision tree) on the
# dbo.rental_data table, then persist the serialized model both to a local
# pickle file and back into the rental_models table.
model_type = "linear"

# NOTE(review): placeholder credentials — fill in <Server Name>/<User Name>/<Password>.
conn_str = 'Driver=SQL Server;Server=<Server Name>;Database=MLDB;Uid=<User Name>;Pwd=<Password>;'
cnxn = pyodbc.connect(conn_str)
try:
    # Pre-2015 rows are the training window.
    inputsql = 'select "RentalCount", "Year", "Month", "Day", "WeekDay", "Snow", "Holiday", "FWeekDay" from dbo.rental_data where Year < 2015'
    rental_train_data = pd.read_sql(inputsql, cnxn)

    # Used to sample data
    # train = rental_train_data.sample(frac=0.8, random_state=1)
    # test = rental_train_data.loc[~ rental_train_data.index.isin(train.index)]
    # print("Train {0} / test {1}".format(len(train), len(test)))

    # Treat the discrete predictors as categoricals so the revoscalepy
    # formula handles them as factors rather than as numeric values.
    for col in ("Holiday", "Snow", "WeekDay"):
        rental_train_data[col] = rental_train_data[col].astype("category")

    formula = "RentalCount ~ Month + Day + WeekDay + Snow + Holiday"
    if model_type == "linear":
        linmod_model = rx_lin_mod(formula, data=rental_train_data)
        trained_model = rx_serialize_model(linmod_model, realtime_scoring_only=True)
    elif model_type == "dtree":
        dtree_model = rx_dtree(formula, data=rental_train_data)
        trained_model = rx_serialize_model(dtree_model, realtime_scoring_only=True)
    else:
        # Fail fast: the original fell through and hit a NameError on
        # trained_model below for any unrecognized model_type.
        raise ValueError("unknown model_type: " + model_type)

    print(rx_summary("~ Month + Day + WeekDay + Snow + Holiday", rental_train_data))

    # Dump learned model to file
    with open(r'c:\temp\trained_model.pickle', mode='wb') as f:
        f.write(trained_model)

    # Persist the serialized model alongside its name and language.
    cursor = cnxn.cursor()
    cursor.execute(
        "INSERT INTO rental_models(model_name, lang, native_model) VALUES(?, ?, ?)",
        (model_type + "_model", "Python", trained_model))
    cnxn.commit()
finally:
    # The original leaked the connection; always release it.
    cnxn.close()
# into train and test. try: from sklearn.model_selection import train_test_split except ImportError: from sklearn.cross_validation import train_test_split wines_train, wines_test = train_test_split(wines) ############################## # And we train. We drop the color which is a non numerical # features. We will add it later. from revoscalepy import rx_dtree cols = wines.columns.drop(["quality", "color"]) model = rx_dtree("quality ~" + "+".join(cols), data=wines_train, method="anova") ###################### # We predict. from revoscalepy import rx_predict_rx_dtree pred = rx_predict_rx_dtree(model, wines_test, extra_vars_to_write=["quality"]) print(pred.head()) ########################### # The column 'quality_Pred' is the prediction. # We estimate its quality with the metric `R2 <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html>`_ # and we plot them. from sklearn.metrics import r2_score
# Train a revoscalepy decision tree on the Titanic training data and score
# the separate test file.
import sklearn as sk

titanic_data = pd.read_csv('data/titanic.csv')
titanic_data.head()

# change all string columns to categorical
# NOTE: pandas removed the `ordered=` keyword from Series.astype; a plain
# 'category' dtype is unordered by default, which is what ordered=False meant.
titanic_data_object_types = titanic_data.select_dtypes(include=['object'])
titanic_data_object_types_columns = np.array(titanic_data_object_types.columns)
for column in titanic_data_object_types_columns:
    titanic_data[column] = titanic_data[column].astype('category')
# Pclass is numeric in the CSV but is really a class label, so make it a
# factor too.
titanic_data['Pclass'] = titanic_data['Pclass'].astype('category')

# rx_dtree works with formulas, just like rxDTree in R
form = 'Survived ~ Pclass + Sex + Age + Parch + Fare + Embarked'

# train decision tree and extract the tree's information
titanic_data_tree = rp.rx_dtree(form, titanic_data, max_depth=50)

# test data — apply the same object->category conversion so the factor
# columns match the training frame.
test = pd.read_csv('data/test.csv')
test_data_object_types = test.select_dtypes(include=['object'])
test_data_object_types_columns = np.array(test_data_object_types.columns)
for column in test_data_object_types_columns:
    test[column] = test[column].astype('category')

# predict on test data
titanic_data_predictions = rp.rx_predict_rx_dtree(titanic_data_tree, test)
# Split the wines data, fit a revoscalepy decision tree, and inspect
# predictions on the held-out set.
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# model_selection and fall back only for very old installs (this matches
# the other wines example in this file).
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# NOTE(review): `wines` is defined earlier in the file (not visible here).
wines_train, wines_test = train_test_split(wines)

# And we train. We drop the color which is a non numerical
# features.
#
# In[27]:
from revoscalepy import rx_dtree

# Formula over every remaining column: "quality ~ a+b+c...".
cols = wines.columns.drop(["quality", "color"])
model = rx_dtree("quality ~" + "+".join(cols), data=wines_train,
                 method="anova", allow_disk_write=False)

# Now let's evaluate the model accuracy.
#
# In[28]:
from revoscalepy import rx_predict_rx_dtree

# Keep the true 'quality' column next to the prediction for comparison.
pred = rx_predict_rx_dtree(model, wines_test, extra_vars_to_write=["quality"])
print(pred.head())

# The column 'quality_Pred' is the prediction.
# We estimate its quality with the metric [R2](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html)
sel.fit_transform(uni_X, uni_y) # Review retained features (all included) sel.get_support(indices=True) # Feature Importance model = ExtraTreesClassifier(n_estimators=100) model.fit(uni_X, uni_y) feat_importances = pd.Series(model.feature_importances_, index=scaled_X.columns) feat_importances.nlargest(10).plot(kind='barh') plt.show() # Use revoscalepy to build a decision tree to predict underpriced with the full dataset # in order to plot the most important featuresn -- compare to ExtraTreesClassifier model = rx_dtree("underpriced ~" + "+".join(uni_vars), data=ipo2609Cleaned, method="anova", importance=True) importance = model.importance importance.columns = ["feature importance"] importance.sort_values("feature importance").plot(kind="bar") plt.show() # Categorical Values and Recursive Feature Elimination (RFE) rfe_vars = ipo2609Cleaned.columns.values.tolist() rfe_vars.remove( 'underpriced') # The value we are predicting, so remove from factors rfe_vars.remove('Name') # Company identifier, not relevant rfe_vars.remove('perm') # permanent identifier, not relevant rfe_vars.remove( 'dt1') # Price after first day trading, unknown at time of prediction