# Example 1
# Example: train a rental-count model with revoscalepy against SQL Server
# and persist the serialized model both to a local file and to a SQL table.

model_type = "linear"

# NOTE(review): placeholders below must be filled in before running.
conn_str = 'Driver=SQL Server;Server=<Server Name>;Database=MLDB;Uid=<User Name>;Pwd=<Password>;'
cnxn = pyodbc.connect(conn_str)
inputsql = 'select "RentalCount", "Year", "Month", "Day", "WeekDay", "Snow", "Holiday", "FWeekDay" from dbo.rental_data where Year < 2015'
rental_train_data = pd.read_sql(inputsql, cnxn)

# Treat the low-cardinality integer columns as categorical predictors.
for col in ("Holiday", "Snow", "WeekDay"):
    rental_train_data[col] = rental_train_data[col].astype("category")

formula = "RentalCount ~ Month + Day + WeekDay + Snow + Holiday"

# Use elif/else so an unrecognized model_type fails loudly here instead of
# leaving trained_model undefined and raising a NameError further down
# (the original used two independent ifs, one of them tab-indented).
if model_type == "linear":
    linmod_model = rx_lin_mod(formula, data=rental_train_data)
    trained_model = rx_serialize_model(linmod_model, realtime_scoring_only=True)
elif model_type == "dtree":
    dtree_model = rx_dtree(formula, data=rental_train_data)
    trained_model = rx_serialize_model(dtree_model, realtime_scoring_only=True)
else:
    raise ValueError("Unknown model_type: {0!r}".format(model_type))

print(rx_summary("~ Month + Day + WeekDay + Snow + Holiday", rental_train_data))

# Dump learned model to file
with open(r'c:\temp\trained_model.pickle', mode='wb') as f:
    f.write(trained_model)

# Store the serialized model in SQL Server (parameterized insert), making
# sure the cursor and connection are released even if the insert fails.
cursor = cnxn.cursor()
try:
    cursor.execute(
        "INSERT INTO rental_models(model_name, lang, native_model) VALUES(?, ?, ?)",
        (model_type + "_model", "Python", trained_model),
    )
    cnxn.commit()
finally:
    cursor.close()
    cnxn.close()
# Split the dataset into train and test subsets.

# sklearn renamed `cross_validation` to `model_selection` in 0.18;
# fall back to the old module on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

wines_train, wines_test = train_test_split(wines)

##############################
# Training: the color column is non-numerical, so it is dropped
# along with the target; it can be re-added later.

from revoscalepy import rx_dtree

cols = wines.columns.drop(["quality", "color"])
formula = "quality ~" + "+".join(cols)
model = rx_dtree(formula, data=wines_train, method="anova")

######################
# Prediction on the held-out split.

from revoscalepy import rx_predict_rx_dtree

pred = rx_predict_rx_dtree(model, wines_test, extra_vars_to_write=["quality"])
print(pred.head())

###########################
# The column 'quality_Pred' holds the prediction. Its quality is
# estimated with the metric `R2 <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html>`_
# and then plotted.

from sklearn.metrics import r2_score
# Example 3
# Example: train a revoscalepy decision tree on the Titanic dataset and
# score a separate test file.
import sklearn as sk

titanic_data = pd.read_csv('data/titanic.csv')

titanic_data.head()

# Change all string (object) columns to categorical. Unordered is the
# default for 'category'; the `ordered=` keyword to astype() was removed
# in pandas 1.0, so passing it would raise a TypeError on modern pandas.
titanic_data_object_types = titanic_data.select_dtypes(include=['object'])
titanic_data_object_types_columns = np.array(titanic_data_object_types.columns)
for column in titanic_data_object_types_columns:
    titanic_data[column] = titanic_data[column].astype('category')
# Pclass is stored as an integer but is really a class label.
titanic_data['Pclass'] = titanic_data['Pclass'].astype('category')

#rx_dtree works with formulas, just like rxDTree in R
form = 'Survived ~ Pclass + Sex + Age  + Parch  + Fare + Embarked'

#train decision tree and extract the tree's information
titanic_data_tree = rp.rx_dtree(form, titanic_data, max_depth=50)

# Test data gets the same object -> category conversion as the train data.
test = pd.read_csv('data/test.csv')
test_data_object_types = test.select_dtypes(include=['object'])
test_data_object_types_columns = np.array(test_data_object_types.columns)
for column in test_data_object_types_columns:
    test[column] = test[column].astype('category')

#predict on test data
titanic_data_predictions = rp.rx_predict_rx_dtree(titanic_data_tree, test)
    from sklearn.cross_validation import train_test_split

wines_train, wines_test = train_test_split(wines)

# And we train. We drop the color which is a non numerical
# features.
#
#

# In[27]:

from revoscalepy import rx_dtree

cols = wines.columns.drop(["quality", "color"])
model = rx_dtree("quality ~" + "+".join(cols),
                 data=wines_train,
                 method="anova",
                 allow_disk_write=False)

# Now let's evaluate the model accuracy.
#
#

# In[28]:

from revoscalepy import rx_predict_rx_dtree

pred = rx_predict_rx_dtree(model, wines_test, extra_vars_to_write=["quality"])
print(pred.head())

# The column 'quality_Pred' is the prediction.
# We estimate its quality with the metric [R2](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html)
# Example 5
# Fit the feature selector, then inspect which features it retained.
sel.fit_transform(uni_X, uni_y)
# Review retained features (all included)
sel.get_support(indices=True)

# Feature importance via a tree ensemble.
model = ExtraTreesClassifier(n_estimators=100)
model.fit(uni_X, uni_y)
feat_importances = pd.Series(model.feature_importances_,
                             index=scaled_X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

# Use revoscalepy to build a decision tree to predict underpriced with the
# full dataset in order to plot the most important features -- compare to
# ExtraTreesClassifier.
dtree_formula = "underpriced ~" + "+".join(uni_vars)
model = rx_dtree(dtree_formula,
                 data=ipo2609Cleaned,
                 method="anova",
                 importance=True)
importance = model.importance
importance.columns = ["feature importance"]
importance.sort_values("feature importance").plot(kind="bar")
plt.show()

# Categorical Values and Recursive Feature Elimination (RFE)
rfe_vars = ipo2609Cleaned.columns.values.tolist()

# Drop columns that must not be used as predictors.
for excluded in (
    'underpriced',  # the value being predicted, so remove from factors
    'Name',         # company identifier, not relevant
    'perm',         # permanent identifier, not relevant
    'dt1',          # price after first day trading, unknown at prediction time
):
    rfe_vars.remove(excluded)