Example #1
X = cc_data[cc_data.Day < 4].iloc[:, 3:len(cc_data.columns) - 1]
y = cc_data[cc_data.Day < 4].iloc[:, len(cc_data.columns) - 1]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

param_numTrees = int(sys.argv[1])
param_maxDepth = int(sys.argv[2])
param_impurity = sys.argv[3]

randF = RandomForestClassifier(n_jobs=10,
                               n_estimators=param_numTrees,
                               max_depth=param_maxDepth,
                               criterion=param_impurity,
                               random_state=0)

randF.fit(X_train, y_train)

predictions_rand = randF.predict(X_test)
auroc = roc_auc_score(y_test, predictions_rand)
ap = average_precision_score(y_test, predictions_rand)

cdsw.track_metric("auroc", round(auroc, 2))
cdsw.track_metric("ap", round(ap, 2))

pickle.dump(randF, open("cc_model_check.pkl", "wb"))

cdsw.track_file("cc_model_check.pkl")
Example #2
# Pull a sample of the dataset into an in-memory
# Pandas dataframe. Use a smaller dataset for a quick demo.
flight_df_local = spark.sql("SELECT * FROM `default`.`flights`").limit(
    set_size).toPandas()

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Put the data into the array format required by scikit-learn's logistic regression.
# Use one-hot encoding for the categorical variables
X_onehot = flight_df_local[["UniqueCarrier", "Origin", "Dest"]]
X_num = flight_df_local[["Distance", "CRSDepTime"]]
y = (flight_df_local["DepDelay"] > 0)

# Note: in scikit-learn >= 1.2 this keyword is `sparse_output` rather than `sparse`.
enc = OneHotEncoder(sparse=False)
X_transform = enc.fit_transform(X_onehot)

reg = LogisticRegression().fit(np.hstack([X_transform, X_num]), y)

accuracy = reg.score(np.hstack([X_transform, X_num]), y)
cdsw.track_metric("Accuracy", accuracy)

import joblib
with open(model_path + model_name, 'wb') as fid:
    joblib.dump(reg, fid)
cdsw.track_file(model_path + model_name)

with open(model_path + enc_name, 'wb') as fid:
    joblib.dump(enc, fid)
cdsw.track_file(model_path + enc_name)
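
# A minimal reload sketch (not part of the original snippet), assuming the same
# model_path, model_name and enc_name used above: the saved encoder/model pair is
# loaded back and applied to fresh rows the same way it was fit.
reg_loaded = joblib.load(model_path + model_name)
enc_loaded = joblib.load(model_path + enc_name)
X_new = np.hstack([enc_loaded.transform(flight_df_local[["UniqueCarrier", "Origin", "Dest"]]),
                   flight_df_local[["Distance", "CRSDepTime"]]])
new_predictions = reg_loaded.predict(X_new)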
Example #3
def train_model(
    path,
    x_columns,
    y_column,
    output_path="models",
    test_size=0.3,
    random_state=None,
    cross_validation_folds=5,
    verbose=2,
):
    logging.info("Begin `train_model`")
    logging.debug(f"Load Data from {path}")
    logging.debug(f"x_columns: {x_columns}")
    logging.debug(f"y_column: {y_column}")

    df = load_data(path, x_columns + [y_column])

    logging.info("Preprocessing Inputs")
    df, new_X_columns = preprocess_model_inputs(df, x_columns, y_column)

    logging.debug(f"new_X_columns: {new_X_columns}")
    X = df[new_X_columns]
    y = df[y_column]

    logging.info("Splitting data into test and train sets")
    logging.info(f"test_size: {test_size}")
    logging.info(f"random_state: {random_state}")

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    grid_params = get_grid_search_params()

    logging.info("Getting Grid Search CV Pipeline")
    logging.debug(f"cross_validation_folds: {cross_validation_folds}")

    pipeline = model_grid_search_cv(
        new_X_columns,
        get_base_estimator(random_state),
        grid_params,
        verbose=verbose,
        cross_validation_folds=cross_validation_folds)

    try:
        pipeline.fit(X_train, y_train)
    except Exception as e:
        logging.exception("Exception during pipeline fitting")
        raise e

    logging.info("Finished Grid Search CV")
    logging.info(f"Best Score: {pipeline['grid_search_cv'].best_score_}")
    logging.info(f"Best Params: {pipeline['grid_search_cv'].best_params_}")

    write_out_model_params(pipeline["grid_search_cv"].best_params_,
                           output_path)

    eval_predictions("train", pipeline, X_train, y_train, output_path)
    eval_predictions("test", pipeline, X_test, y_test, output_path)

    logging.info(f"Writing out model to {output_path}/trained.model")

    joblib.dump(pipeline, f"{output_path}/trained.model")
    cdsw.track_file(f"{output_path}/trained.model")

    logging.info("End `train_model`")
"Gradient Boost Tree - Feature Rankings Sorted By Importance Value %s" % (sortedFeaturRankings)
"When summed together, the values equal 1.0"

#Return Paramaters to CDSW User Interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("numIter",param_BestModel_NumIter)
cdsw.track_metric("maxDepth",param_BestModel_Depth)
cdsw.track_metric("cvFolds",user_gbt_param_numFolds)


from pyspark.mllib.evaluation import BinaryClassificationMetrics
gbt_labelPredictionSet = gbt_predictions.select('prediction','label').rdd.map(lambda lp: (lp.prediction, lp.label))
gbtmetrics = BinaryClassificationMetrics(gbt_labelPredictionSet)
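
# Illustrative follow-up (not in the original snippet): the metrics object built
# above exposes the areaUnderROC and areaUnderPR properties directly.
print("GBT areaUnderROC: %s" % gbtmetrics.areaUnderROC)
print("GBT areaUnderPR: %s" % gbtmetrics.areaUnderPR)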

#Save GBT Model to Disk
gbtmodel.write().overwrite().save("models/spark/gbt")


!rm -r -f models/spark/gbt
!rm -r -f models/spark_gbt.tar
!hdfs dfs -get models/spark/gbt 
!hdfs dfs -get models/
!tar -cvf models/spark_gbt.tar models/spark/gbt

cdsw.track_file("models/spark_gbt.tar")

spark.stop()
Example #5
model.l1_ratio_
cdsw.track_metric("l1_ratio", model.l1_ratio_)

model.alpha_
cdsw.track_metric("alpha", model.alpha_)

# ## Model coefficients
model.intercept_
cdsw.track_metric("intercept", model.intercept_)

zip(feature_cols, model.coef_)
for i in range(0, len(feature_cols)):
    cdsw.track_metric(feature_cols[i], model.coef_[i])

# ## r squared scores
r_train = model.score(train_features, train_labels)
r_train
cdsw.track_metric("r_train", r_train)

r_test = model.score(test_features, test_labels)
r_test
cdsw.track_metric("r_test", r_test)

# ## Persist model during experiment
filename = 'bikeshare_model.pkl'
joblib.dump(model, filename)
cdsw.track_file(filename)

#timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')
#joblib.dump(model, 'bikeshare_model_' + timestamp + '.pkl')
Example #6
filtered, vocab = cutoff(sorted_vocab, tokenized, freqs)

sentiments, messages = balance_classes(sentiments, filtered)

token_ids = convert_to_token_ids(messages, vocab)

print(type(token_ids))
print(len(token_ids) * 0.5)
split_idx = int(len(token_ids) * 0.5)

split_frac = 0.98  # for small data
#split_frac = 0.8 # for big data
train_features, train_labels, tf, tl, vf, vl = split_data(
    token_ids, sentiments, vocab, split_frac=split_frac)

model = create_model(train_features, train_labels, vocab)

acc, loss = train_model(model, train_features, train_labels,
                        print_every=1)  # for small data
#acc, loss = train_model(model, train_features, train_labels) # for big data

import cdsw
model_filename = "model.torch"
vocab_filename = "vocab.pickle"
cdsw.track_file(model_filename)
cdsw.track_file(vocab_filename)

cdsw.track_metric("Accuracy", acc)
cdsw.track_metric("Loss", loss)
Example #7
print("test", test_score)    
print(classification_report(y_test, pipe.predict(X_test)))
data[labels.name + ' probability'] = pipe.predict_proba(X)[:, 1]


# Create LIME Explainer
feature_names = list(ce.columns_)
categorical_features = list(ce.cat_columns_ix_.values())
categorical_names = {i: ce.classes_[c]
                     for c, i in ce.cat_columns_ix_.items()}
class_names = ['No ' + labels.name, labels.name]
explainer = LimeTabularExplainer(ce.transform(data),
                                 feature_names=feature_names,
                                 class_names=class_names,
                                 categorical_features=categorical_features,
                                 categorical_names=categorical_names)    
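
# A hypothetical usage sketch (not in the original snippet): the explainer can
# explain an individual encoded row; `row` and `exp` are illustrative names only.
# row = ce.transform(data)[0]
# exp = explainer.explain_instance(row, pipe.predict_proba, num_features=5)
# exp.as_list()  # (feature, weight) pairs behind the prediction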


# Create and save the combined Logistic Regression and LIME Explained Model.
explainedmodel = ExplainedModel(data=data, labels=labels, model_name='telco_linear',
                                categoricalencoder=ce, pipeline=pipe,
                                explainer=explainer, data_dir=data_dir)
explainedmodel.save()


# If running as an experiment, this will track the metrics and add the model trained in this training run to the experiment history.
cdsw.track_metric("train_score",round(train_score,2))
cdsw.track_metric("test_score",round(test_score,2))
cdsw.track_metric("model_path",explainedmodel.model_path)
cdsw.track_file(explainedmodel.model_path)
Example #8
#Return Parameters to CDSW User Interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
#cdsw.track_metric("Layers",param_BestModel_Layers)
#cdsw.track_metric("maxIter",param_BestModel_Iter)
cdsw.track_metric("cvFolds",user_mlp_param_numFolds)


from pyspark.mllib.evaluation import BinaryClassificationMetrics
labelPredictionSet = mlp_predictions.select('prediction','label').rdd.map(lambda lp: (lp.prediction, lp.label))
metrics = BinaryClassificationMetrics(labelPredictionSet)
#metrics.areaUnderPR


#Save MLP Model to Disk
mlpmodel.write().overwrite().save("models/spark/mlp")

!rm -r -f models/spark/mlp
!rm -r -f models/spark_mlp.tar
!hdfs dfs -get models/spark/mlp 
!hdfs dfs -get models/
!tar -cvf models/spark_mlp.tar models/spark/mlp

cdsw.track_file("models/spark_mlp.tar")

spark.stop()
Example #9
#Retrieving Parameters from the Best SVM Model
param_BestModel_Iter = bestSVMModel._java_obj.getMaxIter()

#Return Parameters to CDSW User Interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("maxIter",param_BestModel_Iter)
cdsw.track_metric("cvFolds",user_svm_param_numFolds)


from pyspark.mllib.evaluation import BinaryClassificationMetrics
labelPredictionSet = svm_predictions.select('prediction','label').rdd.map(lambda lp: (lp.prediction, lp.label))
metrics = BinaryClassificationMetrics(labelPredictionSet)

#Save SVM Model to Disk
svmModel.write().overwrite().save("models/spark/svm")

!rm -r -f models/spark/svm
!rm -r -f models/spark_svm.tar
!hdfs dfs -get models/spark/svm
!hdfs dfs -get models/
!tar -cvf models/spark_svm.tar models/spark/svm

cdsw.track_file("models/spark_svm.tar")

spark.stop()
Example #10
results = dict()
results['best_score'] = gs.best_score_
results['n_splits'] = gs.n_splits_
#results['scorer'] = gs.scorer_
#results['best_parameters'] = gs.best_params_

results_df = pd.DataFrame(results, index=[0])

#Todo - do this more rigorously following best practices. Pull more metrics out.
#Todo - could pass more scoring criteria from the calling script dynamically, even creating custom scoring functions (see the sketch below).
print("Best Accuracy Score")
print(results)
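
# A minimal sketch of the TODO above (illustrative only): GridSearchCV accepts a
# dict of scorers, so extra or custom scoring criteria could be passed in from the
# calling script. `base_estimator`, `param_grid`, `X` and `y` are placeholder names.
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV
extra_scoring = {"accuracy": "accuracy", "f1": make_scorer(f1_score)}
# gs_multi = GridSearchCV(base_estimator, param_grid, scoring=extra_scoring,
#                         refit="accuracy", cv=gs.n_splits_)
# gs_multi.fit(X, y)
# pd.DataFrame(gs_multi.cv_results_)  # pull more metrics out, per the TODO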

run_time_suffix = datetime.datetime.now()
run_time_suffix = run_time_suffix.strftime("%d%m%Y%H%M%S")

dump(clf, "models/clf_" + run_time_suffix + ".joblib")

# Tracking the metric here; if the metric needs to be reused later, it may be better to write it to a file.
cdsw.track_metric("Best Accuracy", results['best_score'])

# Tracking metric in db
import sqlite3
conn = sqlite3.connect('newdb.db')
results_df.to_sql(name='experiments', if_exists='append', con=conn)
#pd.read_sql('select * from experiments', conn)

cdsw.track_file('newdb.db')
Example #11
                                    numTrees=param_numTrees, maxDepth=param_maxDepth, maxBins=500, predictionCol="prediction")
labelConv = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelInd.labels)

pipeline = Pipeline(stages=[ucInd, oInd, dInd, wInd, labelInd, encoder, assembler, classifier, labelConv])
(train, test) = df.randomSplit([0.7, 0.3])
model = pipeline.fit(train)
predictions = model.transform(test)
predictions.head()

# ## Model Evaluation
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="delayCatInd", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

rfModel = model.stages[-1]
print(rfModel)  # summary only

cdsw.track_metric("accuracy", accuracy)

model.write().overwrite().save("flightdelay/rfcModel")

!rm -r -f /home/cdsw/models/spark/rfcModel
!rm -r -f /home/cdsw/models/spark_rfc.tar
!hdfs dfs -get models/spark/rfcModel /home/cdsw/models/spark/rfcModel
!tar -cvf /home/cdsw/models/spark_rfc.tar /home/cdsw/models/spark

cdsw.track_file("/home/cdsw/models/spark_rfc.tar")

spark.stop()
Example #12
ID.build_vocab(train_data)
# TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors="glove.twitter.27B.25d",
                 unk_init=torch.Tensor.normal_)
SENTIMENT.build_vocab(train_data)
AIRLINE.build_vocab(train_data)

print(TEXT.vocab.freqs.most_common(20))
# Save this - needed for model prediction
outfile = open(model_dir + 'vocab_index.pkl', 'wb')
pickle.dump(TEXT.vocab.stoi, outfile, -1)
outfile.close()
cdsw.track_file(model_dir + 'vocab_index.pkl')

# check labels, 0 is negative, 1 is positive
print(SENTIMENT.vocab.stoi)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 32

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=lambda x: x.text,  # sort by text
    batch_size=BATCH_SIZE,
    device=device)

Example #13
ID.build_vocab(train_data)
# TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors="glove.twitter.27B.25d",
                 unk_init=torch.Tensor.normal_)
SENTIMENT.build_vocab(train_data)
AIRLINE.build_vocab(train_data)

print(TEXT.vocab.freqs.most_common(20))
# Save this - needed for model prediction
outfile = open(model_dir + 'vocab_index.pkl', 'wb')
pickle.dump(TEXT.vocab.stoi, outfile, -1)
outfile.close()
cdsw.track_file(model_dir + 'vocab_index.pkl')

# check labels, 0 is negative, 1 is positive
print(SENTIMENT.vocab.stoi)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 32

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=lambda x: x.text,  # sort by text
    batch_size=BATCH_SIZE,
    device=device)
Example #14
# Read and prepare data:
flights_pd = pd.read_csv('data/flights.csv')

flights_clean_pd = flights_pd \
  .filter(['dep_delay', 'arr_delay']) \
  .dropna() \
  .loc[flights_pd.dep_delay < 400, :]

features = flights_clean_pd.filter(['dep_delay'])
targets = flights_clean_pd.filter(['arr_delay'])

train_x, test_x, train_y, test_y = train_test_split(features,
                                                    targets,
                                                    test_size=0.2)

# Specify the model and train it using the training
# sample:
model = LinearRegression(fit_intercept=fit_intercept)
model.fit(train_x, train_y)

# Evaluate the model using the test sample. Track the
# value of R-squared (rounded to four digits after the
# decimal) to compare experiment results:
r2 = model.score(test_x, test_y)
cdsw.track_metric('R_squared', round(r2, 4))

# Save the model for future use:
dump(model, 'model.joblib')
cdsw.track_file('model.joblib')
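
# A short follow-up sketch (not in the original snippet): reload the tracked
# artifact with joblib and confirm it reproduces the R-squared computed above.
from joblib import load
reloaded = load('model.joblib')
print(round(reloaded.score(test_x, test_y), 4))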
Example #15

#This code will train a model based on arguments passed to the Experiments feature of CML.

#args for experiment: wine_hiveS3 gb s3a://ml-field/demo/wine/ us-west-2


import os
import sys

from churnexplainer import train
from churnexplainer.data import dataset, load_dataset
import cdsw

#os.environ['MODEL_TYPE'] = 'gb'
os.environ['MODEL_TYPE'] = sys.argv[1]

#os.environ['DATASET'] = 'wine_hiveS3'
os.environ['DATASET'] = sys.argv[2]

#os.environ['S3_BUCKET'] = 's3a://ml-field/demo/wine/'
os.environ['S3_BUCKET'] = sys.argv[3]

#os.environ['S3_BUCKET_REGION'] = 'us-west-2'
os.environ['S3_BUCKET_REGION'] = sys.argv[4]


train_score, test_score, model_path = train.experiment_and_save()

cdsw.track_metric("train_score",round(train_score,2))
cdsw.track_metric("test_score",round(test_score,2))
cdsw.track_metric("model_path",model_path)
cdsw.track_file(model_path)
Example #16
import sys
import cdsw

args = len(sys.argv) - 1
sum = 0
x = 1

while (args >= x):
    print("Argument %i:%s" % (x, sys.argv[x]))
    sum = sum + int(sys.argv[x])
    x = x + 1

print("Sum of the numbers is: %i." % sum)

#Track metric
cdsw.track_metric("sum", sum)

#Track file
file = open('add_result.txt', 'w')
file.write("Sum of the numbers is: %i." % sum)
file.close()
cdsw.track_file('add_result.txt')
Example #17
cdsw.track_metric("numTrees", param_numTrees)
cdsw.track_metric("maxDepth", param_maxDepth)
cdsw.track_metric("impurity", param_impurity)

randF.fit(pdTrain[features], pdTrain['label'])

predictions = randF.predict(pdTest[features])

#temp = randF.predict_proba(pdTest[features])

pd.crosstab(pdTest['label'],
            predictions,
            rownames=['Actual'],
            colnames=['Prediction'])

list(zip(pdTrain[features], randF.feature_importances_))

y_true = pdTest['label']
y_scores = predictions
auroc = roc_auc_score(y_true, y_scores)
ap = average_precision_score(y_true, y_scores)
print(auroc, ap)

cdsw.track_metric("auroc", auroc)
cdsw.track_metric("ap", ap)

pickle.dump(randF, open("models/sklearn_rf.pkl", "wb"))

cdsw.track_file("models/sklearn_rf.pkl")
Example #18
auroc = evaluator.evaluate(mlp_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(mlp_predictions, {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s" % (auroc, aupr)

f1score = multiEvaluator.evaluate(mlp_predictions, {multiEvaluator.metricName: "f1"})
weightedPrecision = multiEvaluator.evaluate(mlp_predictions, {multiEvaluator.metricName: "weightedPrecision"})
weightedRecall = multiEvaluator.evaluate(mlp_predictions, {multiEvaluator.metricName: "weightedRecall"})

"The F1 score: %s the Weighted Precision: %s the Weighted Recall is %s" % (f1score, weightedPrecision, weightedRecall)

#Return Parameters to CDSW User Interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("cvFolds",user_mlp_param_numFolds)

#Save MLP Model to Disk
mlpmodel.write().overwrite().save("models/spark/stackedmlp")

!rm -r -f models/spark/stackedmlp
!rm -r -f models/spark_stackedmlp.tar
!hdfs dfs -get models/spark/stackedmlp 
!hdfs dfs -get models/
!tar -cvf models/spark_stackedmlp.tar models/spark/stackedmlp

cdsw.track_file("models/spark_stackedmlp.tar")

spark.stop()
Example #19
# # Step 4 Evaluate Model
from pyspark.ml.evaluation import BinaryClassificationEvaluator

#Predict on Test Data
predictions = model.transform(testData)

#Evaluate
evaluator = BinaryClassificationEvaluator()
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("The AUROC is {:f} and the AUPR is {:f}".format(auroc, aupr))

#Track metric value in CDSW
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)

# # Save Model for deployment
#model.write().overwrite().save("models/spark")

#bring model back into project and tar it
#!rm -rf models/
#!mkdir models
#!hdfs dfs -get ./models/spark models/
#!tar -cvf models/spark_rf.tar models/spark
#!rm -r -f models/spark
#!mv models/spark_rf.tar spark_rf.tar

cdsw.track_file("spark_rf.tar")

spark.stop()
Example #20
sortedFeaturRankings = sorted(FeautureRankings, reverse=True)

"Random Forest - Feature Rankings Sorted By Importance Value %s" % (sortedFeaturRankings)
"When summed together, the values equal 1.0"

#Return Parameters to CDSW User Interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("numTrees",param_BestModel_NumTrees)

from pyspark.mllib.evaluation import BinaryClassificationMetrics
rf_labelPredictionSet = rf_predictions.select('prediction','label').rdd.map(lambda lp: (lp.prediction, lp.label))
rfmetrics = BinaryClassificationMetrics(rf_labelPredictionSet)

#Save Vanilla RF Model to Disk

rf_model.write().overwrite().save("models/spark/vanilla")

!rm -r -f models/spark
!rm -r -f models/spark_rf_vanilla.tar
!hdfs dfs -get models/spark 
!hdfs dfs -get models/
!tar -cvf models/spark_rf_vanilla.tar models/spark/vanilla

cdsw.track_file("models/spark_rf_vanilla.tar")

spark.stop()
Example #21
"When summed together, the values equal 1.0"

#Return Parameters to CDSW User Interface
cdsw.track_metric("auroc", auroc)
cdsw.track_metric("aupr", aupr)
cdsw.track_metric("F1", f1score)
cdsw.track_metric("WeightedPrecision", weightedPrecision)
cdsw.track_metric("weightedRecall", weightedRecall)
cdsw.track_metric("numTrees",param_BestModel_NumTrees)
cdsw.track_metric("maxDepth",param_BestModel_Depth)
cdsw.track_metric("impurity",param_BestModel_impurity)
cdsw.track_metric("cvFolds",user_rf_param_numFolds)


from pyspark.mllib.evaluation import BinaryClassificationMetrics
labelPredictionSet = rf_predictions.select('prediction','label').rdd.map(lambda lp: (lp.prediction, lp.label))
metrics = BinaryClassificationMetrics(labelPredictionSet)

#Save RF Model to Disk
rfmodel.write().overwrite().save("models/spark/rf")

!rm -r -f models/spark/rf
!rm -r -f models/spark_rf.tar
!hdfs dfs -get models/spark/rf 
!hdfs dfs -get models/
!tar -cvf models/spark_rf.tar models/spark/rf

cdsw.track_file("models/spark_rf.tar")

spark.stop()
Example #22
cdsw.track_metric("numTrees",param_numTrees)
cdsw.track_metric("maxDepth",param_maxDepth)
cdsw.track_metric("impurity",param_impurity)

# Fit and Predict
randF.fit(pdTrain[features], pdTrain['label'])
predictions=randF.predict(pdTest[features])

#temp = randF.predict_proba(pdTest[features])

pd.crosstab(pdTest['label'], predictions, rownames=['Actual'], colnames=['Prediction'])

list(zip(pdTrain[features], randF.feature_importances_))


y_true = pdTest['label']
y_scores = predictions
auroc = roc_auc_score(y_true, y_scores)
ap = average_precision_score(y_true, y_scores)
print(auroc, ap)

cdsw.track_metric("auroc", auroc)
cdsw.track_metric("ap", ap)

pickle.dump(randF, open("iot_model.pkl","wb"))

cdsw.track_file("iot_model.pkl")

time.sleep(15)
print("Slept for 15 seconds.")
Example #23
cdsw.track_metric("maxDepth", param_maxDepth)
cdsw.track_metric("impurity", param_impurity)

randF.fit(pdTrain[features], pdTrain['label'])

predictions = randF.predict(pdTest[features])

#temp = randF.predict_proba(pdTest[features])

pd.crosstab(pdTest['label'],
            predictions,
            rownames=['Actual'],
            colnames=['Prediction'])

list(zip(pdTrain[features], randF.feature_importances_))

y_true = pdTest['label']
y_scores = predictions
auroc = roc_auc_score(y_true, y_scores)
ap = average_precision_score(y_true, y_scores)
print(auroc, ap)

cdsw.track_metric("auroc", auroc)
cdsw.track_metric("ap", ap)

pickle.dump(randF, open("sklearn_rf.pkl", "wb"))

cdsw.track_file("sklearn_rf.pkl")

time.sleep(20)
print("Slept for 20 seconds.")
Example #24
(start, end) = (start, end) if start < end else (end, start)
split_point = find_split_point(start, end, precision_rate(start),
                               precision_rate(end))
print('\nSplit point:', split_point)

# Update the deployed model split point
import subprocess
subprocess.call([
    "sed", "-i",
    's/split_point=.*/split_point=' + str(round(split_point, 3)) + "/ ",
    "/home/cdsw/4_model_deploy.py"
])

# model precision rate

precision1 = (loss1 < split_point).sum().item() / float(len(loss1))
precision2 = (loss2 > split_point).sum().item() / float(len(loss2))
print('Precision rate for normal cases:', precision1)
print('Precision rate for fraud cases:', precision2)
print('Overall precision:', (precision1 + precision2) / 2)

torch.save(model.state_dict(), 'model/creditcard-fraud.model')

# track experiment metrics
# If running as an experiment, this will track the metrics and add the model trained in this
# training run to the experiment history.
cdsw.track_metric("split_point", round(split_point, 2))
cdsw.track_metric("precision", round(((precision1 + precision2) / 2), 2))
cdsw.track_file('model/creditcard-fraud.model')
cdsw.track_file('model/cc_scaler.pkl')
Example #25
# __________________________________________________________________________________________________

# # 11. MODEL EVALUATION
# Predict Model
pred_rf_test = rf.predict(X_test)

# => Make predictions on the test data
# __________________________________________________________________________________________________

#cdsw.track_metric("accuracy", accuracy_score(y_test, pred_rf_test))
print(accuracy_score(y_test, pred_rf_test))

# => Evaluate the model by checking the resulting accuracy score
# __________________________________________________________________________________________________

probs = rf.predict_proba(X_test)
probs = probs[:, 1]

#cdsw.track_metric("auc", roc_auc_score(y_test, probs))
print(roc_auc_score(y_test, probs))

# => Evaluate the model by checking the Area Under the Curve (AUC) value
# __________________________________________________________________________________________________

pickle.dump(rf, open("sklearn_rf_large.pkl", "wb"))

cdsw.track_file("sklearn_rf_large.pkl")

# => Save model
# __________________________________________________________________________________________________
Example #26
features = [
    "intl_plan_indexed", "account_length", "number_vmail_messages",
    "total_day_calls", "total_day_charge", "total_eve_calls",
    "total_eve_charge", "total_night_calls", "total_night_charge",
    "total_intl_calls", "total_intl_charge", "number_customer_service_calls"
]
randF = RandomForestClassifier(n_jobs=10, n_estimators=args.trees)
randF.fit(pdTrain[features], pdTrain['label'])
predictions = randF.predict(pdTest[features])

## Feature Importance
list(zip(pdTrain[features], randF.feature_importances_))

## AUROC
y_true = pdTest['label']
y_scores = predictions
auroc = roc_auc_score(y_true, y_scores)
ap = average_precision_score(y_true, y_scores)
print(auroc, ap)

cdsw.track_metric("auroc", auroc)
cdsw.track_metric("ap", ap)

## Serialize and track model
pickle.dump(randF, open("model.pkl", "wb"))
cdsw.track_file("model.pkl")

## Stop Spark

spark.stop()