def test_multi_output_predict_proba():
    """Check ``predict_proba`` on ``MultiOutputClassifier``.

    First wraps a grid search whose scorer rewards candidates exposing
    ``predict_proba`` and calls ``predict_proba`` on the fitted wrapper; then
    wraps a default (hinge-loss) ``SGDClassifier`` and expects a
    ``ValueError``.
    """
    def custom_scorer(estimator, X, y):
        # Scores a candidate 1.0 when it exposes predict_proba, 0.0 otherwise.
        return 1.0 if hasattr(estimator, "predict_proba") else 0.0

    loss_grid = {'loss': ('hinge', 'log', 'modified_huber')}
    search = GridSearchCV(
        SGDClassifier(random_state=1, max_iter=5, tol=1e-3),
        param_grid=loss_grid,
        scoring=custom_scorer,
        cv=3,
        error_score=np.nan,
    )
    with_proba = MultiOutputClassifier(search)
    with_proba.fit(X, y)
    with_proba.predict_proba(X)

    # SGDClassifier defaults to loss='hinge', which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method.
    without_proba = MultiOutputClassifier(
        SGDClassifier(random_state=1, max_iter=5, tol=1e-3))
    without_proba.fit(X, y)
    expected_message = (
        "The base estimator should implement predict_proba method")
    with pytest.raises(ValueError, match=expected_message):
        without_proba.predict_proba(X)
def test_multi_output_predict_proba():
    """Check ``predict_proba`` on ``MultiOutputClassifier``.

    First wraps a grid search whose scorer rewards candidates exposing
    ``predict_proba`` and calls ``predict_proba`` on the fitted wrapper; then
    wraps a default (hinge-loss) ``SGDClassifier`` and expects an
    ``AttributeError``.
    """
    def custom_scorer(estimator, X, y):
        # Scores a candidate 1.0 when it exposes predict_proba, 0.0 otherwise.
        return 1.0 if hasattr(estimator, "predict_proba") else 0.0

    loss_grid = {'loss': ('hinge', 'log', 'modified_huber')}
    search = GridSearchCV(
        SGDClassifier(random_state=1, max_iter=5),
        param_grid=loss_grid,
        scoring=custom_scorer,
        cv=3,
    )
    with_proba = MultiOutputClassifier(search)
    with_proba.fit(X, y)
    with_proba.predict_proba(X)

    # SGDClassifier defaults to loss='hinge', which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method.
    without_proba = MultiOutputClassifier(
        SGDClassifier(random_state=1, max_iter=5))
    without_proba.fit(X, y)
    expected_message = (
        "The base estimator should implement predict_proba method")
    with pytest.raises(AttributeError, match=expected_message):
        without_proba.predict_proba(X)
def test_multi_output_predict_proba():
    """Check ``predict_proba`` on ``MultiOutputClassifier``.

    First wraps a grid search whose scorer rewards candidates exposing
    ``predict_proba`` and calls ``predict_proba`` on the fitted wrapper; then
    wraps a default (hinge-loss) ``SGDClassifier`` and expects an
    ``AttributeError`` mentioning ``loss='hinge'``.
    """
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, loss="log_loss")
    # Use the same spelling as the estimator above: "log_loss" is the current
    # name of the logistic loss, "log" is the legacy alias.
    param = {"loss": ("hinge", "log_loss", "modified_huber")}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        # Scores a candidate 1.0 when it exposes predict_proba, 0.0 otherwise.
        if hasattr(estimator, "predict_proba"):
            return 1.0
        return 0.0

    grid_clf = GridSearchCV(
        sgd_linear_clf, param_grid=param, scoring=custom_scorer, cv=3)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)
    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "probability estimates are not available for loss='hinge'"
    with pytest.raises(AttributeError, match=err_msg):
        multi_target_linear.predict_proba(X)
def test_multi_output_classifier(self):
    """Convert a fitted MultiOutputClassifier to ONNX and compare outputs.

    The conversion is repeated for three option sets (global ``zipmap``
    off, then per-model ``nocl`` True and False); each time the ONNX labels
    and per-output probabilities must match the scikit-learn predictions and
    the graph must not contain a ZipMap node.
    """
    X, y = make_multilabel_classification(n_classes=3, random_state=0)
    X = X.astype(numpy.float32)
    clf = MultiOutputClassifier(LogisticRegression()).fit(X, y)

    option_sets = (
        {'zipmap': False},
        # check option nocl=True
        {id(clf): {'nocl': True, 'zipmap': False}},
        # check option nocl=False
        {id(clf): {'nocl': False, 'zipmap': False}},
    )
    for options in option_sets:
        onx = to_onnx(clf, X[:1], target_opset=TARGET_OPSET,
                      options=options)
        self.assertNotIn("ZipMap", str(onx))
        sess = InferenceSession(onx.SerializeToString())
        got_labels, got_probas = sess.run(None, {'X': X})[:2]
        want_labels = clf.predict(X)
        want_probas = clf.predict_proba(X)
        assert_almost_equal(want_labels, got_labels)
        self.assertEqual(len(want_probas), len(got_probas))
        for want, got in zip(want_probas, got_probas):
            assert_almost_equal(want, got, decimal=5)
def test_multiclass_multioutput_estimator_predict_proba():
    """Compare multiclass-multioutput ``predict_proba`` with stored values.

    Fits one-vs-rest liblinear logistic regressions on a 5x5 random feature
    matrix with a 2-class and a 3-class target column, then checks each
    output's probabilities against hard-coded reference arrays.
    """
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels: first column has 2 classes, second has 3 classes
    Y = np.column_stack([
        np.array(['b', 'a', 'a', 'b', 'a']),
        np.array(['d', 'e', 'f', 'e', 'd']),
    ])

    base = LogisticRegression(
        multi_class='ovr', solver='liblinear', random_state=seed)
    clf = MultiOutputClassifier(base)
    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    y_actual = [
        np.array([[0.23481764, 0.76518236],
                  [0.67196072, 0.32803928],
                  [0.54681448, 0.45318552],
                  [0.34883923, 0.65116077],
                  [0.73687069, 0.26312931]]),
        np.array([[0.5171785, 0.23878628, 0.24403522],
                  [0.22141451, 0.64102704, 0.13755846],
                  [0.16751315, 0.18256843, 0.64991843],
                  [0.27357372, 0.55201592, 0.17441036],
                  [0.65745193, 0.26062899, 0.08191907]]),
    ]

    for position, expected in enumerate(y_actual):
        assert_almost_equal(y_result[position], expected)
def train_and_predict(X, y, train_ratio=0.2, n_trials=10, random_state=None):
    """Score a per-label logistic regression over repeated train/test splits.

    Each trial splits ``X``/``y`` with ``iterative_train_test_split``, fits a
    ``MultiOutputClassifier`` of ``LogisticRegressionCV`` on the training
    part, and records micro/macro ROC AUC, micro F1 and the mean selected
    ``C`` on the test part.

    Parameters
    ----------
    X, y
        Features and multi-label targets handed to
        ``iterative_train_test_split``. The split label parts are read
        through ``.A`` -- assumes a type exposing that attribute (e.g. a
        scipy sparse matrix); TODO confirm against the caller.
    train_ratio : float
        Fraction of the data used for training (``test_size`` is
        ``1 - train_ratio``).
    n_trials : int
        Number of independent splits to evaluate.
    random_state : int or None
        Seed for NumPy's global RNG, applied once before the first trial.

    Returns
    -------
    tuple
        ``(mean micro AUC, mean macro AUC, mean C, std micro AUC,
        mean micro F1, std micro F1)``.
    """
    # Seed once, before the loop. Re-seeding with the same value at the top of
    # every trial resets the global NumPy RNG to the same state each time, so
    # any NumPy-driven randomness repeats and the trials are not independent.
    np.random.seed(random_state)

    micro_aucs, macro_aucs, mean_cs, micro_f1s = [], [], [], []
    for _ in range(n_trials):
        X_train, y_train, X_test, y_test = iterative_train_test_split(
            X, y, test_size=1 - train_ratio)
        # max_iter must be an integer iteration count, not the float 1e4.
        clf = MultiOutputClassifier(
            LogisticRegressionCV(max_iter=10000, class_weight='balanced'))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            clf.fit(X_train, y_train.A)

        y_true = y_test.A
        # predict_proba yields one array per label; column 1 is selected for
        # every label and the result is transposed to (samples, labels).
        y_score = np.array(clf.predict_proba(X_test))[:, :, 1].T
        micro_aucs.append(roc_auc_score(y_true, y_score, average="micro"))
        macro_aucs.append(roc_auc_score(y_true, y_score, average="macro"))
        micro_f1s.append(
            f1_score(y_true, clf.predict(X_test), average="micro"))
        mean_cs.append(
            np.mean([estimator.C_.mean() for estimator in clf.estimators_]))

    return (np.mean(micro_aucs), np.mean(macro_aucs), np.mean(mean_cs),
            np.std(micro_aucs), np.mean(micro_f1s), np.std(micro_f1s))
def test_multiclass_multioutput_estimator_predict_proba():
    """Compare multiclass-multioutput ``predict_proba`` with stored values.

    Fits default-solver logistic regressions on a 5x5 random feature matrix
    with a 2-class and a 3-class target column, then checks each output's
    probabilities against hard-coded reference arrays.
    """
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels: first column has 2 classes, second has 3 classes
    Y = np.column_stack([
        np.array(['b', 'a', 'a', 'b', 'a']),
        np.array(['d', 'e', 'f', 'e', 'd']),
    ])

    clf = MultiOutputClassifier(LogisticRegression(random_state=seed))
    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    y_actual = [
        np.array([[0.23481764, 0.76518236],
                  [0.67196072, 0.32803928],
                  [0.54681448, 0.45318552],
                  [0.34883923, 0.65116077],
                  [0.73687069, 0.26312931]]),
        np.array([[0.5171785, 0.23878628, 0.24403522],
                  [0.22141451, 0.64102704, 0.13755846],
                  [0.16751315, 0.18256843, 0.64991843],
                  [0.27357372, 0.55201592, 0.17441036],
                  [0.65745193, 0.26062899, 0.08191907]]),
    ]

    for position, expected in enumerate(y_actual):
        assert_almost_equal(y_result[position], expected)
def test_multi_output_classification():
    """Check fit, predict and predict_proba of a multi-output forest.

    The wrapper's predictions and probabilities must have the expected
    shapes, agree with each other through ``argmax``, and match a clone of
    the forest fitted separately on each of the first three target columns.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)

    predictions = multi_target_forest.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    per_output_proba = multi_target_forest.predict_proba(X)
    assert len(per_output_proba) == n_outputs
    for class_probabilities in per_output_proba:
        assert_equal((n_samples, n_classes), class_probabilities.shape)

    stacked = np.dstack(per_output_proba)
    assert_array_equal(np.argmax(stacked, axis=1), predictions)

    # train the forest with each column and assert that predictions are equal
    for column in range(3):
        single_forest = clone(forest)  # create a clone with the same state
        single_forest.fit(X, y[:, column])
        assert_equal(list(single_forest.predict(X)),
                     list(predictions[:, column]))
        assert_array_equal(list(single_forest.predict_proba(X)),
                           list(per_output_proba[column]))
class Classifier():
    """Multi-label classifier: one random forest per label."""

    def __init__(self):
        # Multi-label classifier built from a single-label random forest.
        forest = RandomForestClassifier(n_estimators=100, random_state=1)
        self.clf = MultiOutputClassifier(forest, n_jobs=-1)

    def fit(self, X, y):
        """Fit the wrapped multi-output classifier on ``X`` and ``y``."""
        self.clf.fit(X, y)

    def predict(self, X):
        """Return the predicted labels for ``X`` as a NumPy array."""
        return np.array(self.clf.predict(X))

    def predict_proba(self, X):
        """Compute the probabilities for each label.

        Important: this method needs to return a 2D array with 2 columns per
        label, laid out side by side.

        ``self.clf.predict_proba`` returns one array per label. An array is
        kept as-is when it already has 2 columns; otherwise it is padded on
        the right with a zero block of its own shape. The padding is applied
        to every label, including the first one, so the column layout is the
        same regardless of which label is short.
        """
        per_label = self.clf.predict_proba(X)
        blocks = []
        for proba in per_label:
            if proba.shape[1] == 2:
                blocks.append(proba)
            else:
                blocks.append(np.hstack((proba, np.zeros_like(proba))))
        # Single concatenation instead of growing the result label by label.
        return np.hstack(blocks)
class MultilabelClassifier_SVM(Classifier):
    """Multi-label classifier that trains one probabilistic SVC per label."""

    kernel = None  # SVM kernel type
    model = None  # fitted MultiOutputClassifier, set by train()

    def __init__(self, kernel='linear'):
        self.kernel = kernel

    def train(self, Train_X_Tfidf, Train_Y, Test_X_Tfidf=None, Test_Y=None):
        """Fit the per-label SVCs on the training data and return the model.

        ``Test_X_Tfidf`` and ``Test_Y`` are accepted but not used here.
        """
        # Fit the training dataset on the classifier
        self.model = MultiOutputClassifier(
            SVC(C=1.0, kernel=self.kernel, degree=3, gamma='auto',
                probability=True))
        self.model.fit(Train_X_Tfidf, Train_Y)
        return self.model

    def predict(self, df):
        """Return ``(predictionMatrix, confidenceList)`` for the rows of ``df``.

        ``predictionMatrix`` is the rounded column-1 probability of every
        label, transposed to one row per sample. ``confidenceList`` is, per
        sample, the average over labels of the largest class probability.
        """
        # predict_proba yields one array per label; stack them once into a
        # single 3-D array (label, sample, class) and reuse it below.
        probabilities = np.array(self.model.predict_proba(df))

        # np.round replaces np.round_, which was removed in NumPy 2.0.
        predictionMatrix = np.round(probabilities[:, :, 1]).T

        # Highest class probability per (label, sample), averaged over labels.
        probabilityMatrix = np.amax(probabilities, axis=2)
        confidenceList = np.average(probabilityMatrix.T, axis=1)
        return predictionMatrix, confidenceList
def test_multi_output_classification():
    """Check fit, predict and predict_proba of a multi-output forest.

    The wrapper's predictions and probabilities must have the expected
    shapes, agree with each other through ``argmax``, and match a clone of
    the forest fitted separately on each of the first three target columns.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)

    predictions = multi_target_forest.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    per_output_proba = multi_target_forest.predict_proba(X)
    assert len(per_output_proba) == n_outputs
    for class_probabilities in per_output_proba:
        assert_equal((n_samples, n_classes), class_probabilities.shape)

    stacked = np.dstack(per_output_proba)
    assert_array_equal(np.argmax(stacked, axis=1), predictions)

    # train the forest with each column and assert that predictions are equal
    for column in range(3):
        single_forest = clone(forest)  # create a clone with the same state
        single_forest.fit(X, y[:, column])
        assert_equal(list(single_forest.predict(X)),
                     list(predictions[:, column]))
        assert_array_equal(list(single_forest.predict_proba(X)),
                           list(per_output_proba[column]))
def train_and_predict(train_data, test_data, train_labels, test_labels):
    """Fit a per-label logistic regression and score it on the test split.

    Parameters
    ----------
    train_data, test_data
        Feature matrices for fitting and scoring.
    train_labels, test_labels
        Multi-label targets, read through ``.A`` -- assumes a type exposing
        that attribute (e.g. a scipy sparse matrix); TODO confirm against the
        caller.

    Returns
    -------
    tuple
        ``(micro ROC AUC, macro ROC AUC, mean selected C)``.
    """
    # max_iter must be an integer iteration count, not the float 1e4.
    clf = MultiOutputClassifier(
        LogisticRegressionCV(max_iter=10000, class_weight='balanced'))
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        clf.fit(train_data, train_labels.A)

    # predict_proba yields one array per label; column 1 is selected for every
    # label and the result is transposed to (samples, labels).
    y_pred = np.array(clf.predict_proba(test_data))[:, :, 1].T
    y_true = test_labels.A
    mi = roc_auc_score(y_true, y_pred, average="micro")
    ma = roc_auc_score(y_true, y_pred, average="macro")
    c = np.mean([estimator.C_.mean() for estimator in clf.estimators_])
    return mi, ma, c
def test_multi_output_delegate_predict_proba():
    """Check how ``predict_proba`` is delegated to the underlying estimator.

    The attribute must be visible before and after ``fit`` when the base
    estimator has it, and absent (raising ``AttributeError`` on access) when
    the base estimator does not.
    """
    def assert_predict_proba_missing(estimator):
        # Both the hasattr probe and the direct call must report the
        # missing attribute on the wrapped LinearSVC.
        assert not hasattr(estimator, "predict_proba")
        expected = "'LinearSVC' object has no attribute 'predict_proba'"
        with pytest.raises(AttributeError, match=expected):
            estimator.predict_proba(X)

    # A base estimator with `predict_proba` should expose the method even
    # before fit
    with_proba = MultiOutputClassifier(LogisticRegression())
    assert hasattr(with_proba, "predict_proba")
    with_proba.fit(X, y)
    assert hasattr(with_proba, "predict_proba")

    # A base estimator without `predict_proba` should raise an AttributeError
    without_proba = MultiOutputClassifier(LinearSVC())
    assert_predict_proba_missing(without_proba)
    without_proba.fit(X, y)
    assert_predict_proba_missing(without_proba)
class LinearRegression(Model):
    """Model that fits one logistic regression per output column.

    Despite the class name, ``predict`` returns the per-output class
    probabilities produced by ``predict_proba``.
    """

    class SafeOneClassLogisticRegression(SafeOneClassMixin,
                                         LogisticRegression):
        """LogisticRegression combined with ``SafeOneClassMixin``."""

    def fit(self, X, y):
        """Fit a fresh multi-output wrapper on ``X``/``y``; keep it as ``self.model``."""
        per_output_estimator = self.SafeOneClassLogisticRegression()
        wrapper = MultiOutputClassifier(per_output_estimator)
        self.model = wrapper.fit(X, y)

    def predict(self, X):
        """Return ``self.model.predict_proba(X)``."""
        probabilities = self.model.predict_proba(X)
        return probabilities
def main():
    """Train a multi-label random forest on bag-of-words counts and write tags.

    Reads the training and test corpora, fits a tokenizer on both, trains a
    ``MultiOutputClassifier`` of random forests on the training split, prints
    the shape of the thresholded validation predictions, and writes one
    ``"id","tags"`` row per test article to ``output_path``.
    """
    ### read training and testing data
    (Y_data, X_data, tag_list) = read_data(train_path, True)
    (_, X_test, _) = read_data(test_path, False)
    all_corpus = X_data + X_test
    print('Find %d articles.' % (len(all_corpus)))

    ### tokenizer for all data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_corpus)
    save_tokenizer(tokenizer, "bog_tokenizer_pickle")

    ### convert word sequences to per-article word-count matrices
    print('Convert to index sequences.')
    train_sequences = tokenizer.texts_to_matrix(X_data, "count")
    test_sequences = tokenizer.texts_to_matrix(X_test, "count")
    print(test_sequences.shape)

    train_tag = to_multi_categorical(Y_data, tag_list)

    ### split data into training set and validation set
    (X_train, Y_train), (X_val, Y_val) = split_data(train_sequences,
                                                    train_tag, split_ratio)

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
    print("fitting ...")
    multi_target_forest.fit(X_train, Y_train)

    # predict_proba returns a list with one (n_samples, n_classes) array per
    # tag, so it has no .shape and cannot be thresholded directly. Collect the
    # probability of class 1 for each tag into an (n_samples, n_tags) matrix.
    # Assumes tags are encoded as 0/1 indicators -- TODO confirm against
    # to_multi_categorical. A tag whose estimator never saw class 1 gets
    # probability 0.
    pred_val = np.column_stack([
        proba[:, list(estimator.classes_).index(1)]
        if 1 in estimator.classes_ else np.zeros(proba.shape[0])
        for estimator, proba in zip(multi_target_forest.estimators_,
                                    multi_target_forest.predict_proba(X_val))
    ])
    print(pred_val.shape)
    pred_val = (pred_val > thresh).astype('int')
    print(pred_val.shape)
    # print( np_f1score(Y_val, pred_val) )

    # NOTE(review): predict() returns hard labels, so the threshold and the
    # argmax fallback below operate on label values rather than on
    # probabilities -- confirm whether probabilities were intended here.
    Y_pred = multi_target_forest.predict(test_sequences)
    Y_pred_thresh = (Y_pred > thresh).astype('int')
    with open(output_path, 'w') as output:
        print('\"id\",\"tags\"', file=output)
        for index, row in enumerate(Y_pred_thresh):
            labels = [tag_list[i] for i, value in enumerate(row) if value == 1]
            if not labels:
                # Never emit an empty tag list: fall back to the top tag.
                labels.append(tag_list[np.argmax(Y_pred[index])])
            labels_original = ' '.join(labels)
            print('\"%d\",\"%s\"' % (index, labels_original), file=output)
class MultilabelClassifier_SVM(Classifier):
    """Multi-label classifier that trains one probabilistic SVC per label."""

    kernel = None  # SVM kernel type
    model = None  # SVC object

    def __init__(self, kernel='linear'):
        '''
        @brief Class constructor
        @param kernel SVM kernel type
        @return None
        '''
        # Set SVM kernel type
        self.kernel = kernel

    def train(self, Train_X_Tfidf, Train_Y, Test_X_Tfidf=None, Test_Y=None):
        '''
        @brief Trains the model using given X and Y matrices
        @param Train_X_Tfidf Scikit-learn compatible matrix of TF-IDF embeddings for each topic text
        @param Train_Y Binary indicator matrix for the Y labels (tags) of the topics
        @param Test_X_Tfidf Accepted but not used by this method
        @param Test_Y Accepted but not used by this method
        @return self.model
        '''
        # Fit the training dataset on the classifier
        self.model = MultiOutputClassifier(
            SVC(C=1.0, kernel=self.kernel, degree=3, gamma='auto',
                probability=True))
        self.model.fit(Train_X_Tfidf, Train_Y)
        return self.model

    def predict(self, df):
        '''
        @brief Predicts an indicator matrix and confidence level for each topic
        @param df Pandas dataframe of topic text
        @return predictionMatrix NumPy indicator matrix for the predicted tags
        @return confidenceList NumPy array of prediction confidence scores for each topic
        '''
        # predict_proba yields one array per category; stack them once into a
        # single 3-D array (category, sample, class) and reuse it below.
        probabilities = np.array(self.model.predict_proba(df))

        # Round the column-1 probabilities and transpose to create an
        # indicator matrix with one row per sample. np.round replaces
        # np.round_, which was removed in NumPy 2.0.
        predictionMatrix = np.round(probabilities[:, :, 1]).T

        # Highest class probability per (category, sample), then averaged
        # across all categories for every sample.
        probabilityMatrix = np.amax(probabilities, axis=2)
        confidenceList = np.average(probabilityMatrix.T, axis=1)
        return predictionMatrix, confidenceList
class MulticlassLearner(AbstractLearner):
    """Learner that fits one multinomial logistic regression per output."""

    def __init__(self):
        super().__init__()
        # Alternatives previously tried here: RandomForestClassifier,
        # RidgeClassifier, MLPClassifier.
        base_estimator = LogisticRegression(
            C=1, solver='lbfgs', multi_class='multinomial')
        self.clf = MultiOutputClassifier(base_estimator, n_jobs=-1)

    def train(self, store: BasicStore):
        """Fit on the arguments unpacked from ``store.train_XYs``; return the classifier."""
        fit_arguments = store.train_XYs
        self.clf.fit(*fit_arguments)
        return self.clf

    def predict_proba(self, X):
        """Return the wrapped classifier's per-output class probabilities for ``X``."""
        probabilities = self.clf.predict_proba(X)
        return probabilities
def KNN(X_train, x_test, y_train, y_test):
    """Fit a per-label k-NN classifier, print its test score, return probabilities.

    Parameters
    ----------
    X_train, y_train
        Training features and multi-label targets.
    x_test, y_test
        Test features and multi-label targets.

    Returns
    -------
    numpy.ndarray
        Matrix with one row per test sample and one column per label,
        holding column 1 of each label's ``predict_proba`` output.
    """
    knn = KNeighborsClassifier(algorithm='auto', metric='minkowski',
                               metric_params=None, n_jobs=-1,
                               n_neighbors=147, p=2, weights='distance')
    # Only the wrapper is fitted; the wrapper is the object used for
    # prediction and scoring below.
    classifier = MultiOutputClassifier(knn, n_jobs=-1)
    classifier.fit(X_train, y_train)

    # predict_proba returns one array per label. Take column 1 of each and
    # lay them side by side; the shape follows the data instead of being
    # hard-coded.
    y_predict = classifier.predict_proba(x_test)
    output = np.column_stack([proba[:, 1] for proba in y_predict])
    # np.savetxt("sub.csv", output, delimiter=",")

    # score() expects the test features, not the predicted probabilities.
    print(classifier.score(x_test, y_test))
    return output
loaddir = "feature_extraction/data/features/"

# read data
df = pd.read_csv(loaddir + 'features.csv', index_col=0)


def create_target(text):
    """Parse a stringified list such as ``"[0, 1, 1]"`` into a list of ints.

    Surrounding brackets are stripped and items are split on commas; an empty
    list (``"[]"``) yields ``[]``.
    """
    inner = text.strip('][').strip()
    if not inner:
        return []
    # int() tolerates surrounding whitespace, so "1,2" and "1, 2" both parse.
    return [int(item) for item in inner.split(',')]


df["target"] = df["target"].apply(create_target)

df_train, df_test = train_test_split(df, test_size=0.1, random_state=0)

train_x = df_train.drop(["target"], axis=1)
train_y = df_train["target"]
test_x = df_test.drop(["target"], axis=1)
test_y = df_test["target"]

model = xgb.XGBClassifier()

print(train_y)
print(train_x.values)

# train_y is a Series whose cells are lists; hand the classifier a plain list
# of rows so it is read as a 2-D (samples x outputs) target.
clf = MultiOutputClassifier(model).fit(train_x, train_y.tolist())
pred = clf.predict_proba(test_x)

# test_y already is the "target" column, so it is printed directly.
print(test_y)
print(pred)
# NOTE(review): `penalty` is passed through unchanged; confirm that
# XGBClassifier actually honors it.
estimators = MultiOutputClassifier(estimator=XGBClassifier(
    penalty="l2", objective="binary:logistic", random_state=42))

X_train, X_eval, y_train, y_eval = train_test_split(features_df,
                                                    labels_df,
                                                    test_size=0.33,
                                                    shuffle=True,
                                                    stratify=labels_df,
                                                    random_state=RANDOM_SEED)

# Train model on the training split only, so the evaluation rows below are
# unseen by the model.
estimators.fit(X_train, y_train)

# Predict on evaluation set
# This competition wants probabilities, not labels
preds = estimators.predict_proba(X_eval)

# preds holds one array per target; column 1 of each is kept.
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index=y_eval.index)
print("y_preds.shape:", y_preds.shape)


def plot_roc(y_true, y_score, label_name, ax):
    """Plot the ROC curve of ``y_score`` against ``y_true`` on ``ax``.

    ``label_name`` is accepted but not used in the visible body.
    """
    fpr, tpr, _ = roc_curve(y_true, y_score)
    ax.plot(fpr, tpr)
################ confusion matrix for method 2

# In[ ]:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier

X_train, X_test, y_train, y_test = train_test_split(F, h, test_size=0.30)
multioutput = MultiOutputClassifier(
    xgb.XGBClassifier(objective='reg:logistic')).fit(X_train, y_train)
y2_true, y2_pred = y_test, multioutput.predict(X_test)
print('Results on the test set:')
print(classification_report(y2_true, y2_pred))

# multilabel_confusion_matrix returns one 2x2 matrix per label, stacked into a
# 3-D array; a heatmap needs a 2-D input, so draw one heatmap per label.
cm = multilabel_confusion_matrix(y_test, y2_pred)
fig, axes = plt.subplots(1, len(cm), squeeze=False)
for label_index, (label_cm, ax) in enumerate(zip(cm, axes[0])):
    sns.heatmap(label_cm, annot=True, fmt='d', ax=ax)
    ax.set_title('label %d' % label_index)

# predict_proba returns one probability array per label, so the ROC curves
# are likewise plotted label by label against the matching target column.
y1_probas = multioutput.predict_proba(X_test)
y_test_columns = np.asarray(y_test)
for label_index, label_probas in enumerate(y1_probas):
    skplt.metrics.plot_roc(y_test_columns[:, label_index], label_probas)
plt.show()

# In[ ]:

#############################################################################################

# In[ ]:

################################## Random forest
for i, line in enumerate(f): if i == 0: n2vDim = int(line.split()[1]) X = np.zeros((Aexp.shape[0], n2vDim)) else: fields = line.split() assert len(fields) == n2vDim + 1 X[int(fields[0])] = np.array( fields[1:]).astype(float) Xtrain = X[train] Xtest = X[test] ii_tr = np.where(np.max(np.abs(Xtrain), axis=1) > 0.)[0] ii_ts = np.where(np.max(np.abs(Xtest), axis=1) > 0.)[0] clf = MultiOutputClassifier(LogisticRegression()).fit( Xtrain[ii_tr], Ytrain[ii_tr]) y = clf.predict_proba(Xtest[ii_ts]) elif classifier == 'n2v_knn' and not os.path.exists( experimentPath + 'fold' + str(fold_nr) + '/n2v/0_knn.pkl'): with open('../data/' + species + '/networks/tmp0.emb') as f: for i, line in enumerate(f): if i == 0: n2vDim = int(line.split()[1]) X = np.zeros((Aexp.shape[0], n2vDim)) else: fields = line.split() assert len(fields) == n2vDim + 1 X[int(fields[0])] = np.array( fields[1:]).astype(float) Xtrain = X[train] Xtest = X[test]
]] X_test_yn, y_test_yn = merged_test[features], merged_test[[ 'survival_yn', 'amount_yn' ]] multi_rf_clf = MultiOutputClassifier( RandomForestClassifier(n_estimators=100, random_state=0, verbose=3, n_jobs=-1)).fit(X_train_yn, y_train_yn) true_label_yn = return_true_label(y_test_yn, merged_test) pred_label_yn = return_multi_pred_label(multi_rf_clf, true_label_yn, X_test_yn) sur_pred_res = pd.concat([ pd.DataFrame(multi_rf_clf.predict_proba(X_test_yn)[0]), pred_label_yn[1][['pred_survival_time', 'survival_time']] ], 1) sur_pred_res.columns = [ 'survival_yn_prob_0', 'survival_yn_prob_1', 'pred_survival_yn', 'survival_yn' ] ams_pred_res = pd.concat([ pd.DataFrame(multi_rf_clf.predict_proba(X_test_yn)[1]), pred_label_yn[1][['pred_amount_spent', 'amount_spent']] ], 1) ams_pred_res.columns = [ 'amount_yn_prob_0', 'amount_yn_prob_1', 'pred_amount_yn', 'amount_yn' ] #joblib.dump(multi_rf_clf, 'clf_sur0_ams1.pkl')
def run_regression(train_embeds, train_labels, test_embeds, test_labels, args):
    """Fit ``args.classifier`` on the embeddings and report train/test metrics.

    Parameters
    ----------
    train_embeds, test_embeds
        Feature matrices for fitting and evaluation.
    train_labels, test_labels
        Targets; indexed column-wise (``[:, i]``) when ``args.label`` is
        ``'multi'``.
    args
        Object providing ``label`` (``'single'`` or ``'multi'``),
        ``classifier`` (the estimator to fit; wrapped in a
        ``MultiOutputClassifier`` in multi mode) and ``average`` (forwarded
        to the metric functions).

    Returns
    -------
    dict
        ``runtime`` plus ``{test,train}_{f1,precision,recall,accuracy}``; in
        single-label mode also ``{test,train}_{auc,ap}``. The dict is printed
        before it is returned.

    Raises
    ------
    AssertionError
        If ``args.label`` is neither ``'single'`` nor ``'multi'``.
    """
    start_time = time.time()
    if args.label == 'single':
        log = args.classifier
    elif args.label == 'multi':
        log = MultiOutputClassifier(args.classifier, n_jobs=-1)
    else:
        # Raised explicitly so the check also runs under `python -O`, where
        # a bare `assert False` would be stripped.
        raise AssertionError(
            "args.label must be 'single' or 'multi', got %r" % (args.label,))

    log.fit(train_embeds, train_labels)
    test_pred = log.predict(test_embeds)
    train_pred = log.predict(train_embeds)
    test_score = log.predict_proba(test_embeds)
    train_score = log.predict_proba(train_embeds)

    n2v_scores = {'runtime': time.time() - start_time}

    # (split name, true labels, predicted labels, predicted scores)
    splits = (
        ('test', test_labels, test_pred, test_score),
        ('train', train_labels, train_pred, train_score),
    )
    averaged_metrics = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )

    if args.label == 'single':
        print("Single-label")
        for split, labels, pred, _ in splits:
            for metric_name, metric in averaged_metrics:
                n2v_scores[split + '_' + metric_name] = metric(
                    labels, pred, average=args.average)
            n2v_scores[split + '_accuracy'] = accuracy_score(labels, pred)

        # Only the last fit of a LabelBinarizer is kept, so fit it once on
        # the training labels.
        lb = LabelBinarizer()
        lb.fit(train_labels)
        for split, labels, _, score in splits:
            binarized = lb.transform(labels)
            n2v_scores[split + '_auc'] = roc_auc_score(
                binarized, score, average=args.average)
            n2v_scores[split + '_ap'] = average_precision_score(
                binarized, score, average=args.average)
    else:
        n_outputs = test_labels.shape[1]
        print("Multi-label", n_outputs)
        assert n_outputs == train_labels.shape[1]
        # Each metric is computed per output column and then averaged over
        # the columns.
        for metric_name, metric in averaged_metrics:
            for split, labels, pred, _ in splits:
                n2v_scores[split + '_' + metric_name] = np.mean([
                    metric(labels[:, i], pred[:, i], average=args.average)
                    for i in range(n_outputs)
                ])
        n2v_scores['test_accuracy'] = accuracy_score(test_labels, test_pred)
        n2v_scores['train_accuracy'] = accuracy_score(train_labels, train_pred)
        # Ranking metrics (LRAP, AUC, AP) are not computed in multi-label
        # mode; see https://github.com/scikit-learn/scikit-learn/issues/2451

    print(n2v_scores)
    return n2v_scores
random_state=1, return_train_score=True) fit_model(random_forest_Bayes_optimized_classifier, X_train, y_train, X_test) print(random_forest_Bayes_optimized_classifier.best_estimator_) #Show Confusion Matrix random_forest_optim = MultiOutputClassifier( RandomForestClassifier(n_estimators=2000, max_depth=20, min_samples_split=20, min_samples_leaf=4, max_features='auto')) classifier = random_forest_optim.fit(X_train, y_train) cm = multilabel_confusion_matrix(y_test, random_forest_optim.predict(X_test)) print(cm) ## Retrain best model on full dataset and fit to test_set_features random_forest_optim.fit(scaled_training_features, training_set_labels) preds = random_forest_optim.predict_proba(scaled_test_features) ## Format for submittal on DrivenData #Code copied from DrivenData to ensure correct format for submittal # Save predictions to submission data frame submission_format["h1n1_vaccine"] = preds[0][:, 1] submission_format["seasonal_vaccine"] = preds[1][:, 1] print(submission_format.head()) submission_format.to_csv('my_submission.csv', index=True)
# Stack the feature rows of every sample file into one matrix
X = np.vstack([sample['data'] for sample in sample_files.values()])
# X = normalize(X)

# Build a label list that corresponds to the feature set: every feature row
# of a sample file gets that file's labels
y = []
for value in sample_files.values():
    y.extend([value['labels']] * len(value['data']))
y = np.array(mlb.transform(y))

# Use a multi-label classifier built from one extra-trees ensemble per label
clf = MultiOutputClassifier(ExtraTreesClassifier(max_depth=5))
clf.fit(X, y)
print(f'Mean accuracy: {clf.score(X, y)}')

num_folds = 10
cv_score = cross_val_score(clf, X, y, cv=num_folds)
print(f'{num_folds}-fold cross-validation: {cv_score}')

# Perform real-time tests for each input file
for key, value in sample_files.items():
    print("\nPerforming real-time classification of "
          f"{', '.join(value['labels'])}")

    start_time = timeit.default_timer()
    serializer = Serializer("data/" + key)
    features = serializer.classify_realtime(
        clf, wait_for_min_counts=False, classification_interval_counts=1000)
    total_time = timeit.default_timer() - start_time

    print(f'Classified in {total_time} seconds')
    print_prediction(clf.predict(features))
    print_proba(clf.predict_proba(features))
def main():
    """Train multi-output classifiers that predict missing feature values.

    Workflow, as driven by the command-line arguments:

    1. Load the selected dataset, drop rows containing '?', keep the slice
       configured in ``x_slice`` and cast everything to ``int``.
    2. Mirror every input feature into a ``y_<feature>`` output column and
       split into training and test sets.
    3. Augment the training set through ``ds_mod_with_value`` (presumably it
       injects question marks into copies of the samples - confirm against
       that helper) and one-hot encode the inputs.
    4. For every configured classifier, load a previously saved model from
       ``model_<dataset>_<classifier>.sav`` or train and save a new
       ``MultiOutputClassifier``.
    5. Evaluate each model on test queries with 0..N question marks, write the
       predicted probabilities to CSV and plot accuracy / log loss / Hamming
       loss, per classifier and across classifiers.

    Exits with status 1 when the dataset name is unknown or when
    ``check_labels_split`` rejects the train/test split.

    Raises:
        ValueError: if a configured classifier name is not one of
            'MLP', 'KNN', 'SVM' or 'RandomForest'.
    """
    # Local import: sys.exit is always available, whereas the bare ``exit``
    # builtin is only injected by the ``site`` module.
    import sys

    # Script argument parsing
    parser = argparse.ArgumentParser(
        description=
        'Homework 03 - Machine learning a.a. 2018/19 - Predict missing values',
        epilog=' coded by: Emanuele Palombo')

    parser.add_argument('dataset_name',
                        metavar='DATASET',
                        type=str,
                        nargs='?',
                        default=__default_ts_name,
                        help='{} (default {}) - dataset name'.format(
                            list(__ts_opts.keys()), __default_ts_name))

    parser.add_argument(
        '--test-size',
        '-t',
        dest='test_size',
        action='store',
        metavar='TEST_SIZE',
        type=float,
        default=__default_test_size,
        help='[0-1] (default {}) - splitting size of TestSet'.format(
            __default_test_size))

    parser.add_argument(
        '--question-marks-ts',
        '-q',
        dest='qm_repeted_ts',
        action='store',
        type=int,
        default=__default_question_mark_count_repeated,
        help=
        '{{0,1,2...}} (default {}) - (this value * {} * samples) added to TrainingSet'
        .format(__default_question_mark_count_repeated,
                __default_question_mark_count))

    parser.add_argument(
        '--no-split',
        '-s',
        dest='no_split',
        action='store_true',
        default=__default_no_split,
        help='(default {}) - keep whole DataSet for training'.format(
            __default_no_split))

    parser.add_argument('--img-tag',
                        '-i',
                        dest='img_tag',
                        action='store',
                        type=str,
                        default='',
                        help='string - add arbitrary string to saved images')

    parser.add_argument(
        '--verbose',
        '-v',
        dest='verbosity',
        action='count',
        default=__default_training_verbosity,
        help='add more verbosity to output (repeat it to increase)')

    args = parser.parse_args()

    if args.dataset_name not in __ts_opts:
        print('ERROR: Choose correct DataSet!\n')
        parser.print_help()
        sys.exit(1)

    trainingset_selected_name = args.dataset_name
    test_size = args.test_size
    qm_repeted_ts = args.qm_repeted_ts
    dataset_no_split = args.no_split
    training_verbosity = args.verbosity
    img_tag = args.img_tag
    running_id = id_generator()
    ts_selected_opts = __ts_opts[trainingset_selected_name]
    # End script argument parsing

    print('\nDataSet selected: ' + ts_selected_opts['url'])

    # read dataset to pandas dataframe
    dataset = pd.read_csv(ts_selected_opts['url'],
                          names=ts_selected_opts['columns'])

    if training_verbosity >= 1:
        print('\nFirst five rows of DataSet:\n')
        print(dataset.head())

    print('\nDataSet Length: {}'.format(len(dataset)))

    # DataSet Manipulation
    # remove rows with question marks (this avoids having '?' in the output)
    # ``axis`` must be passed by keyword: pandas >= 2.0 rejects ``.any(1)``.
    dataset = dataset[~(dataset.astype(str) == '?').any(axis=1)]

    # strip out (remove) the "real output" (y)
    dataset = dataset.iloc[ts_selected_opts['x_slice'][0],
                           ts_selected_opts['x_slice'][1]]

    # convert all columns to int
    dataset = dataset.astype(int)

    # dataSet Information
    features_count = len(dataset.columns)
    features_values = ds_features_values(dataset)

    # copy input features to output (columns * 2)
    for column in dataset.columns:
        dataset['y_' + column] = dataset[column]

    # Split DataSet
    training_set, test_set = train_test_split(
        dataset,
        test_size=test_size,
        random_state=__default_train_test_split_random_state)

    # check feature values between TrainingSet and TestSet
    # it is important to avoid extra values in the TestSet (i.e. an error in
    # log_loss caused by a mismatch in the predict_proba size)
    if not check_labels_split(features_count, training_set, test_set):
        sys.exit(1)

    # Concat (add rows) TrainingSet and TestSet
    # in this case the model can see all samples (queries without '?' included)
    if dataset_no_split:
        training_set = pd.concat([training_set, test_set], axis=0)
        print('\nTraining over the whole DataSet')
    else:
        print('\nSplit DataSet in TrainingSet and TestSet (test size: {})'.
              format(test_size))

    # add (append) question marks
    # append qm_count rows, with 1 to qm_count '?'
    qm_count = int(ts_selected_opts['question_mark_count'])
    for i in range(qm_repeted_ts):
        for value_count in range(1, qm_count + 1):
            training_set = ds_mod_with_value(training_set, value_count,
                                             features_count, True)
            if training_verbosity >= 1:
                print(
                    '{} Added {} question mark (?) to TrainingSet for each sample'
                    .format(i, value_count))

    # Shuffle TrainingSet
    training_set = training_set.sample(frac=1)

    if training_verbosity >= 1:
        print('\nManipulated TrainingSet:\n')
        print(training_set.head())

    print('\nTrainingSet Length: {}'.format(len(training_set)))

    # TrainingSet: input X (features) and output y ("mirrored" features)
    x_train = training_set.iloc[:, 0:features_count]
    y_train = training_set.iloc[:, features_count:]
    # TestSet: input X (features) and output y ("mirrored" features)
    x_test = test_set.iloc[:, 0:features_count]
    y_test = test_set.iloc[:, features_count:]

    if training_verbosity >= 2:
        print('\nInput train:\n {}'.format(x_train.head()))
        print('\nOutput train:\n {}'.format(y_train.head()))
        print('\nInput test:\n {}'.format(x_test.head()))
        print('\nOutput test:\n {}'.format(y_test.head()))

    # x_test intentionally stays a DataFrame: it is copied and modified per
    # query below, while the others are consumed as plain arrays.
    x_train = x_train.values
    y_train = y_train.values
    y_test = y_test.values

    # oneHot encoding (characteristic vector)
    # categories are fixed to features_values and unknown values are ignored,
    # so any value outside them is encoded as an all-zero vector
    one_hot_encoder = OneHotEncoder(categories=features_values,
                                    handle_unknown='ignore')
    one_hot_encoder.fit(x_train)
    x_train_encoded = one_hot_encoder.transform(x_train).toarray()

    if training_verbosity >= 2:
        print('\nOneHotEncoding...\nexample: {} => {}'.format(
            x_train[0], x_train_encoded[0]))

    # store all results/metrics for each model/classifier
    results = {}
    query_range = range(ts_selected_opts['question_mark_count'] + 1)

    for classifier_name in __deafult_model_classifier:
        filename = 'model_{}_{}.sav'.format(trainingset_selected_name,
                                            classifier_name)

        if os.path.isfile(filename):
            # load a model that was already trained
            # NOTE(review): joblib.load unpickles the file; only load model
            # files you created yourself.
            multi_output_classifier = joblib.load(filename)
            print(
                '\n### Model {} loaded by file: {}\nImportant: remove the file to re-train the model!'
                .format(classifier_name, filename))
        else:
            n_jobs = None
            model_verbosity = training_verbosity >= 3

            if classifier_name == 'MLP':
                classifier = MLPClassifier(
                    hidden_layer_sizes=ts_selected_opts[
                        'mlp_hidden_layers_sizes'],
                    max_iter=1000,
                    verbose=model_verbosity)
            elif classifier_name == 'KNN':
                classifier = KNeighborsClassifier(
                    n_neighbors=ts_selected_opts['knn_k'])
            elif classifier_name == 'SVM':
                classifier = SVC(gamma='scale',
                                 decision_function_shape='ovo',
                                 probability=True,
                                 verbose=model_verbosity)
            elif classifier_name == 'RandomForest':
                classifier = RandomForestClassifier(
                    n_estimators=ts_selected_opts['random_forest_estimator'],
                    verbose=model_verbosity)
            else:
                # Without this guard an unknown name would silently reuse the
                # classifier built in the previous iteration.
                raise ValueError(
                    'Unknown classifier name: {}'.format(classifier_name))

            print('\n### Init and training the model: {}'.format(
                classifier_name))

            # init MultiOutput for classifier
            multi_output_classifier = MultiOutputClassifier(classifier,
                                                            n_jobs=n_jobs)
            multi_output_classifier.fit(x_train_encoded, y_train)

            # save the model to disk
            joblib.dump(multi_output_classifier, filename)

        results[classifier_name] = collections.defaultdict(list)
        metrics_result = results[classifier_name]

        # create input test (query) with different number of '?'
        for query_count_question_mark in query_range:
            print('\n## Add {} questions mark to input test (query)'.format(
                query_count_question_mark))

            # build the query from a copy of the input test set
            x_test_with_qm = ds_mod_with_value(
                x_test.copy(),
                value_count=query_count_question_mark,
                append=False)

            if training_verbosity >= 2:
                print('\nInput test (query):\n {}'.format(
                    pd.DataFrame(data=x_test_with_qm).head()))

            # encode the input test
            x_test_encoded = one_hot_encoder.transform(
                x_test_with_qm).toarray()

            # compute output prediction and probability
            y_pred = multi_output_classifier.predict(x_test_encoded)
            y_pred_proba = multi_output_classifier.predict_proba(
                x_test_encoded)

            # score over the whole output (all columns at once)
            score = multi_output_classifier.score(x_test_encoded, y_test)

            # fraction of individual output cells that differ between
            # y_test and y_pred
            hamming_loss = np.sum(np.not_equal(y_test, y_pred)) / float(
                y_test.size)

            # y_test and y_pred as if the output were only the query's
            # question-mark positions
            y_test_reduced, y_pred_reduced = reduce_y_to_qm(
                x_test_with_qm, y_test, y_pred)

            # write y_pred_proba to file (csv)
            write_pred_proba(
                y_pred_proba,
                '{}{}-{}-q{}-{}{}.csv'.format(__default_csv_path,
                                              trainingset_selected_name,
                                              classifier_name,
                                              query_count_question_mark,
                                              running_id, img_tag))

            print('\nMetrics:')
            print(' {:<30} | {:^10} | {:>10}'.format('features', 'accuracy',
                                                     'log loss'))
            print('-' * (30 + 10 + 10 + 7))

            log_loss_avg = 0
            # for each output column => compute accuracy and log_loss
            for feature_index in range(y_test.shape[1]):
                y_test_column = y_test[:, feature_index]
                y_pred_column = y_pred[:, feature_index]

                accuracy = accuracy_score(y_test_column, y_pred_column)
                # note: check_labels_split() exists to avoid an error here
                log_loss_value = log_loss(
                    y_test_column,
                    y_pred_proba[feature_index],
                    labels=features_values[feature_index])

                print(' {:<30} | {:^10.4f} | {:>10.4f}'.format(
                    test_set.columns[feature_index], accuracy,
                    log_loss_value))

                log_loss_avg += log_loss_value

                metrics_result['accuracy_' +
                               str(feature_index)].append(accuracy)
                metrics_result['log_loss_' +
                               str(feature_index)].append(log_loss_value)

            print('\nVirtual reduced output:')
            # for each reduced output (only question marks) => compute accuracy
            for index in range(query_count_question_mark):
                accuracy = accuracy_score(y_test_reduced[:, index],
                                          y_pred_reduced[:, index])
                print(' accuracy {}: {:>10.4f}'.format(index, accuracy))
                metrics_result['accuracy_reduced_' +
                               str(index)].append(accuracy)

            print('\nAll output:')
            print(' accuracy: {:>10.4f}'.format(score))
            print(' log_loss avg: {:>10.4f}'.format(log_loss_avg /
                                                    y_test.shape[1]))
            print(' hamming loss: {:>10.4f}'.format(hamming_loss))

            metrics_result['accuracy'].append(score)
            metrics_result['log_loss_avg'].append(log_loss_avg /
                                                  y_test.shape[1])
            metrics_result['hamming_loss'].append(hamming_loss)

        # GRAPH PLOT per model/classifier
        plot_line_graph(query_range, [
            metrics_result['accuracy'], metrics_result['log_loss_avg'],
            metrics_result['hamming_loss']
        ],
                        labels=['accuracy', 'log loss avg', 'hamming loss'],
                        fmt=['bo-', 'ro-', 'yo-'],
                        title=classifier_name,
                        xlabel='Number of Question Marks in the query',
                        ymax=1)
        if __default_save_img:
            plt.savefig('{}{}-{}-{}{}.png'.format(__default_imgs_path,
                                                  trainingset_selected_name,
                                                  classifier_name, running_id,
                                                  img_tag),
                        dpi=200)

        # list of per-feature accuracy series
        accuracy_lst = [
            metrics_result['accuracy_' + str(index)]
            for index in range(features_count)
        ]
        plot_line_graph(query_range,
                        [metrics_result['accuracy']] + accuracy_lst,
                        fmt=['bo-'] + ['g.--'] * len(accuracy_lst),
                        title=classifier_name +
                        ': whole accuracy and those by features',
                        xlabel='Number of Question Marks in the query',
                        ylabel='accuracy',
                        ymax=1)
        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

        # list of per-position reduced accuracy series, left-padded with None
        # so that every series spans the whole query range
        accuracy_reduced_lst = [
            metrics_result['accuracy_reduced_' + str(index)]
            for index in range(ts_selected_opts['question_mark_count'])
        ]
        accuracy_reduced_lst = [[None] *
                                (ts_selected_opts['question_mark_count'] -
                                 len(accuracy_reduced) + 1) + accuracy_reduced
                                for accuracy_reduced in accuracy_reduced_lst]
        plot_line_graph(
            query_range,
            [metrics_result['accuracy']] + accuracy_reduced_lst,
            fmt=['bo-'] + ['m.--'] * len(accuracy_reduced_lst),
            title=classifier_name +
            ': whole accuracy and the virtual accuracies by features',
            xlabel='Number of Question Marks in the query',
            ylabel='accuracy',
            ymax=1)
        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-reduced-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

        # list of per-feature log loss series
        log_loss_lst = [
            metrics_result['log_loss_' + str(index)]
            for index in range(features_count)
        ]
        plot_line_graph(
            query_range,
            [metrics_result['log_loss_avg']] + log_loss_lst,
            fmt=['ro-'] + ['c.--'] * len(log_loss_lst),
            title=classifier_name + ': average log loss and those by features',
            xlabel='Number of Question Marks in the query',
            ylabel='log loss')
        if __default_save_img:
            plt.savefig('{}{}-{}-log-loss-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

    summary_metrics = ['accuracy', 'log_loss_avg', 'hamming_loss']
    metrics_by_classifier = [
        results[classifier][metric]
        for classifier in __deafult_model_classifier
        for metric in summary_metrics
    ]
    label_by_classifier = [
        classifier + ' ' + metric
        for classifier in __deafult_model_classifier
        for metric in summary_metrics
    ]
    # one marker per classifier, one colour per metric
    fmt_lst = [
        style.replace('0', character)
        for character in ['o', '^', 'v', '<', '>', '.', ',', '+', 'x']
        for style in ['b0-', 'r0-', 'y0-']
    ]

    # GRAPH PLOT comparing model/classifier
    plot_line_graph(query_range,
                    metrics_by_classifier,
                    labels=label_by_classifier,
                    fmt=fmt_lst,
                    title='Compare all model',
                    xlabel='Number of Question Marks in the query',
                    ylabel='',
                    ymax=1)
    if __default_save_img:
        plt.savefig('{}{}-comparing-{}{}.png'.format(
            __default_imgs_path, trainingset_selected_name, running_id,
            img_tag),
                    dpi=200)

    if not __default_save_img:
        plt.show()
class Team:
    """Play-calling classifier and play-outcome statistics for one team.

    The instance keeps only the rows of the play-by-play frame in which the
    team has possession (``posteam == team_name``).  From those rows it can

    * train one random forest per play type (Pass / Run / Punt / Field Goal)
      on ``[yards to go, down, quarter, yard line, score difference]``, and
    * estimate outcome statistics (completion, sack, interception, fumble and
      field-goal rates, yardage and elapsed-time distributions) for a season.
    """

    def __init__(self, team_name, play_by_play_df):
        """Filter the play-by-play data to ``team_name`` and cache its columns.

        Args:
            team_name: value matched against the ``posteam`` column.
            play_by_play_df: pandas DataFrame holding the play-by-play data.
        """
        self.team = team_name
        self.team_df = play_by_play_df[play_by_play_df['posteam'] == self.team]
        self._generate_lists()
        # Play type <-> output column index of the multi-output classifier.
        self.valid_play_dict = {
            'Pass': 0,
            'Run': 1,
            'Punt': 2,
            'Field Goal': 3
        }
        self.valid_play_inv_dict = {
            0: 'Pass',
            1: 'Run',
            2: 'Punt',
            3: 'Field Goal'
        }
        self.X = []
        self.Y = []

    def train_classifier(self, debug_classifier=False):
        """Build the training arrays and fit the multi-output random forest.

        Args:
            debug_classifier: when truthy, print train/test scores and plot
                the feature importances.
        """
        self._organize_training_data()
        self._generate_random_forest(debug_classifier)

    def _generate_random_forest(self, debug_classifier):
        """Fit one random forest per play type on a 90/10 train/test split."""
        self.forest = RandomForestClassifier(n_estimators=100, random_state=1)
        self.multi_target_forest = MultiOutputClassifier(self.forest,
                                                         n_jobs=-1)
        X_train, X_test, Y_train, Y_test = train_test_split(self.X,
                                                            self.Y,
                                                            test_size=0.1,
                                                            random_state=0)
        self.multi_target_forest.fit(X_train, Y_train)

        # One fitted forest per output column (play type).
        forests = self.multi_target_forest.estimators_
        feature_data = {'Features': list(range(5))}
        for idx in range(4):
            feature_data['Forest{}'.format(idx)] = (
                forests[idx].feature_importances_.tolist())
        feature_df = pd.DataFrame(data=feature_data)

        if debug_classifier:
            print('Training Score: ',
                  self.multi_target_forest.score(X_train, Y_train))
            print('Test Score: ',
                  self.multi_target_forest.score(X_test, Y_test))

            fig1 = plt.figure()
            ax = fig1.add_subplot(111)
            width = 0.1
            # Grouped bars: one colour per play type, shifted via ``position``.
            bar_styles = (('red', -1), ('green', 0), ('blue', 1),
                          ('yellow', 2))
            for idx, (color, position) in enumerate(bar_styles):
                feature_df['Forest{}'.format(idx)].plot(kind='bar',
                                                        color=color,
                                                        ax=ax,
                                                        width=width,
                                                        position=position)
            ax.set_xticklabels([
                'Yards to First', 'Down', 'Quarter', 'Yardline', 'Score Diff'
            ],
                               rotation=0)
            ax.set_xlabel('Features')
            ax.set_ylabel('Feature Importance')
            ax.set_title('Random Forest - Feature Analysis')
            plt.xlim(-0.5, 4.5)
            plt.legend(['Pass', 'Run', 'Punt', 'Field Goal'])
            plt.show()

    def test_classifier(self, yards_to_go, down, quarter, yard_line,
                        score_diff):
        """Return the index of the most probable play type for one situation.

        The returned index follows ``valid_play_inv_dict``
        (0 Pass, 1 Run, 2 Punt, 3 Field Goal).

        ``MultiOutputClassifier.predict_proba`` returns a plain list with one
        ``(n_samples, n_classes)`` array per output, so the probability of
        the positive class (label 1) is read from each output and the largest
        one wins.
        """
        # scikit-learn expects a 2-D array: a single row with five features.
        features = np.array(
            [[yards_to_go, down, quarter, yard_line, score_diff]])
        per_output_proba = self.multi_target_forest.predict_proba(features)

        positive_proba = []
        for estimator, proba in zip(self.multi_target_forest.estimators_,
                                    per_output_proba):
            classes = list(estimator.classes_)
            # An output that never saw label 1 during training has no column
            # for it; treat that play type as impossible.
            if 1 in classes:
                positive_proba.append(proba[0][classes.index(1)])
            else:
                positive_proba.append(0.0)
        return np.argmax(positive_proba)

    def _generate_lists(self):
        """Cache the used play-by-play columns as plain Python lists."""
        attribute_columns = (
            ('play_type', 'PlayType'),
            ('game_ID', 'GameID'),
            ('drive', 'Drive'),
            ('quarter', 'qtr'),
            ('down', 'down'),
            ('time', 'time'),
            ('pos_team', 'posteam'),
            ('def_team', 'DefensiveTeam'),
            ('pass_length', 'PassLength'),
            ('pass_location', 'PassLocation'),
            ('pass_attempt', 'PassAttempt'),
            ('air_yards', 'AirYards'),
            ('rush_attempt', 'RushAttempt'),
            ('run_location', 'RunLocation'),
            ('run_gap', 'RunGap'),
            ('fieldgoal_distance', 'FieldGoalDistance'),
            ('pos_team_score', 'PosTeamScore'),
            ('def_team_score', 'DefTeamScore'),
            ('yrdline100', 'yrdline100'),
            ('yrds_to_go', 'ydstogo'),
        )
        for attribute, column in attribute_columns:
            setattr(self, attribute, self.team_df[column].values.tolist())

    def _organize_training_data(self):
        """Turn the cached lists into the ``X`` / ``Y`` training arrays.

        Rows with a NaN feature or a play type outside ``valid_play_dict``
        are skipped.  ``Y`` is one-hot over the four valid play types.
        """
        score_diff_list = np.array(self.pos_team_score) - np.array(
            self.def_team_score)
        zipped_data = zip(self.quarter, self.down, self.yrdline100,
                          self.yrds_to_go, score_diff_list, self.play_type)
        for quarter, down, yrdln, yrds_to_go, score_diff, play_type in zipped_data:
            input_list = [yrds_to_go, down, quarter, yrdln, score_diff]
            if play_type not in self.valid_play_dict:
                continue
            if np.any(np.isnan(input_list)):
                continue
            output_list = [0 for _ in range(4)]
            output_list[self.valid_play_dict[play_type]] = 1
            self.X.append(input_list)
            self.Y.append(output_list)
        self.X = np.array(self.X)
        self.Y = np.array(self.Y)

    def generate_success_probabilities(self, opponent, yr, debug_probs=False):
        """Estimate play-outcome statistics for the season starting in ``yr``.

        Punt, field-goal, fumble and elapsed-time statistics use every play
        of the season (September of ``yr`` through January of ``yr + 1``);
        pass / run / sack / interception statistics use only the plays
        against ``opponent``.  Results are stored as instance attributes
        (``pass_complete_pct``, ``field_goal_pct``, ``time_kde`` ...).

        Args:
            opponent: value matched against the ``DefensiveTeam`` column.
            yr: integer year in which the season starts.
            debug_probs: when truthy, plot the fitted distributions.
        """
        ##############################
        # Extract Team Specific Data #
        ##############################
        self.opponent = opponent
        valid_dates = [
            str(yr) + '-' + '09',
            str(yr) + '-' + '10',
            str(yr) + '-' + '11',
            str(yr) + '-' + '12',
            str(yr + 1) + '-' + '01'
        ]
        # NOTE(review): the date column name carries a UTF-8 BOM, presumably
        # because the CSV was read without ``encoding='utf-8-sig'`` - confirm.
        date_column = self.team_df['\ufeffDate']
        coach_yr_df = pd.concat([
            self.team_df[date_column.str.contains(month)]
            for month in valid_dates
        ])
        team_prob_df = coach_yr_df[coach_yr_df['DefensiveTeam'] ==
                                   self.opponent]

        loc_pass_outcome = team_prob_df['PassOutcome'].values.tolist()
        loc_yrds_gained = team_prob_df['Yards.Gained'].values.tolist()
        loc_play_type = team_prob_df['PlayType'].values.tolist()
        loc_interception = team_prob_df['InterceptionThrown'].values.tolist()
        # Elapsed times must come from the same (opponent-filtered) rows as
        # the pass/run plays they are zipped with below.
        loc_play_time = team_prob_df['Elapsed_Play_Time']

        loc_play_type_fumble = coach_yr_df['PlayType'].values.tolist()
        loc_fumble = coach_yr_df['Fumble'].values.tolist()
        loc_drive = coach_yr_df['Drive'].values.tolist()
        loc_gameID = coach_yr_df['GameID'].values.tolist()

        loc_fg_success = coach_yr_df['FieldGoalResult']
        # The field-goal "distance" bucket is derived from the yard line.
        loc_fg_distance = coach_yr_df['yrdline100']
        loc_fg_play_type = coach_yr_df['PlayType']

        loc_punt_spot = coach_yr_df['yrdline100']
        loc_punt_return = coach_yr_df['Return_spot']

        loc_time_elapsed = coach_yr_df['Elapsed_Play_Time']

        ########################
        # Initialize Variables #
        ########################
        self.elapsed_time = {
            'punt': [],
            'run': [],
            'pass_good': [],
            'pass_nogood': [],
            'fg': []
        }

        self.total_passes = 0
        self.total_completions = 0
        self.pass_list = []
        self.rush_list = []
        self.pass_or_sack = 0
        self.num_sacks = 0
        self.sack_dist = []
        self.total_interceptions = 0

        field_goal_attempts = {0: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0}
        field_goal_successes = {0: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0}
        self.field_goal_pct = {}

        total_runs = 0
        total_run_fumbles = 0
        total_pass = 0
        total_pass_fumbles = 0

        self.punt_dist = []
        punt_touchback = {
            90: 0,
            80: 0,
            70: 0,
            60: 0,
            50: 0,
            40: 0,
            30: 0,
            20: 0
        }
        punt_kickrange = {
            90: 0,
            80: 0,
            70: 0,
            60: 0,
            50: 0,
            40: 0,
            30: 0,
            20: 0
        }

        #####################
        # Punt Calculations #
        #####################
        for punt_spot, return_spot, time in zip(loc_punt_spot, loc_punt_return,
                                                loc_time_elapsed):
            # Only rows that have both a spot and a return spot are punts.
            if np.isnan(punt_spot) or np.isnan(return_spot):
                continue
            # Bucket by ten-yard band; the float result hashes like the
            # matching int key.
            punt_range = np.floor(punt_spot / 10) * 10
            punt_kickrange[punt_range] += 1
            if return_spot == 80:
                punt_touchback[punt_range] += 1
            else:
                self.punt_dist.append(return_spot - (100 - punt_spot))
            if not np.isnan(time):
                self.elapsed_time['punt'].append(time)

        self.punt_alpha, self.punt_loc, self.punt_beta = stats.gamma.fit(
            self.punt_dist)

        self.punt_touchback_pct = {}
        for key, value in punt_kickrange.items():
            if value != 0:
                self.punt_touchback_pct[key] = punt_touchback[key] / value

        ###########################
        # Field Goal Calculations #
        ###########################
        for fg_success, fg_distance, fg_play_type, time in zip(
                loc_fg_success, loc_fg_distance, loc_fg_play_type,
                loc_time_elapsed):
            if fg_play_type != 'Field Goal':
                continue
            marker = np.floor(fg_distance / 10) * 10
            # Skip NaN or out-of-range distances instead of raising KeyError.
            if marker not in field_goal_attempts:
                continue
            if not np.isnan(time):
                self.elapsed_time['fg'].append(time)
            field_goal_attempts[marker] += 1
            if fg_success == 'Good':
                field_goal_successes[marker] += 1

        for key, value in field_goal_attempts.items():
            if value > 0:
                self.field_goal_pct[key] = field_goal_successes[key] / value
            else:
                self.field_goal_pct[key] = 0

        #######################
        # Fumble Calculations #
        #######################
        play_count = len(loc_fumble)
        for i, fumble in enumerate(loc_fumble):
            fumble_play_type = loc_play_type_fumble[i]
            if fumble_play_type == 'Pass':
                total_pass += 1
            elif fumble_play_type == 'Run':
                total_runs += 1
            else:
                continue

            # The outcome of a fumble is judged from the team's next play, so
            # a fumble on the very last play cannot be classified.
            if fumble != 1 or i + 1 >= play_count:
                continue
            if loc_gameID[i + 1] != loc_gameID[i]:
                continue
            current_drive = loc_drive[i]
            next_drive = loc_drive[i + 1]
            if next_drive == current_drive or next_drive == current_drive + 1:
                # Next play is in the same or the immediately following
                # drive: not counted as a fumble.
                continue
            if fumble_play_type == 'Pass':
                total_pass_fumbles += 1
            else:
                total_run_fumbles += 1

        self.pass_fumble_pct = total_pass_fumbles / total_pass
        self.run_fumble_pct = total_run_fumbles / total_runs

        #############################
        # Pass and Run Calculations #
        #############################
        for pass_outcome, yrds_gained, play_type, interception, time in zip(
                loc_pass_outcome, loc_yrds_gained, loc_play_type,
                loc_interception, loc_play_time):
            has_time = not np.isnan(time)
            if play_type == 'Pass' or play_type == 'Sack':
                self.pass_or_sack += 1
                if play_type == 'Sack':
                    self.num_sacks += 1
                    self.sack_dist.append(yrds_gained)
                if play_type == 'Pass':
                    self.total_passes += 1
                    if pass_outcome == "Complete":
                        self.total_completions += 1
                        self.pass_list.append(yrds_gained)
                        if has_time:
                            self.elapsed_time['pass_good'].append(time)
                    else:
                        if has_time:
                            self.elapsed_time['pass_nogood'].append(time)
                        if interception == 1:
                            self.total_interceptions += 1
            elif play_type == 'Run':
                if has_time:
                    self.elapsed_time['run'].append(time)
                self.rush_list.append(yrds_gained)

        # Kernel density estimate of the elapsed time for every play kind.
        self.time_kde = {}
        for kind in ('pass_good', 'pass_nogood', 'punt', 'run', 'fg'):
            self.time_kde[kind] = stats.gaussian_kde(self.elapsed_time[kind],
                                                     bw_method=.2)

        self.pass_complete_pct = self.total_completions / self.total_passes
        self.pass_alpha, self.pass_loc, self.pass_beta = stats.gamma.fit(
            self.pass_list)
        self.run_alpha, self.run_loc, self.run_beta = stats.gamma.fit(
            self.rush_list)
        self.sack_pct = self.num_sacks / self.pass_or_sack
        self.sack_yrds_mean = np.mean(self.sack_dist)
        self.sack_yrds_std = np.std(self.sack_dist)
        self.interception_pct = self.total_interceptions / self.total_passes

        #############
        # Debugging #
        #############
        if debug_probs:
            pass_x = np.arange(0, 40, .1)
            g1 = gamma.pdf(x=pass_x,
                           a=self.pass_alpha,
                           loc=self.pass_loc,
                           scale=self.pass_beta)
            run_x = np.arange(-10, 20, .1)
            g2 = gamma.pdf(x=run_x,
                           a=self.run_alpha,
                           loc=self.run_loc,
                           scale=self.run_beta)
            punt_x = np.arange(-10, 80, 1)
            g3 = gamma.pdf(x=punt_x,
                           a=self.punt_alpha,
                           loc=self.punt_loc,
                           scale=self.punt_beta)

            # ``density=True`` replaces ``normed=True``, which was removed
            # from Matplotlib's ``hist``.
            fig2 = plt.figure()
            ax1 = fig2.add_subplot(2, 1, 1)
            ax1.plot(pass_x, g1)
            ax1.hist(self.pass_list, bins=20, density=True)
            ax1.set_xlabel('Pass Yards')
            ax1.set_ylabel('Probability')

            ax2 = fig2.add_subplot(2, 1, 2)
            ax2.plot(run_x, g2)
            ax2.hist(self.rush_list, 20, density=True)
            ax2.set_xlabel('Rush Yards')
            ax2.set_ylabel('Probability')
            fig2.show()

            fig3 = plt.figure()
            ax3 = fig3.add_subplot(1, 1, 1)
            ax3.plot(punt_x, g3)
            ax3.hist(self.punt_dist, bins=20, density=True)
            fig3.show()

            fig6 = plt.figure()
            ax6 = fig6.add_subplot(1, 1, 1)
            print('TIMES', self.elapsed_time)
            for key, value in self.elapsed_time.items():
                ax6.hist(value, histtype='step', label=key)
            ax6.legend()
            fig6.show()
class MultiLabeller(semisupervisor.SemiSupervisor):
    """
    A widget for assigning more than one label to each data point.

    This class is designed to label data for (semi-)supervised learning
    algorithms. It lets you label data and, when a classifier is supplied,
    re-train that classifier on the labels collected so far.

    Parameters
    ----------
    connection_string: str
        A SQLAlchemy-compatible database connection string. This is where the
        data for this widget will be stored, and where it will be retrieved
        from for labelling.
    features : list, np.ndarray, pd.Series, pd.DataFrame, optional
        An array or sequence of data in which each element (if 1D) or each
        row (if 2D) represents one data point for which you'd like to
        generate labels.
    labels : list, np.ndarray, pd.Series, pd.DataFrame, optional
        If you already have some labels, but would like to re-label some,
        then you can pass these in as labels.
    options : tuple, list
        The options presented for labelling.
    classifier : sklearn.base.ClassifierMixin, optional
        An object that implements the standard sklearn fit/predict methods.
        If provided, a button for retraining the model is shown, and the
        model performance under k-fold crossvalidation can be read as you go
        along. Anything that is not already a ``MultiOutputClassifier`` is
        wrapped in one.
    display_func : callable, optional
        A function that will be used to display the data. This function
        should take in two arguments, first the data to display, and second
        the number of data points to display (set to 1 for this class).
    eval_method : callable, optional
        A function that accepts the classifier, features, and labels as input
        and returns a dictionary of values that contain the key 'test_score'.
        The default is sklearn.model_selection.cross_validate, with cv=3. Use
        functools.partial to create a function with its parameters fixed.
    reorder : str, callable, optional
        One of the reordering algorithms specified in
        :py:mod:`superintendent.prioritisation`. This describes a function
        that receives input in the shape of n_samples, n_labels and
        calculates the priority in terms of information value in labelling a
        data point.
    shuffle_prop : float
        The proportion of points that are shuffled when the data points are
        re-ordered (see reorder keyword-argument). This controls the
        "exploration vs exploitation" trade-off - the higher, the more you
        explore the feature space randomly, the lower, the more you exploit
        your current weak points.
    keyboard_shortcuts : bool, optional
        If you want to enable ipyevent-mediated keyboard capture to use the
        keyboard rather than the mouse to submit data.
    """

    def __init__(self, *args, **kwargs):
        """
        Set up the multi-label widget.

        All arguments are forwarded to the parent constructor except
        ``reorder``, which is resolved here. The input widget created by the
        parent is then replaced by a multi-class submitter.
        """
        # "reorder" is handled by this class, so it is not passed upwards.
        reorder = kwargs.pop("reorder", None)

        super().__init__(*args, **kwargs)

        # Detach the key handler of the input widget that is about to be
        # replaced further down.
        if self.event_manager is not None:
            self.event_manager.on_dom_event(
                self.input_widget._on_key_down, remove=True
            )

        # Several labels per data point need a multi-output model.
        needs_wrapping = self.classifier is not None and not isinstance(
            self.classifier, MultiOutputClassifier
        )
        if needs_wrapping:
            self.classifier = MultiOutputClassifier(self.classifier, n_jobs=-1)

        # Resolve the reordering strategy: nothing, a registered name, or a
        # user-supplied callable.
        if reorder is None:
            self.reorder = None
        elif isinstance(reorder, str):
            if reorder not in prioritisation.functions:
                raise NotImplementedError(
                    "Unknown reordering function '{}'.".format(reorder)
                )
            self.reorder = prioritisation.functions[reorder]
        elif callable(reorder):
            self.reorder = reorder
        else:
            raise ValueError(
                "The reorder argument needs to be either a function or the "
                "name of a function listed in superintendent.prioritisation."
            )

        submitter_options = {
            "hint_function": kwargs.get("hint_function"),
            "hints": kwargs.get("hints"),
            "options": kwargs.get("options", ()),
            "max_buttons": kwargs.get("max_buttons", 12),
        }
        self.input_widget = controls.MulticlassSubmitter(**submitter_options)
        self.input_widget.on_submission(self._apply_annotation)
        if self.event_manager is not None:
            self.event_manager.on_dom_event(self.input_widget._on_key_down)
        self._compose()

    def retrain(self, *args):
        """Retrain the classifier you passed when creating this widget.

        The labelled data is binarised with a ``MultiLabelBinarizer``,
        scored through ``eval_method`` (the score is shown in the widget)
        and then used to fit the classifier. If a reordering function is
        set, the unlabelled data is re-prioritised from the predicted
        probabilities.

        Raises
        ------
        ValueError
            If no classifier was provided.
        """
        if self.classifier is None:
            raise ValueError("No classifier to retrain.")

        if len(self.queue.list_labels()) < 1:
            self.model_performance.value = (
                "Score: Not enough labels to retrain."
            )
            return

        _, labelled_X, labelled_y = self.queue.list_completed()

        # Turn the per-item label collections into a binary indicator matrix.
        binarizer = MultiLabelBinarizer()
        labelled_y = binarizer.fit_transform(labelled_y)

        self._render_processing(message="Retraining... ")

        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self.performance = self.eval_method(
                    self.classifier, labelled_X, labelled_y
                )
                mean_score = self.performance["test_score"].mean()
                self.model_performance.value = "Score: {:.2f}".format(
                    mean_score
                )
        except ValueError:  # pragma: no cover
            self.performance = "Could not evaluate"
            self.model_performance.value = "Score: {}".format(self.performance)

        self.classifier.fit(labelled_X, labelled_y)

        if self.reorder is not None:
            ids, unlabelled_X = self.queue.list_uncompleted()
            probabilities = self.classifier.predict_proba(unlabelled_X)

            reordering = list(
                self.reorder(probabilities, shuffle_prop=self.shuffle_prop)
            )
            # Pair every unlabelled id with its new priority, keeping order.
            self.queue.reorder(OrderedDict(zip(ids, reordering)))

        # Step back and skip once so the widget refreshes the shown item.
        self.queue.undo()
        self._annotation_loop.send({"source": "__skip__"})
class ModelTrainer:
    """
    Train, evaluate and use a machine learning model based on the input yaml config.

    Creating an instance validates the ``cmd`` keyword argument, loads the
    configuration that the command needs and then immediately calls the
    method named after the command.
    """
    RAND_SEED = 42  # seed used when the user asks for reproducibility without giving one
    input_cmds = ('fit', 'evaluate', 'predict', 'experiment')
    supported_types = ('regression', 'classification', 'clustering')
    results_path = configs.get('results_path')  # path to the results folder
    default_model_path = configs.get('default_model_path')  # path to the pre-fitted model
    description_file = configs.get('description_file')  # path to the description.json file
    evaluation_file = configs.get('evaluation_file')  # path to the evaluation.json file
    prediction_file = configs.get('prediction_file')  # path to the predictions.csv
    default_dataset_props = configs.get('dataset_props')  # dataset props that can be changed from the yaml file
    default_model_props = configs.get('model_props')  # model props that can be changed from the yaml file
    model = None

    def __init__(self, *args, **kwargs) -> None:
        """
        Read the options and run the requested command.

        Keyword arguments read here: ``data_path``, ``logfile``, ``cmd``,
        ``results_path``, ``yaml_path`` (fit only) and ``model_path``
        (every command except fit).

        Raises:
            Exception: if ``cmd`` is missing or not one of ``input_cmds``, or
                if ``cmd`` is "fit" and no ``yaml_path`` was given.
        """
        self.data_path: str = kwargs.get('data_path', None)
        self.logfile = kwargs.get('logfile', None)
        self.command = kwargs.get('cmd', None)
        self.results_path = kwargs.get('results_path', None)  # path to the results folder
        self._x_columns = None

        # results_path as specified input
        if self.results_path is None:
            self.results_path = ModelTrainer.results_path  # path to the results folder
        else:
            # a custom results folder relocates every output file into it
            self.default_model_path = os.path.join(self.results_path, configs.get('model_file'))
            self.description_file = os.path.join(self.results_path, 'description.json')
            self.evaluation_file = os.path.join(self.results_path, 'evaluation.json')
            # NOTE(review): predict() writes CSV content into this file even though
            # it is named *.json; the name is kept so existing consumers still find it.
            self.prediction_file = os.path.join(self.results_path, 'prediction.json')

        logger.info(f"Entered kwargs: {kwargs}")

        if not self.command or self.command not in self.input_cmds:
            raise Exception(f"You must enter a valid command.\n"
                            f"available commands: {self.input_cmds}")

        if self.command == "fit":
            self.yml_path = kwargs.get('yaml_path', None)
            if not self.yml_path:
                raise Exception("You need to provide a yaml_path to fit a model")

            file_ext = self.yml_path.split('.')[-1]
            logger.info(f"You passed the configurations as a {file_ext} file.")
            # both common YAML extensions are accepted; anything else is read as JSON
            if file_ext.lower() in ('yaml', 'yml'):
                self.yaml_configs = read_yaml(self.yml_path)
            else:
                self.yaml_configs = read_json(self.yml_path)
            logger.info(f"your chosen configuration: {self.yaml_configs}")

            # dataset options given by the user
            self.dataset_props: dict = self.yaml_configs.get('dataset', self.default_dataset_props)
            # model options given by the user
            self.model_props: dict = self.yaml_configs.get('model', self.default_model_props)
            # list of target(s) to predict
            self.target: list = self.yaml_configs.get('target', None)
            # list of obs_id(s) to identify observation
            self.observation_id: list = self.yaml_configs.get('observation_id', None)

            self.model_type: str = self.model_props.get('type', None)
            logger.info(f"dataset_props: {self.dataset_props} \n"
                        f"model_props: {self.model_props} \n "
                        f"target: {self.target} \n")

            # handle random numbers generation
            random_num_options = self.dataset_props.get('random_numbers', None)
            if random_num_options:
                generate_reproducible = random_num_options.get('generate_reproducible', None)
                if generate_reproducible:
                    logger.info("You provided the generate reproducible results option.")
                    seed = random_num_options.get('seed', self.RAND_SEED)
                    np.random.seed(seed)
                    logger.info(f"Setting a seed = {seed} to generate same random numbers on each experiment..")

        # if entered command is evaluate or predict, then the pre-fitted model needs to be loaded and used
        else:
            self.model_path = kwargs.get('model_path', self.default_model_path)
            logger.info(f"path of the pre-fitted model => {self.model_path}")
            # load description file to read stored training parameters;
            # fit() writes it as UTF-8, so it is read back the same way
            with open(self.description_file, 'r', encoding='utf-8') as f:
                dic = json.load(f)
            self.target: list = dic.get("target")  # target to predict as a list
            self.model_type: str = dic.get("type")  # type of the model -> regression or classification
            self.dataset_props: dict = dic.get('dataset_props')  # dataset props entered while fitting

        # dispatch to the method named after the command
        getattr(self, self.command)()

    def _create_model(self, **kwargs):
        """
        fetch a model depending on the provided type and algorithm by the user and return it

        The constructor arguments come from the ``arguments`` entry of the
        model props when it is a non-empty dict; otherwise ``kwargs`` are
        passed to the model class.

        @return: tuple of (model instance, arguments dict or None)
        @raise Exception: if type/algorithm are missing or unknown, or no
            model class is registered for the algorithm
        """
        model_type: str = self.model_props.get('type')
        model_algorithm: str = self.model_props.get('algorithm')
        use_cv = self.model_props.get('use_cv_estimator', None)

        if not model_type or not model_algorithm:
            raise Exception(f"model_type and algorithm cannot be None")

        algorithms: dict = models_dict.get(model_type)  # extract all algorithms as a dictionary
        if not algorithms:
            raise Exception(f"model type '{model_type}' is not supported. "
                            f"supported types: {self.supported_types}")
        model = algorithms.get(model_algorithm)  # extract model class depending on the algorithm
        logger.info(f"Solving a {model_type} problem using ===> {model_algorithm}")
        if not model:
            raise Exception("Model not found in the algorithms list")

        # only a non-empty dict counts as arguments; "default", empty values
        # and any other type mean "use the model defaults"
        model_props_args = self.model_props.get('arguments', None)
        model_args = model_props_args if model_props_args and isinstance(model_props_args, dict) else None

        if use_cv:
            model_class = model.get('cv_class', None)
            if model_class:
                logger.info(f"cross validation estimator detected. "
                            f"Switch to the CV version of the {model_algorithm} algorithm")
            else:
                logger.info(f"No CV class found for the {model_algorithm} algorithm")
                # fall back to the regular estimator instead of calling None
                model_class = model.get('class')
        else:
            model_class = model.get('class')

        if model_class is None:
            raise Exception(f"No model class registered for the {model_algorithm} algorithm")

        logger.info(f"model arguments: \n"
                    f"{self.model_props.get('arguments')}")
        model = model_class(**kwargs) if not model_args else model_class(**model_args)
        return model, model_args

    def _save_model(self, model):
        """
        save the model to a binary file

        @param model: model to save
        @return: bool, True when the model was written, False when the
            results folder could not be created
        """
        try:
            if not os.path.exists(self.results_path):
                logger.info(f"creating model_results folder to save results...\n"
                            f"path of the results folder: {self.results_path}")
                os.makedirs(self.results_path, exist_ok=True)
                logger.info(f"Successfully created the directory in {self.results_path} ")
            else:
                logger.info(f"Folder {self.results_path} already exists")
                logger.warning(f"data in the {self.results_path} folder will be overridden. If you don't "
                               f"want this, then move the current {self.results_path} to another path")
        except OSError:
            logger.exception(f"Creating the directory {self.results_path} failed ")
            return False

        with open(self.default_model_path, 'wb') as model_file:
            pickle.dump(model, model_file)
        return True

    def _load_model(self, f: str = ''):
        """
        load a saved model from file

        @param f: path to model; the default model path is used when empty
        @return: loaded model, or None when the file does not exist
        """
        model_path = f if f else self.default_model_path
        if not f:
            logger.info(f"result path: {self.results_path} ")
            logger.info(f"loading model form {self.default_model_path} ")
        else:
            logger.info(f"loading from {f}")

        try:
            # SECURITY: unpickling executes arbitrary code; only load model
            # files that come from a trusted source.
            with open(model_path, 'rb') as model_file:
                return pickle.load(model_file)
        except FileNotFoundError:
            logger.error(f"File not found in {model_path} ")
            return None

    @staticmethod
    def _predict_scores(model, x):
        """
        return model.predict_proba(x), or None if the estimator offers no probabilities

        Some estimators have no usable predict_proba and raise AttributeError
        or ValueError when it is accessed or called.
        """
        try:
            return model.predict_proba(x)
        except (AttributeError, ValueError) as e:
            logger.warning(f"probability estimates are not available for this model: {e}")
            return None

    def _prepare_clustering_data(self):
        """
        preprocess data for the clustering algorithm
        """
        return self._process_data(target='fit_cluster')

    def _prepare_predict_data(self):
        """
        preprocess predict data to get similar data to the one used when training the model
        """
        return self._process_data(target='predict')

    def _prepare_fit_data(self):
        """
        preprocess the data used for fitting a regression or classification model
        """
        return self._process_data(target='fit')

    def _prepare_eval_data(self):
        """
        preprocess the data used for evaluating a pre-fitted model
        """
        return self._process_data(target='evaluate')

    def _process_data(self, target='fit'):
        """
        read the dataset from self.data_path, preprocess it and return it in the shape the caller needs

        @param target: one of 'fit', 'evaluate', 'predict' or 'fit_cluster'
        @return: x only for 'predict' and 'fit_cluster'; (x, y) for 'evaluate';
            (x_train, y_train, x_test, y_test) for 'fit', where the test parts
            are None when no split options are configured
        @raise Exception: any error raised while preparing the data is logged
            and re-raised
        """
        # clustering has no target, so the target checks only apply to supervised models
        if self.model_type != "clustering":
            assert isinstance(self.target, list), "provide target(s) as a list in the yaml file"
            assert len(self.target) > 0, "please provide at least a target to predict"

        try:
            read_data_options = self.dataset_props.get('read_data_options', None)
            dataset = pd.read_csv(self.data_path, **(read_data_options or {}))
            logger.info(f"dataset shape: {dataset.shape}")
            attributes = list(dataset.columns)
            logger.info(f"dataset attributes: {attributes}")

            # handle missing values in the dataset
            preprocess_props = self.dataset_props.get('preprocess', None)
            if preprocess_props:
                # handle encoding
                encoding = preprocess_props.get('encoding')
                if encoding:
                    encoding_type = encoding.get('type', None)
                    column = encoding.get('column', None)
                    if column in attributes:
                        dataset, classes_map = encode(df=dataset,
                                                      encoding_type=encoding_type.lower(),
                                                      column=column)
                        if classes_map:
                            self.dataset_props['label_encoding_classes'] = classes_map
                            logger.info(f"adding classes_map to dataset props: \n{classes_map}")
                        logger.info(f"shape of the dataset after encoding => {dataset.shape}")

                # preprocessing strategy: mean, median, mode etc..
                strategy = preprocess_props.get('missing_values')
                if strategy:
                    dataset = handle_missing_values(dataset, strategy=strategy)
                    logger.info(f"shape of the dataset after handling missing values => {dataset.shape}")

            if target == 'predict' or target == 'fit_cluster':
                # no target column is split off here: the whole dataset is the input
                x = _reshape(dataset.to_numpy())
                if not preprocess_props:
                    return x
                scaling_props = preprocess_props.get('scale', None)
                if not scaling_props:
                    return x
                scaling_method = scaling_props.get('method', None)
                return normalize(x, method=scaling_method)

            if any(col not in attributes for col in self.target):
                raise Exception("chosen target(s) to predict must exist in the dataset")

            # remove target variable(s) from dataset & concat them
            y = pd.concat([dataset.pop(col) for col in self.target], axis=1)
            x = _reshape(dataset.to_numpy())
            y = _reshape(y.to_numpy())
            logger.info(f"y shape: {y.shape} and x shape: {x.shape}")

            self._x_columns = dataset.columns.to_list()
            logger.info(f"X columns: {self._x_columns}")

            # handle data scaling
            if preprocess_props:
                scaling_props = preprocess_props.get('scale', None)
                if scaling_props:
                    scaling_method = scaling_props.get('method', None)
                    scaling_target = scaling_props.get('target', None)
                    if scaling_target == 'all':
                        x = normalize(x, method=scaling_method)
                        y = normalize(y, method=scaling_method)
                    elif scaling_target == 'inputs':
                        x = normalize(x, method=scaling_method)
                    elif scaling_target == 'outputs':
                        y = normalize(y, method=scaling_method)

            if target == 'evaluate':
                return x, y

            split_options = self.dataset_props.get('split', None)
            if not split_options:
                return x, y, None, None

            test_size = split_options.get('test_size')
            shuffle = split_options.get('shuffle')
            stratify = split_options.get('stratify')
            x_train, x_test, y_train, y_test = train_test_split(
                x,
                y,
                test_size=test_size,
                shuffle=shuffle,
                stratify=None if not stratify or stratify.lower() == "default" else stratify)
            return x_train, y_train, x_test, y_test

        except Exception as e:
            logger.exception(f"error occured while preparing the data: {e.args}")
            # re-raise: returning None here only made the callers fail later
            # with an unrelated error
            raise

    def get_evaluation(self, model, x_test, y_true, y_pred, y_score, **kwargs):
        """
        evaluate the model, falling back to the score only if the full evaluation fails

        @return: whatever evaluate_model returns
        """
        try:
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 y_score=y_score,
                                 get_score_only=False,
                                 **kwargs)
        except Exception as e:
            logger.debug(e)
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 y_score=y_score,
                                 get_score_only=True,
                                 **kwargs)
        return res

    def fit(self, **kwargs):
        """
        fit a model, save it and write a description of the run to the description file

        Cross validation and hyperparameter search run first when they are
        configured in the model props. The model is evaluated on the test
        split when one is configured, otherwise on the training data.

        Raises:
            Exception: if the data cannot be prepared or the model cannot be
                created
        """
        y_train = None
        x_test = None
        y_test = None
        cv_results = None
        eval_results = None
        cv_params = None
        hp_search_results = {}

        if self.model_type == 'clustering':
            x_train = self._prepare_clustering_data()
        else:
            x_train, y_train, x_test, y_test = self._prepare_fit_data()

        self.model, model_args = self._create_model(**kwargs)
        logger.info(f"executing a {self.model.__class__.__name__} algorithm...")

        # convert to multioutput if there is more than one target to predict:
        if self.model_type != 'clustering' and len(self.target) > 1:
            logger.info(f"predicting multiple targets detected. Hence, the model will be automatically "
                        f"converted to a multioutput model")
            if self.model_type == 'classification':
                self.model = MultiOutputClassifier(self.model)
            else:
                self.model = MultiOutputRegressor(self.model)

        if self.model_type != 'clustering':
            cv_params = self.model_props.get('cross_validate', None)
            if not cv_params:
                logger.info(f"cross validation is not provided")
            else:
                # perform cross validation
                logger.info("performing cross validation ...")
                cv_results = cross_validate(estimator=self.model,
                                            X=x_train,
                                            y=y_train,
                                            **cv_params)

            hyperparams_props = self.model_props.get('hyperparameter_search', None)
            if hyperparams_props:
                # perform hyperparameter search
                method = hyperparams_props.get('method', None)
                grid_params = hyperparams_props.get('parameter_grid', None)
                hp_args = hyperparams_props.get('arguments', None)
                logger.info(f"Performing hyperparameter search using -> {method}")
                logger.info(f"Grid parameters entered by the user: {grid_params}")
                logger.info(f"Additional hyperparameter arguments: {hp_args}")
                # the 'arguments' entry is optional, so None must not be unpacked
                best_estimator, best_score, best_params = hyperparameter_search(
                    model=self.model,
                    method=method,
                    params=grid_params,
                    x_train=x_train,
                    y_train=y_train,
                    **(hp_args or {}))
                hp_search_results['best_params'] = best_params
                hp_search_results['best_score'] = best_score
                self.model = best_estimator

            self.model.fit(x_train, y_train)
        else:  # if the model type is clustering
            self.model.fit(x_train)

        saved = self._save_model(self.model)
        if saved:
            logger.info(f"model saved successfully and can be found in the {self.results_path} folder")
        else:
            logger.warning(f"the model could not be saved in the {self.results_path} folder")

        if self.model_type == 'clustering':
            eval_results = self.model.score(x_train)
        elif x_test is None:
            logger.info(f"no split options was provided. training score will be calculated")
            eval_results = self.model.score(x_train, y_train)
        else:
            logger.info(f"split option detected. The performance will be automatically evaluated "
                        f"using the test data portion")
            y_pred = self.model.predict(x_test)
            # y_score stays None for classifiers without probability estimates
            y_score = self._predict_scores(self.model, x_test) if self.model_type == 'classification' else None
            eval_results = self.get_evaluation(model=self.model,
                                               x_test=x_test,
                                               y_true=y_test,
                                               y_pred=y_pred,
                                               y_score=y_score,
                                               **kwargs)

        fit_description = {
            "model": self.model.__class__.__name__,
            "arguments": model_args if model_args else "default",
            "type": self.model_props['type'],
            "algorithm": self.model_props['algorithm'],
            "dataset_props": self.dataset_props,
            "model_props": self.model_props,
            "data_path": self.data_path,
            "train_data_shape": x_train.shape,
            "test_data_shape": None if x_test is None else x_test.shape,
            "train_data_size": x_train.shape[0],
            "test_data_size": None if x_test is None else x_test.shape[0],
            "results_path": str(self.results_path),
            "model_path": str(self.default_model_path),
            "target": None if self.model_type == 'clustering' else self.target,
            "results_on_test_data": eval_results,
            "hyperparameter_search_results": hp_search_results
        }

        if self.model_type == 'clustering':
            # not every clustering estimator exposes both attributes, and
            # arrays must become lists before they can be dumped as JSON
            clustering_res = {}
            for key, attr in (("cluster_centers", "cluster_centers_"),
                              ("cluster_labels", "labels_")):
                value = getattr(self.model, attr, None)
                clustering_res[key] = value.tolist() if hasattr(value, 'tolist') else value
            fit_description['clustering_results'] = clustering_res

        if cv_params:
            # keep every array-valued entry (fit_time, score_time, test_score, ...)
            # so that runs scored with several metrics are stored as well
            cv_res = {
                key: value.tolist()
                for key, value in cv_results.items() if hasattr(value, 'tolist')
            }
            fit_description['cross_validation_params'] = cv_params
            fit_description['cross_validation_results'] = cv_res

        try:
            logger.info(f"saving fit description to {self.description_file}")
            with open(self.description_file, 'w', encoding='utf-8') as f:
                json.dump(fit_description, f, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(f"Error while storing the fit description file: {e}")

    def evaluate(self, **kwargs):
        """
        evaluate a pre-fitted model and save results to a evaluation.json

        Errors are logged instead of being raised.

        @return: None
        """
        try:
            # honour a user supplied model_path, like predict() does; an empty
            # value makes _load_model use the default model path
            model = self._load_model(f=getattr(self, 'model_path', ''))
            if model is None:
                raise Exception("no pre-fitted model could be loaded")

            if self.model_type != 'clustering':
                x_val, y_true = self._prepare_eval_data()
                y_pred = model.predict(x_val)
                y_score = self._predict_scores(model, x_val) if self.model_type == 'classification' else None
                eval_results = self.get_evaluation(model=model,
                                                   x_test=x_val,
                                                   y_true=y_true,
                                                   y_pred=y_pred,
                                                   y_score=y_score,
                                                   **kwargs)
            else:
                x_val = self._prepare_clustering_data()
                y_pred = model.predict(x_val)
                eval_results = model.score(x_val, y_pred)

            logger.info(f"saving evaluation results to {self.evaluation_file}")
            with open(self.evaluation_file, 'w', encoding='utf-8') as f:
                json.dump(eval_results, f, ensure_ascii=False, indent=4)

        except Exception as e:
            logger.exception(f"error occured during evaluation: {e}")

    def predict(self):
        """
        use a pre-fitted model to make predictions and save them as csv

        For binary classification the probability of the second class is
        stored when the model provides probabilities, otherwise the predicted
        labels are stored. Errors are logged instead of being raised.

        @return: None
        """
        try:
            model = self._load_model(f=self.model_path)
            if model is None:
                raise Exception("no pre-fitted model could be loaded")

            x_val = self._prepare_predict_data()  # the same is used for clustering
            y_pred = model.predict(x_val)

            y_score = None
            if type_of_target(y_pred) == 'binary' and self.model_type == 'classification':
                y_score = self._predict_scores(model, x_val)
            y_pred = _reshape(y_score[:, 1]) if y_score is not None else _reshape(y_pred)

            logger.info(f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}")
            logger.info(f"predict on targets: {self.target}")
            # models fitted without a target (clustering) get one generic column
            columns = self.target if self.target else ['prediction']
            df_pred = pd.DataFrame.from_dict({
                name: y_pred[:, i] if len(y_pred.shape) > 1 else y_pred
                for i, name in enumerate(columns)
            })

            logger.info(f"saving the predictions to {self.prediction_file}")
            df_pred.to_csv(self.prediction_file)

        except Exception as e:
            logger.exception(f"Error while preparing predictions: {e}")

    @staticmethod
    def create_init_config_file(model_type=None, model_name=None, target=None, *args, **kwargs):
        """
        create a default configuration file that the user can adapt

        @param model_type: type of the model to put in the file
        @param model_name: algorithm to put in the file
        @param target: whitespace separated target name(s)
        @raise Exception: if no path for the init file is configured
        """
        path = configs.get('init_file_path', None)
        if not path:
            raise Exception("You need to provide a path for the init file")

        dataset_props = ModelTrainer.default_dataset_props
        # work on a copy: the defaults are shared by the whole class and must
        # not be changed by the user's selection
        model_props = dict(ModelTrainer.default_model_props or {})
        if model_type:
            logger.info(f"user selected model type = {model_type}")
            model_props['type'] = model_type
        if model_name:
            logger.info(f"user selected algorithm = {model_name}")
            model_props['algorithm'] = model_name

        logger.info(f"initalizing a default ModelTrainer.yaml in {path}")
        default_data = {
            "dataset": dataset_props,
            "model": model_props,
            "target": ['provide your target(s) here'] if not target else target.split()
        }
        created = create_yaml(default_data, path)
        if created:
            logger.info(f"a default Model.yaml is created for you in {path}. "
                        f"you just need to overwrite the values to meet your expectations")
        else:
            logger.warning(f"something went wrong while initializing a default file")