def test_multi_output_classification_partial_fit():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), first_predictions.shape)

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), second_predictions.shape)

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
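# Minimal standalone sketch of the same partial_fit pattern, with made-up toy
# data and names (X_demo, y_demo are illustrative only): the first partial_fit
# call must receive `classes`, a list with one array of class labels per output
# column, because a partial batch may not contain every label.
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

X_demo = np.random.RandomState(0).rand(20, 4)               # toy features
y_demo = np.random.RandomState(1).randint(0, 3, (20, 2))    # two integer targets

clf = MultiOutputClassifier(SGDClassifier(random_state=1))
# first call: list every label each output can take
clf.partial_fit(X_demo[:10], y_demo[:10], classes=[np.arange(3), np.arange(3)])
# later calls can omit classes
clf.partial_fit(X_demo[10:], y_demo[10:])
print(clf.predict(X_demo[:3]))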
def test_multi_output_classification():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict, predict_proba and score
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)
    predictions = multi_target_forest.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    predict_proba = multi_target_forest.predict_proba(X)

    assert len(predict_proba) == n_outputs
    for class_probabilities in predict_proba:
        assert_equal((n_samples, n_classes), class_probabilities.shape)

    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1),
                       predictions)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, i])
        assert_equal(list(forest_.predict(X)), list(predictions[:, i]))
        assert_array_equal(list(forest_.predict_proba(X)),
                           list(predict_proba[i]))
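# As the test above relies on, MultiOutputClassifier.predict_proba returns a
# Python list with one (n_samples, n_classes_i) array per output rather than a
# single stacked array. A small hedged sketch on toy data (names invented here):
import numpy as np
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

X_toy, y_toy = make_multilabel_classification(n_classes=3, random_state=0)
clf = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=10, random_state=1)).fit(X_toy, y_toy)

proba = clf.predict_proba(X_toy)
print(type(proba), len(proba))   # list, one entry per output
print(proba[0].shape)            # (n_samples, 2) for a binary output
# per-output labels can be recovered from each array, e.g. for output 0:
print((proba[0][:, 1] > 0.5).astype(int)[:5])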
def test_multi_output_classification_partial_fit_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    w = np.asarray([2., 1., 1.])
    sgd_linear_clf = SGDClassifier(random_state=1)
    clf_w = MultiOutputClassifier(sgd_linear_clf)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [3, 2], [2, 3], [3, 2]]
    sgd_linear_clf = SGDClassifier(random_state=1)
    clf = MultiOutputClassifier(sgd_linear_clf)
    clf.fit(X, y)

    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
def test_multi_output_classification_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3, 2], [2, 3]]
    w = np.asarray([2., 1.])
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf_w = MultiOutputClassifier(forest)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3, 2], [3, 2], [2, 3]]
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf = MultiOutputClassifier(forest)
    clf.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
def test_multiclass_multioutput_estimator():
    # test to check meta of meta estimators
    svc = LinearSVC(random_state=0)
    multi_class_svc = OneVsRestClassifier(svc)
    multi_target_svc = MultiOutputClassifier(multi_class_svc)

    multi_target_svc.fit(X, y)

    predictions = multi_target_svc.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    # train the classifier with each column and assert that predictions are equal
    for i in range(3):
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, i])
        assert_equal(list(multi_class_svc_.predict(X)),
                     list(predictions[:, i]))
class GOClassifier:
    def __init__(self, X, y, random_seed=11, test_size=0.25, *args, **kwargs):
        ind = np.arange(X.shape[0])
        np.random.seed(random_seed)
        np.random.shuffle(ind)
        self.X = X[ind]
        self.y = y[ind]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_seed)
        self.random_seed = random_seed
        self.args = args
        self.kwargs = kwargs
        self.clf = None

    def fit(self, X=None, y=None):
        X_ = X if X is not None else self.X_train
        y_ = y if y is not None else self.y_train
        self.clf = MultiOutputClassifier(
            SGDClassifier(alpha=0.0001, max_iter=1000, tol=1e-3,
                          random_state=self.random_seed,
                          *self.args, **self.kwargs))
        self.clf.fit(X_, y_)
        return self.clf

    def predict(self, X=None):
        assert self.clf is not None
        X_ = X if X is not None else self.X
        return self.clf.predict(X_)

    def test_predict(self):
        return self.predict(X=self.X_test)

    def score(self, X, y):
        assert self.clf is not None
        return self.clf.score(X, y)

    def test_score(self):
        assert self.clf is not None
        return self.clf.score(self.X_test, self.y_test)

    def train_score(self):
        assert self.clf is not None
        return self.clf.score(self.X_train, self.y_train)
def classify(method, h_features, h_labels, val_iter=10):
    print('X shape:', h_features.shape)
    print('y shape:', h_labels.shape)
    print('Training -> {0} - Classifier will run {1} times'.format(
        str(method), val_iter))

    accuracy = []
    class_stats = []
    f1_micro, f1_macro = [], []
    recall_micro, recall_macro = [], []
    precision_micro, precision_macro = [], []

    for iter_idx in range(val_iter):
        print('run - - - - - - - - - {0} at: {1} '.format(
            iter_idx + 1, datetime.now()))
        X_train, X_test, y_train, y_test = train_test_split(
            h_features, h_labels, test_size=0.2)
        classifier = MultiOutputClassifier(method)
        classifier.fit(X_train, y_train)
        y_hat = classifier.predict(X_test)

        accuracy.append(accuracy_score(y_test, y_hat))
        f1_micro.append(f1_score(y_test, y_hat, average='micro'))
        f1_macro.append(f1_score(y_test, y_hat, average='macro'))
        recall_micro.append(recall_score(y_test, y_hat, average='micro'))
        recall_macro.append(recall_score(y_test, y_hat, average='macro'))
        precision_micro.append(precision_score(y_test, y_hat, average='micro'))
        precision_macro.append(precision_score(y_test, y_hat, average='macro'))
        class_stats.append(class_metrics(y_hat, y_test))

    return {
        "classwise_stats": class_stats,
        "accuracy": accuracy,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "recall_micro": recall_micro,
        "recall_macro": recall_macro,
        "precision_micro": precision_micro,
        "precision_macro": precision_macro,
    }
class analyze_text:
    def __init__(self):
        pass

    def _tfidftransformation(self, df):
        '''
        TF-IDF transformation of the training DS
        :param df: the entire dataset
        :return: X_train, the vectorized and transformed input training ds.
        '''
        pd.options.mode.chained_assignment = None
        nlp = spacy.load("en_core_web_sm")
        X = df["Speech"]
        self.vectoriser = TfidfVectorizer()
        X_train = self.vectoriser.fit_transform(X)
        print(X_train.shape)
        print(self.vectoriser.get_feature_names())
        print(X_train)
        return X_train

    def train_model(self, df):
        '''
        Train the model using linear SVC
        :param df: the entire ds
        :return: None
        '''
        training_ds = self._tfidftransformation(df)
        y = df[["app", "options"]]
        y.fillna('', inplace=True)
        print(y)
        self.clf = MultiOutputClassifier(LinearSVC())
        self.clf.fit(training_ds, y)

    def predict(self, speech):
        '''
        predict using the trained model
        :param speech: the verbal command
        :return: [app, options] predictions
        '''
        test = [speech]
        test = self.vectoriser.transform(test)
        preds = self.clf.predict(test)
        return preds[0]
def test_multi_output_classifier_fallback(self):
    X, y = make_multilabel_classification(n_classes=3, random_state=0)
    X = X.astype(numpy.float32)
    clf = MultiOutputClassifier(LogisticRegression()).fit(X, y)
    del clf.classes_
    onx = to_onnx(clf, X[:1], target_opset=TARGET_OPSET,
                  options={'zipmap': False, 'output_class_labels': True})
    sess = InferenceSession(onx.SerializeToString())
    res = sess.run(None, {'X': X})
    exp_lab = clf.predict(X)
    exp_prb = clf.predict_proba(X)
    assert_almost_equal(exp_lab, res[0])
    self.assertEqual(len(exp_prb), len(res[1]))
    for e, g in zip(exp_prb, res[1]):
        assert_almost_equal(e, g, decimal=5)
class Recommender():
    clf = ""
    classLabels = "Teknoloji-1 Teknoloji-2 Teknoloji-3 Gıda İnşaat Danışmanlık Giyim Online-Alışveriş Medya Banka-Sigorta Mobilya-Ev Eğitim Yemek Sanayi Otomobil Holding Market İçecek Kariyer-Planlama Kitap-Kırtasiye Kar-Amacı-Gütmeyen-Kuruluşlar Seyahat-Tatil Temizlik-Bakım Eskişehir-Yerel Düzce-Yerel Samsun-Yerel Osmaniye-Yerel Antalya-Yerel İstanbul-Yerel Ankara-Yerel Bursa-Yerel"

    def __init__(self, **kwargs):
        super(Recommender, self).__init__(**kwargs)
        file_path = os.path.join(settings.FILES_DIR, 'data_encoded.csv')
        X_train = pd.read_csv(file_path)
        file_path = os.path.join(settings.FILES_DIR, 'y.csv')
        y_train = pd.read_csv(file_path)
        self.classLabels = self.classLabels.split(" ")
        self.clf = MultiOutputClassifier(DecisionTreeClassifier()).fit(
            X_train, y_train)

    def recommend(self, X):
        preds = self.clf.predict([X])
        recommends = [
            item for item, pred in zip(self.classLabels, preds[0]) if pred == 1
        ]
        return recommends
def knn_multi_output(x, y):
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

    from sklearn.multioutput import MultiOutputClassifier
    clf = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=1)).fit(
        x_train, y_train)
    y_pred = clf.predict(x_test)

    for i in range(len(BASE_GENRES)):
        auc = roc_auc_score(y_test[:, i], y_pred[:, i])
        print("AUC %s: %.4f" % (BASE_GENRES[i], auc))

    f1s = []
    tprs = []
    for genre in range(len(BASE_GENRES)):
        TP = 0
        FP = 0
        TN = 0
        FN = 0
        genre = BASE_GENRES[genre]
        for i in range(len(y_test)):
            truth_genres = true_false_to_genres(y_test[i])
            pred_genres = true_false_to_genres(y_pred[i])
            if genre in truth_genres:
                if genre in pred_genres:
                    TP += 1
                else:
                    FN += 1
            else:
                if genre in pred_genres:
                    FP += 1
                else:
                    TN += 1
        print("Confusion Matrix of ", genre)
        get_confusion_matrix(TP, FP, TN, FN)
        f1s.append(get_f1(TP, FP, FN))
        tprs.append(get_tpr(TP, FN))

    print("av f1", np.array(f1s).mean())
    print("av tpr", np.array(tprs).mean())
def return_metrics2(X, y, classifier):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    if classifier == "KNN":
        clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X_train, y_train)
    elif classifier == "DTC":
        clf = MultiOutputClassifier(DecisionTreeClassifier()).fit(X_train, y_train)
    elif classifier == "ETC":
        clf = MultiOutputClassifier(ExtraTreeClassifier()).fit(X_train, y_train)
    elif classifier == "RFC":
        clf = MultiOutputClassifier(RandomForestClassifier()).fit(X_train, y_train)
    else:
        clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = 0
    shape = y_pred.shape
    for idxRow in range(shape[0]):
        trueValCount = 0
        size = 0
        for idxCol in range(shape[1]):
            if y_test[idxRow][idxCol] == y_pred[idxRow][idxCol]:
                trueValCount += 1
        lineAccuracy = trueValCount / shape[1] if (trueValCount > 0) else 0
        accuracy += lineAccuracy
        # print('{0} -> {1} -> {2}/{3}'.format(idxRow, lineAccuracy, trueValCount, size))
        # print(y_test[idxRow])
        # print(y_pred[idxRow])

    print("AVG Accuracy")
    print(accuracy / shape[0] if accuracy > 0 else 0)
    print("Hamming Loss")
    print(hamming_loss(y_test, y_pred))
    return hamming_loss(y_test, y_pred) * 100
def run_classifier(data, seed, include_bigrams=True):
    vec = CountVectorizer(ngram_range=(1, 1), lowercase=True)
    if include_bigrams:
        vec = CountVectorizer(ngram_range=(1, 2), lowercase=True)

    X_train = vec.fit_transform(data['X_train'])
    X_eval = vec.transform(data['X_eval'])
    y_train = data['y_train']
    y_eval = data['y_eval']
    label_names = list(y_eval.columns)

    clf = MultiOutputClassifier(LogisticRegression(solver='saga',
                                                   random_state=seed))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_eval)

    # support values are returned below, so they must actually be computed
    support_train = y_train.sum(axis=1)
    support_eval = y_eval.sum(axis=1)

    macro_f1 = f1_score(y_eval, y_pred, average='macro')
    all_f1 = f1_score(y_eval, y_pred, average=None)
    print(classification_report(y_eval, y_pred, target_names=label_names))
    lrap = label_ranking_average_precision_score(y_eval, y_pred)
    print(macro_f1)
    print(all_f1)
    return macro_f1, all_f1, lrap, support_train, support_eval
                          leaf_size=10, n_jobs=-1))
else:
    model = MultiOutputRegressor(
        KNeighborsRegressor(n_neighbors=3,
                            weights="uniform",
                            leaf_size=10,
                            n_jobs=-1))

# train the model
model.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])

# In[2]: Collect the predictions

# predict training and testing data
train_predict = pd.DataFrame(model.predict(X.iloc[train_idx, :]),
                             columns=Y.columns)
test_predict = pd.DataFrame(model.predict(X.iloc[test_idx, :]),
                            columns=Y.columns)

# reshape all of the predictions into a single table
predictions = pd.DataFrame()
for j in range(outputs):
    # collect training data
    predict_j = np.array(train_predict.iloc[:, j])
    actual_j = np.array(Y.iloc[train_idx, j])
    name_j = Y.columns[j]
    data_j = "Train"
    predictions = pd.concat([
        predictions,
        pd.DataFrame({
mo_X_train_A, mo_X_test_A, mo_t_train_A, mo_t_test_A = train_test_split(
    mo_training_set_A_features,
    mo_training_set_A_labels,
    test_size=0.25,
    random_state=42)
mo_X_train_B, mo_X_test_B, mo_t_train_B, mo_t_test_B = train_test_split(
    mo_training_set_B_features,
    mo_training_set_B_labels,
    test_size=0.25,
    random_state=42)

multi_output_class_A = MultiOutputClassifier(KNeighborsClassifier()).fit(
    mo_X_train_A, mo_t_train_A)
multi_output_class_A_pred = multi_output_class_A.predict(mo_X_test_A)
err_multi_output_class_A = mean_squared_error(mo_t_test_A,
                                              multi_output_class_A_pred)

multi_output_class_B = MultiOutputClassifier(KNeighborsClassifier()).fit(
    mo_X_train_B, mo_t_train_B)
multi_output_class_B_pred = multi_output_class_B.predict(mo_X_test_B)
err_multi_output_class_B = mean_squared_error(mo_t_test_B,
                                              multi_output_class_B_pred)

print("Multi Output classification error - System A: ",
      err_multi_output_class_A)
print("Multi Output classification error - System B: ",
      err_multi_output_class_B)

df_A = pd.DataFrame(multi_output_class_A_pred,
from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(random_state=42)  # random forest classifier
rf.fit(X_train, y_train)  # fit the random forest model on the training data

# rf_predictions = rf.predict(X_test)  # predict y_test values from X_test with the trained model
# print(rf_predictions)

# Evaluate the classifier - rf model before GridSearchCV
from sklearn.multioutput import MultiOutputClassifier  # package that enables multi-output prediction

rf_classifier = MultiOutputClassifier(rf, n_jobs=1)
rf_classifier.fit(X_train, y_train)  # fit the multi-output model on the training data
rf_predictions2 = rf_classifier.predict(X_test)  # predict y_test by feeding X_test to the trained model
print(rf_predictions2)
print(rf_classifier.score(X_train, y_train))  # training-set accuracy: 94.91%

# GridSearchCV: cross-validation and hyperparameter search in one step
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 200, 300], 'max_depth': [5, 10, 20]}
forest_reg = RandomForestClassifier()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
print(grid_search.fit(X_train, y_train))
class MultilabelTraining: X_COLUMN_NAME = "page_text_extract" DEFAULT_TARGET_THEMES = [ 5, 6, 26, 33, 139, 163, 232, 313, 339, 350, 406, 409, 555, 589, 597, 634, 660, 695, 729, 766, 773, 793, 800, 810, 852, 895, 951, 975, ] OTHER_THEMES_VALUE = 4242 def __init__( self, df=pd.DataFrame(), x_column_name=X_COLUMN_NAME, group_processes=True, classifier=PassiveAggressiveClassifier(random_state=42), vectorizer=HashingVectorizer(n_features=2**14), target_themes=DEFAULT_TARGET_THEMES, other_themes_value=OTHER_THEMES_VALUE, remove_processes_without_theme=True, is_incremental_training=False, vocab_path="", ): self.is_incremental_training = is_incremental_training self.vocab_path = vocab_path self.remove_processes_without_theme = remove_processes_without_theme self.mo_classifier = MultiOutputClassifier(classifier, n_jobs=-1) self.classifier = classifier self.vectorizer = vectorizer self.target_themes = target_themes self.other_themes_value = other_themes_value self.group_processes = group_processes self.x_column_name = x_column_name self._initialize_dataframe(df) def _initialize_dataframe(self, df): if not df.empty: self.dp = DataframePreprocessing( df.copy(), group_processes=self.group_processes, x_column_name=self.x_column_name, target_themes=self.target_themes, other_themes_value=self.other_themes_value, is_incremental_training=self.is_incremental_training, remove_processes_without_theme=self. remove_processes_without_theme, vocab_path=self.vocab_path, ) self.y_columns_names = self.dp.distinct_themes self.df = self.dp.processed_df else: self.df = df def _split(self, X, y): print("Splitting dataset...") self.X_train, self.X_test, self.y_train, self.y_test = train_test_split( X, y, stratify=y, test_size=0.2, random_state=42) def _vectorize(self, X_train): print("Vectorizing...") return self.vectorizer.fit_transform(X_train) def train(self, split_df=False): print("Training...") self.X_train, self.y_train = ( self.df[self.x_column_name], self.df[self.y_columns_names], ) if split_df: self._split(self.X_train, self.y_train) vector = self._vectorize(self.X_train) self.mo_classifier.fit(vector, self.y_train) if split_df: vector_test = self._vectorize(self.X_test) self.y_pred = self.mo_classifier.predict(vector_test) metrics = get_multilabel_metrics(self.y_test, self.y_pred) return metrics return None def _update_dataframe(self, df, is_incremental_training=True, is_parquet=False, labels_freq={}): self.dp = DataframePreprocessing( df.copy(), x_column_name=self.x_column_name, group_processes=self.group_processes, target_themes=self.target_themes, other_themes_value=self.other_themes_value, is_incremental_training=is_incremental_training, remove_processes_without_theme=self.remove_processes_without_theme, is_parquet=is_parquet, vocab_path=self.vocab_path, labels_freq=labels_freq, ) self.df = self.dp.processed_df def incremental_train(self, df_path, nrows=5000): print("Training incrementally...") columns_names = pd.read_csv(df_path, nrows=1).columns.tolist() skiprows = 1 classes, labels_freq = DataframePreprocessing( target_themes=self.target_themes).get_unique_binarized_labels( df_path, "tema") while True: df = pd.read_csv( df_path, nrows=nrows, skiprows=skiprows, header=None, names=columns_names, ) if df.empty: break self._update_dataframe(df, labels_freq=labels_freq) X_train, y_train = ( self.df[self.x_column_name], self.df[self.target_themes + [self.other_themes_value]], ) vector = self._vectorize(X_train) self.mo_classifier.partial_fit(vector, y_train, classes=classes) skiprows += nrows print("{} rows 
already trained\n".format(skiprows - 1)) def incremental_train_with_parquet(self, parquet_path): print("Training incrementally with parquet...") nrows = 0 pf = ParquetFile(parquet_path) classes, labels_freq = DataframePreprocessing( target_themes=self.target_themes).get_unique_binarized_labels( parquet_path, "tema", True) for df in pf.iter_row_groups(): df = df.reset_index() self._update_dataframe(df, is_parquet=True, labels_freq=labels_freq) X_train, y_train = ( self.df[self.x_column_name], self.df[self.target_themes + [self.other_themes_value]], ) vector = self._vectorize(X_train) self.mo_classifier.partial_fit(vector.toarray(), y_train, classes=classes) nrows += len(self.df) print("{} rows already trained\n".format(nrows)) clear_output(wait=True) def predict(self): return self.mo_classifier.predict( self._vectorize(self.X_test).todense()) def set_X_test(self, X): self.X_test = X def set_y_test(self, y): self.y_test = y def get_pickle(self): return pickle.dumps(self.mo_classifier)
X_train_embeds, X_val_embeds = [
    WE.get_sentence_vector(tokenized_sentence(x), vector_dict,
                           stopwords=STOPWORDS) for x in raw_X_train
], [
    WE.get_sentence_vector(tokenized_sentence(x), vector_dict,
                           stopwords=STOPWORDS) for x in raw_X_val
]

lr_embed_clf = MultiOutputClassifier(
    LogisticRegression(max_iter=300,
                       multi_class="multinomial",
                       penalty="none",
                       solver="lbfgs")
).fit(X_train_embeds, y_train)

print(hamming_loss(y_val, lr_embed_clf.predict(X_val_embeds)))
print(classification_report(y_val, lr_embed_clf.predict(X_val_embeds)))

## Seeing where no prediction was made
null_predictions = len(
    [i for i in lr_embed_clf.predict(X_val_embeds) if not np.any(np.nonzero(i))]
)
print(f"{null_predictions} out of {len(y_val)} predictions were null.")

dub_ref_model = lr_embed_clf.estimators_[4]

vocab, id2tok, tok2id = get_vocab(train_dataset)
target_label = "dubious reference"

BATCH_SIZE = 1
pred = []
actual = []
vectors = []
for batch, targets, lengths, raw_data in create_dataset(
    pass

########################################
############ CLASSIFICATION ############
########################################

########################################
# LOGISTIC REGRESSION
########################################
if ML_option == "Logistic Regression":
    # Fit the model and predict X_test. Show some analysis.
    try:
        logReg = MultiOutputClassifier(LogisticRegression())
        logReg.fit(X_train, y_train)
        pred = logReg.predict(X_test)
        st.write('Mean Absolute Error (MAE):',
                 round(metrics.mean_absolute_error(y_test, pred), 4))
        st.write('Mean Squared Error (MSE):',
                 round(metrics.mean_squared_error(y_test, pred), 4))
        st.write('Root Mean Squared Error (RMSE):',
                 round(np.sqrt(metrics.mean_squared_error(y_test, pred)), 4))
        st.write('Accuracy of Logistic Regression on training set: ',
                 round(logReg.score(X_train, y_train), 4))
        st.write('Accuracy of Logistic Regression on test set: ',
                 round(logReg.score(X_test, y_test), 4))
        st.subheader("Classification Report")
        st.text(classification_report(y_test, pred))
        try:
class ReedMullerMultiClass(ClassifierMixin):
    @_deprecate_positional_args
    def __init__(self, estimator, *, n_jobs=None):
        # FIXME: Check estimator has predict_proba method
        self.multi_output = MultiOutputClassifier(estimator, n_jobs=n_jobs)

    def fit(self, X, Y, sample_weight=None, **fit_params):
        self.classes_ = np.unique(Y)
        n_classes = len(self.classes_)
        if n_classes < 3:
            pass  # Fixme: Raise warning? Exception?
        self.class_to_index = dict((c, i) for i, c in enumerate(self.classes_))
        # Choose Reed Muller parameters in function of n_classes
        r, m = self._rm_policy(n_classes)
        self.rm = ReedMullerCodec(r, m, limit=n_classes)
        Y = self.encode_labels(Y)
        self.multi_output.fit(X, Y, sample_weight, **fit_params)

    def decision_function(self, X):
        check_is_fitted(self)
        Y = self.multi_output.predict(X)
        return self.decode_log_proba(Y)

    def predict_proba(self, X):
        check_is_fitted(self)
        Y = self.multi_output.predict(X)
        Y = np.exp(self.decode_log_proba(Y))
        Y = Y / Y.sum(axis=1, keepdims=True)  # normalize each row to sum to 1
        return Y

    def predict(self, X):
        check_is_fitted(self)
        Y = self.multi_output.predict(X)
        Y = self.decode_log_proba(Y).argmax(axis=1)
        return np.array([self.classes_[i] for i in Y])

    def encode_labels(self, Y):
        Y = (self.class_to_index[c] for c in Y)  # Encode classes as integers
        Y = np.array([self.rm.encode(i) for i in Y])  # Encode integers as an RM ECC
        return Y

    def decode_log_proba(self, Y):
        Z = np.empty((len(Y), len(self.classes_)))
        for i, bits in enumerate(Y):
            Z[i] = self.rm.decode_log_proba(bits)
        return Z

    @staticmethod
    def _rm_options():
        m = 0
        # For a small number of classes, give order 1 RM codes
        for m in range(1, 4):
            yield 1, m, m + 1
        # For a larger number of classes, give order 2 RM codes
        m = 3
        while True:
            yield 2, m, int((m * (m - 1)) / 2) + m + 1
            m += 1

    @classmethod
    def _rm_policy(cls, n_classes):
        for r, m, rows in cls._rm_options():
            if 2**rows >= n_classes:
                return r, m
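# For comparison with the hand-rolled Reed-Muller error-correcting-code
# reduction above, scikit-learn ships a generic random-code variant. A minimal
# hedged sketch (dataset and parameters chosen only for illustration):
from sklearn.datasets import load_iris
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

X_iris, y_iris = load_iris(return_X_y=True)
# code_size controls how many binary problems encode each class
occ = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2,
                           random_state=0)
occ.fit(X_iris, y_iris)
print(occ.predict(X_iris[:5]))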
highest_acc = 0
acc = 0
for i in range(1, 100):
    for j in range(1, 100):
        for k in range(50, 100):
            for l in range(len(x_train)):
                i, j, k = 6, 11, 59
                forest = ensemble.RandomForestClassifier(
                    n_estimators=i,
                    random_state=42,
                    max_features=j,
                    min_samples_leaf=k)
                multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
                multi_target_forest.fit(x_train[l], y_train[l])
                pred_ud = multi_target_forest.predict(x_test[l])

                class_names = ['down', 'balance', 'up']

                # Compute confusion matrix
                cnf_matrix = confusion_matrix(y_test[l].flatten(),
                                              pred_ud.flatten())
                np.set_printoptions(precision=2)
                acc += np.trace(cnf_matrix, dtype='float32') / np.sum(
                    np.sum(cnf_matrix, dtype='float32'))
            acc /= len(x_train)
            if acc >= highest_acc:
                highest_acc = acc
                print('acc->' + str(acc))
                print('n_estimators->' + str(i))
                print('max_features->' + str(j))
X_test = PCA(n_components=2).fit_transform(X_test)
ax2.set_title('Test labels')
ax2.scatter(X_test[:, 0], X_test[:, 1],
            c=np.sum(Y_test * np.array([1, 2, 3, 4, 5]), axis=1))
ax2.set_xlabel('Feature 0 count')

forest = RandomForestClassifier(n_estimators=100, random_state=1)
decision = DecisionTreeClassifier()

# training step
multi_target_R = MultiOutputClassifier(forest, n_jobs=-1)
result_R = multi_target_R.fit(X, Y)
result_R = multi_target_R.predict(X_test)
score_R = multi_target_R.score(X_test, Y_test)

multi_target_D = MultiOutputClassifier(decision, n_jobs=-1)
multi_target_D = multi_target_D.fit(X, Y)
result_D = multi_target_D.predict(X_test)
score_D = multi_target_D.score(X_test, Y_test)

# Plot classification result
ax3.scatter(X_test[:, 0], X_test[:, 1],
            c=np.sum(result_D * np.array([1, 2, 3, 4, 5]), axis=1))
ax3.set_title('Decision Tree labels %0.2f' % score_D)
ax3.set_ylabel('Feature 1 count')
ax3.set_xlabel('Feature 0 count')

X_w_D = []
if __name__ == "__main__": mlb = MultiLabelBinarizer() # also consider data/all_remove.train outfile = "results/linear.txt" X_train,y_train,X_test,y_test,test_sents = prepare_data("data/all.train", "data/validation/all_validation", mlb) #clf = LogisticRegression(verbose=1, solver="sag", class_weight={0:0.1}) clf = SGDClassifier(verbose=1, n_jobs=10, loss="log", class_weight={0:0.1}) multi_clf = MultiOutputClassifier(clf, n_jobs=1) multi_clf.fit(X_train, y_train) preds = multi_clf.predict(X_test) print(preds.shape) y_preds = mlb.inverse_transform(preds) print(y_preds) with open(outfile, "w") as out: for sent,pred in zip(test_sents, y_preds): pred = list(pred) if len(pred) == 0: pred = ["unmatched"] if len(pred) > 1 and "unmatched" in pred: pred.remove("unmatched") print("{}\t{}".format(sent,",".join(pred)), file=out)
x = np.load('./data/x_data.npy')
y = np.load('./data/y_data.npy')
x_pred = np.load('./data/x_pred.npy')

print("x.shape :", x.shape)
print("y.shape :", y.shape)
print("x_pred.shape :", x_pred.shape)

x = x.reshape(x.shape[0], 64 * 64 * 3)
x_pred = x_pred.reshape(x_pred.shape[0], 64 * 64 * 3)

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                     test_size=0.2,
                                                     random_state=77,
                                                     shuffle=True)

# model = XGBClassifier()
model = MultiOutputClassifier(XGBRFClassifier())

# 3. train
model.fit(x_train, y_train)

# 4. evaluate and predict
acc = model.score(x_test, y_test)
print("acc :", acc)

y_pred = model.predict(x_pred)
title_words = [word for title in dataset['title'] for word in title.split()]
normalized_titles = [normalize(title) for title in dataset['title']]

# VECTORIZE YOUR DATA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
feature_matrix = vectorizer.fit_transform(normalized_titles)
multiLabelBinarizer = MultiLabelBinarizer()
labels = multiLabelBinarizer.fit_transform(job_functions)

# BUILD THE MODEL
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels,
                                                    test_size=0.2)
estimator = SVC()
model = MultiOutputClassifier(estimator)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# EVALUATE YOUR ALGORITHM
from sklearn.metrics import f1_score
score = f1_score(y_test, y_pred, average='weighted')
    return postostr(lignTOpos(j)) + ' => ' + postostr(lignTOpos(k))


for m in range(len(Matrix)):
    x = []
    y = []
    c = 0
    for i in range(8):
        for j in range(8):
            Mx[m][c] = Matrix[m][i][j]  # flatten the position matrix onto a single row
            My[m][c] = Matrix_Y[m][i][j]
            c += 1

List_coups = np.zeros((n_positions, 1))
for k in range(len(Matrix)):
    List_coups[k, 0] = detect_coup(Matrix[k], Matrix_Y[k])

DFx = pd.DataFrame(Mx)
DFy = pd.DataFrame(List_coups)

# split into training data and test data
trainData, testData, trainY, testY = train_test_split(Mx, List_coups,
                                                      test_size=0.1)

kNN = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree',
                           metric='minkowski', p=2, n_jobs=-1)
classifier = MultiOutputClassifier(kNN, n_jobs=-1)
classifier.fit(trainData, trainY)  # train the model

trainPredictionsk = classifier.predict(trainData)  # predict the output on the training data
trainCMk = confusion_matrix(y_pred=trainPredictionsk, y_true=trainY)  # compute the confusion matrix

testpredict = classifier.predict(testData)
# confusion matrix: correct results are counted on the diagonal,
# the errors everywhere else in the matrix
testCM = confusion_matrix(y_pred=testpredict, y_true=testY)
print('Score = ' + str(testCM.trace() / sum(sum(testCM))))  # compute performance
readings_train = df_train.iloc[:, :-3]
subj_train = df_train.iloc[:, -3]
activity_train = df_train.iloc[:, -2]
subj_activity_train = pd.DataFrame({
    'subject': subj_train,
    'activity_id': activity_train
})

# step 1.2 - fit the model to predict subject
print('Fitting model to predict subject ...')
clf = GaussianNB()
clf_multi = MultiOutputClassifier(clf)

time_bgn = time.time()
clf_multi.fit(readings_train, subj_activity_train)
dur_train_both = time.time() - time_bgn

predicted_subj_activity_train = clf_multi.predict(readings_train)
predicted_subj_activity_train = pd.DataFrame({
    'subject': predicted_subj_activity_train[:, 1],
    'activity_id': predicted_subj_activity_train[:, 0]
})
predicted_subj = predicted_subj_activity_train.iloc[:, 1]
predicted_activity = predicted_subj_activity_train.iloc[:, 0]
predicted_subj_activity_train = (100 * predicted_subj) + predicted_activity

# step 2.1 - get the readings data (from data stratified using subject)
readings_test = df_test.iloc[:, :-3]
subj_test = df_test.iloc[:, -3]
activity_test = df_test.iloc[:, -2]
subj_activity_test = pd.DataFrame({
train_labels_oh = pd.get_dummies(train_labels, columns=["Unfallschwere"])

if validation:
    # one-hot encoding of the validation set
    test_val_labels = test_val_data["Unfallschwere"]
    test_val_data.drop(["Unfallschwere"], axis=1, inplace=True)
    test_labels_oh = pd.get_dummies(test_val_labels, columns=["Unfallschwere"])

### Model training ###
# Random forest and decision tree classifiers as baselines for the neural network
forest = RandomForestClassifier(n_estimators=100)
multi_target_forest = MultiOutputClassifier(forest)
multi_target_forest.fit(train_data, train_labels_oh)
Y_pred = multi_target_forest.predict(test_val_data)

# Metrics
print(np.round(accuracy_score(test_labels_oh.values.argmax(axis=1),
                              Y_pred.argmax(axis=1)), 2), "accuracy")
print(np.round(f1_score(test_labels_oh.values.argmax(axis=1),
                        Y_pred.argmax(axis=1), average=None), 2), "f1 score")
print(np.round(f1_score(test_labels_oh.values.argmax(axis=1),
                        Y_pred.argmax(axis=1), average="weighted"), 2),
      "f1 score weighted")
print(np.round(f1_score(test_labels_oh.values.argmax(axis=1),
                        Y_pred.argmax(axis=1), average="macro"), 2),
      "f1 score macro")
print(confusion_matrix(test_labels_oh.values.argmax(axis=1),
                       Y_pred.argmax(axis=1)))

decision_tree = DecisionTreeClassifier()
decision_tree.fit(train_data, train_labels_oh.values)
Y_pred = decision_tree.predict(test_val_data)

# Metrics
print(np.round(accuracy_score(test_labels_oh.values.argmax(axis=1),
                              Y_pred.argmax(axis=1)), 2), "accuracy")
audios = np.unique(mfcc_audio["Audio"]) train_audio, test_audio = train_test_split( audios, train_size=0.7, test_size=0.3, random_state=0) X_train = mfcc_audio[mfcc_audio["Audio"].isin(train_audio)] X_test = mfcc_audio[mfcc_audio["Audio"].isin(test_audio)] y_train = X_train[columns] y_test = X_test[columns] X_train.drop(columns + ["Audio"], inplace=True, axis=1) X_test.drop(columns + ["Audio"], inplace=True, axis=1) mor = MultiOutputClassifier( RandomForestClassifier(random_state=0, n_estimators=1000), n_jobs=-1) mor.fit(X_train, y_train) mor_pred = mor.predict(X_test) dummy = DummyClassifier() dummy.fit(X_train, y_train) dummy_pred = dummy.predict(X_test) estimators = mor.estimators_ for i, col in enumerate(columns): true = y_test[col] pred = mor_pred[:, i] d_p = dummy_pred[:, i] print(col) print("accuracy score")
    vectorizer2.fit_transform(X), y, test_size=0.2, random_state=42)
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    vectorizer3.fit_transform(X), y, test_size=0.2, random_state=42)

classifiers = {
    "LinearSVC": LinearSVC(),
    "MultinomialNB": MultinomialNB(),
    "Perceptron": Perceptron(n_iter=50)
}

for i in classifiers.keys():
    clf = MultiOutputClassifier(classifiers[i]).fit(X_train, y_train)
    clf2 = MultiOutputClassifier(classifiers[i]).fit(X_train2, y_train2)
    clf3 = MultiOutputClassifier(classifiers[i]).fit(X_train3, y_train3)
    yhat = clf.predict(X_test)
    yhat2 = clf2.predict(X_test2)
    yhat3 = clf3.predict(X_test3)

    print i, "unigram"
    print "f1_score", f1_score(y_test, yhat, average='samples')
    print "jaccard_similarity_score", jaccard_similarity_score(y_test, yhat)
    print "accuracy_score", accuracy_score(y_test, yhat)
    print "precision_score", precision_score(y_test, yhat, average='samples')
    print "recall_score", recall_score(y_test, yhat, average='samples')
    print "********"
    print i, "bigram"
    print "f1_score", f1_score(y_test2, yhat2, average='samples')
    print "jaccard_similarity_score", jaccard_similarity_score(y_test2, yhat2)
    print "accuracy_score", accuracy_score(y_test2, yhat2)
Y_train_multi = np.column_stack([train_EI, train_NS, train_FT, train_JP])

test_EI = np.zeros(len(Y_test), dtype=np.bool)
test_NS = np.zeros(len(Y_test), dtype=np.bool)
test_FT = np.zeros(len(Y_test), dtype=np.bool)
test_JP = np.zeros(len(Y_test), dtype=np.bool)

test_EI[np.isin(Y_test, [0, 2, 3, 6, 8, 0, 10, 11])] = 1
test_NS[np.isin(Y_test, [8, 9, 10, 11, 12, 13, 14, 15])] = 1
test_FT[np.isin(Y_test, [1, 2, 3, 4, 9, 11, 12, 14])] = 1
test_JP[np.isin(Y_test, [1, 2, 6, 7, 8, 9, 12, 13])] = 1

Y_test_multi = np.column_stack([test_EI, test_NS, test_FT, test_JP])

rfc_multi = RandomForestClassifier(n_estimators=100,
                                   max_features=100,
                                   class_weight="balanced",
                                   verbose=1,
                                   n_jobs=7)
rfc_multi_out = MultiOutputClassifier(rfc_multi)
rfc_multi_out.fit(X_train, Y_train_multi)
multi_predictions = rfc_multi_out.predict(X_test)

np.logical_and(multi_predictions, Y_test_multi)

#cm = confusion_matrix(Y_test_multi, multi_predictions)
#cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#fig, ax = plt.subplots()
#ax.imshow(cm, interpolation='nearest')
#plt.xticks(range(len(unique_type_list)), unique_type_list)
#plt.yticks(range(len(unique_type_list)), unique_type_list)
model = MultiOutputRegressor(
    LassoCV(eps=1e-9,
            n_alphas=20,
            cv=3,
            tol=1e-4,
            max_iter=500,
            random_state=42,
            n_jobs=1))

# train the model
model.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])

# In[2]: Collect the predictions

# predict training and testing data
train_predict = pd.DataFrame(model.predict(X.iloc[train_idx, :]),
                             columns=Y.columns)
test_predict = pd.DataFrame(model.predict(X.iloc[test_idx, :]),
                            columns=Y.columns)

# reshape all of the predictions into a single table
predictions = pd.DataFrame()
for j in range(outputs):
    # collect training data
    predict_j = np.array(train_predict.iloc[:, j])
    actual_j = np.array(Y.iloc[train_idx, j])
    name_j = Y.columns[j]
    data_j = "Train"
    predictions = pd.concat([
        predictions,
        pd.DataFrame({
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay

X_train = np.array(dfTrain.fabeec.to_list())
y_train = np.array(dfTrain.new_label.to_list())

# Create the SVM
svm = LinearSVC(random_state=42)

# Make it a multilabel classifier
multilabel_classifier = MultiOutputClassifier(svm, n_jobs=-1)

# Fit the data to the multilabel classifier
multilabel_classifier = multilabel_classifier.fit(X_train, y_train)

X_test = np.array(dfTest.fabeec.to_list())

# Get predictions for test data
y_test_pred = multilabel_classifier.predict(X_test)

predicted_res = multilabel_classifier.predict(X_train)
train_macro_f1 = f1_score(dfTrain.new_label.to_list(), predicted_res,
                          average='macro')

from joblib import dump, load

dump(multilabel_classifier, 'svm.joblib')
!cp 'svm.joblib' '/content/drive/MyDrive/svm.joblib'

print(train_macro_f1)

dump(clf_1, 'decision_tree.joblib')
!cp 'decision_tree.joblib' '/content/drive/MyDrive/decision_tree.joblib'
class Igel(object): """ Igel is the base model to use the fit, evaluate and predict functions of the sklearn library """ available_commands = ('fit', 'evaluate', 'predict', 'experiment') supported_types = ('regression', 'classification', 'clustering') results_path = configs.get('results_path') # path to the results folder default_model_path = configs.get( 'default_model_path') # path to the pre-fitted model description_file = configs.get( 'description_file') # path to the description.json file evaluation_file = configs.get( 'evaluation_file') # path to the evaluation.json file prediction_file = configs.get( 'prediction_file') # path to the predictions.csv default_dataset_props = configs.get( 'dataset_props' ) # dataset props that can be changed from the yaml file default_model_props = configs.get( 'model_props') # model props that can be changed from the yaml file model = None def __init__(self, **cli_args): logger.info(f"Entered CLI args: {cli_args}") logger.info(f"Executing command: {cli_args.get('cmd')} ...") self.data_path: str = cli_args.get('data_path') # path to the dataset logger.info(f"reading data from {self.data_path}") self.command = cli_args.get('cmd', None) if not self.command or self.command not in self.available_commands: raise Exception(f"You must enter a valid command.\n" f"available commands: {self.available_commands}") if self.command == "fit": self.yml_path = cli_args.get('yaml_path') file_ext = self.yml_path.split('.')[-1] logger.info(f"You passed the configurations as a {file_ext} file.") self.yaml_configs = read_yaml( self.yml_path) if file_ext == 'yaml' else read_json( self.yml_path) logger.info(f"your chosen configuration: {self.yaml_configs}") # dataset options given by the user self.dataset_props: dict = self.yaml_configs.get( 'dataset', self.default_dataset_props) # model options given by the user self.model_props: dict = self.yaml_configs.get( 'model', self.default_model_props) # list of target(s) to predict self.target: list = self.yaml_configs.get('target') self.model_type: str = self.model_props.get('type') logger.info(f"dataset_props: {self.dataset_props} \n" f"model_props: {self.model_props} \n " f"target: {self.target} \n") # if entered command is evaluate or predict, then the pre-fitted model needs to be loaded and used else: self.model_path = cli_args.get('model_path', self.default_model_path) logger.info(f"path of the pre-fitted model => {self.model_path}") # load description file to read stored training parameters with open(self.description_file, 'r') as f: dic = json.load(f) self.target: list = dic.get( "target") # target to predict as a list self.model_type: str = dic.get( "type" ) # type of the model -> regression or classification self.dataset_props: dict = dic.get( 'dataset_props') # dataset props entered while fitting getattr(self, self.command)() def _create_model(self, **kwargs): """ fetch a model depending on the provided type and algorithm by the user and return it @return: class of the chosen model """ model_type: str = self.model_props.get('type') model_algorithm: str = self.model_props.get('algorithm') use_cv = self.model_props.get('use_cv_estimator', None) model_args = None if not model_type or not model_algorithm: raise Exception(f"model_type and algorithm cannot be None") algorithms: dict = models_dict.get( model_type) # extract all algorithms as a dictionary model = algorithms.get( model_algorithm) # extract model class depending on the algorithm logger.info( f"Solving a {model_type} problem using ===> {model_algorithm}") if not 
model: raise Exception("Model not found in the algorithms list") else: model_props_args = self.model_props.get('arguments', None) if model_props_args and type(model_props_args) == dict: model_args = model_props_args elif not model_props_args or model_props_args.lower() == "default": model_args = None if use_cv: model_class = model.get('cv_class', None) if model_class: logger.info( f"cross validation estimator detected. " f"Switch to the CV version of the {model_algorithm} algorithm" ) else: logger.info( f"No CV class found for the {model_algorithm} algorithm" ) else: model_class = model.get('class') logger.info(f"model arguments: \n" f"{self.model_props.get('arguments')}") model = model_class(**kwargs) if not model_args else model_class( **model_args) return model, model_args def _save_model(self, model): """ save the model to a binary file @param model: model to save @return: bool """ try: if not os.path.exists(self.results_path): logger.info( f"creating model_results folder to save results...\n" f"path of the results folder: {self.results_path}") os.mkdir(self.results_path) else: logger.info(f"Folder {self.results_path} already exists") logger.warning( f"data in the {self.results_path} folder will be overridden. If you don't " f"want this, then move the current {self.results_path} to another path" ) except OSError: logger.exception( f"Creating the directory {self.results_path} failed ") else: logger.info( f"Successfully created the directory in {self.results_path} ") pickle.dump(model, open(self.default_model_path, 'wb')) return True def _load_model(self, f: str = ''): """ load a saved model from file @param f: path to model @return: loaded model """ try: if not f: logger.info(f"result path: {self.results_path} ") logger.info(f"loading model form {self.default_model_path} ") model = pickle.load(open(self.default_model_path, 'rb')) else: logger.info(f"loading from {f}") model = pickle.load(open(f, 'rb')) return model except FileNotFoundError: logger.error(f"File not found in {self.default_model_path} ") def _prepare_fit_data(self): return self._process_data(target='fit') def _prepare_eval_data(self): return self._process_data(target='evaluate') def _process_data(self, target='fit'): """ read and return data as x and y @return: list of separate x and y """ assert isinstance(self.target, list), "provide target(s) as a list in the yaml file" if self.model_type != "clustering": assert len( self.target) > 0, "please provide at least a target to predict" try: read_data_options = self.dataset_props.get('read_data_options', None) dataset = pd.read_csv( self.data_path) if not read_data_options else pd.read_csv( self.data_path, **read_data_options) logger.info(f"dataset shape: {dataset.shape}") attributes = list(dataset.columns) logger.info(f"dataset attributes: {attributes}") # handle missing values in the dataset preprocess_props = self.dataset_props.get('preprocess', None) if preprocess_props: # handle encoding encoding = preprocess_props.get('encoding') if encoding: encoding_type = encoding.get('type', None) column = encoding.get('column', None) if column in attributes: dataset, classes_map = encode( df=dataset, encoding_type=encoding_type.lower(), column=column) if classes_map: self.dataset_props[ 'label_encoding_classes'] = classes_map logger.info( f"adding classes_map to dataset props: \n{classes_map}" ) logger.info( f"shape of the dataset after encoding => {dataset.shape}" ) # preprocessing strategy: mean, median, mode etc.. 
strategy = preprocess_props.get('missing_values') if strategy: dataset = handle_missing_values(dataset, strategy=strategy) logger.info( f"shape of the dataset after handling missing values => {dataset.shape}" ) if target == 'predict' or target == 'fit_cluster': x = _reshape(dataset.to_numpy()) if not preprocess_props: return x scaling_props = preprocess_props.get('scale', None) if not scaling_props: return x else: scaling_method = scaling_props.get('method', None) return normalize(x, method=scaling_method) if any(col not in attributes for col in self.target): raise Exception( "chosen target(s) to predict must exist in the dataset") y = pd.concat([dataset.pop(x) for x in self.target], axis=1) x = _reshape(dataset.to_numpy()) y = _reshape(y.to_numpy()) logger.info(f"y shape: {y.shape} and x shape: {x.shape}") # handle data scaling if preprocess_props: scaling_props = preprocess_props.get('scale', None) if scaling_props: scaling_method = scaling_props.get('method', None) scaling_target = scaling_props.get('target', None) if scaling_target == 'all': x = normalize(x, method=scaling_method) y = normalize(y, method=scaling_method) elif scaling_target == 'inputs': x = normalize(x, method=scaling_method) elif scaling_target == 'outputs': y = normalize(y, method=scaling_method) if target == 'evaluate': return x, y split_options = self.dataset_props.get('split', None) if not split_options: return x, y, None, None test_size = split_options.get('test_size') shuffle = split_options.get('shuffle') stratify = split_options.get('stratify') x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=test_size, shuffle=shuffle, stratify=None if not stratify or stratify.lower() == "default" else stratify) return x_train, y_train, x_test, y_test except Exception as e: logger.exception( f"error occured while preparing the data: {e.args}") def _prepare_clustering_data(self): """ preprocess data for the clustering algorithm """ return self._process_data(target='fit_cluster') def _prepare_predict_data(self): """ preprocess predict data to get similar data to the one used when training the model """ return self._process_data(target='predict') def get_evaluation(self, model, x_test, y_true, y_pred, **kwargs): res = None try: res = evaluate_model(model_type=self.model_type, model=model, x_test=x_test, y_pred=y_pred, y_true=y_true, get_score_only=False, **kwargs) except Exception as e: res = evaluate_model(model_type=self.model_type, model=model, x_test=x_test, y_pred=y_pred, y_true=y_true, get_score_only=True, **kwargs) return res def fit(self, **kwargs): """ fit a machine learning model and save it to a file along with a description.json file @return: None """ x_train = None x_test = None y_train = None y_test = None cv_results = None eval_results = None cv_params = None if self.model_type == 'clustering': x_train = self._prepare_clustering_data() else: x_train, y_train, x_test, y_test = self._prepare_fit_data() self.model, model_args = self._create_model(**kwargs) logger.info( f"executing a {self.model.__class__.__name__} algorithm...") # convert to multioutput if there is more than one target to predict: if self.model_type != 'clustering' and len(self.target) > 1: logger.info( f"predicting multiple targets detected. 
Hence, the model will be automatically " f"converted to a multioutput model") self.model = MultiOutputClassifier(self.model) \ if self.model_type == 'classification' else MultiOutputRegressor(self.model) if self.model_type != 'clustering': cv_params = self.model_props.get('cross_validate', None) if not cv_params: logger.info(f"cross validation is not provided") else: cv_results = cross_validate(estimator=self.model, X=x_train, y=y_train, **cv_params) self.model.fit(x_train, y_train) else: self.model.fit(x_train) saved = self._save_model(self.model) if saved: logger.info( f"model saved successfully and can be found in the {self.results_path} folder" ) if self.model_type == 'clustering': eval_results = self.model.score(x_train) else: if x_test is None: logger.info( f"no split options was provided. training score will be calculated" ) eval_results = self.model.score(x_train, y_train) else: logger.info( f"split option detected. The performance will be automatically evaluated " f"using the test data portion") y_pred = self.model.predict(x_test) eval_results = self.get_evaluation(model=self.model, x_test=x_test, y_true=y_test, y_pred=y_pred, **kwargs) fit_description = { "model": self.model.__class__.__name__, "arguments": model_args if model_args else "default", "type": self.model_props['type'], "algorithm": self.model_props['algorithm'], "dataset_props": self.dataset_props, "model_props": self.model_props, "data_path": self.data_path, "train_data_shape": x_train.shape, "test_data_shape": None if x_test is None else x_test.shape, "train_data_size": x_train.shape[0], "test_data_size": None if x_test is None else x_test.shape[0], "results_path": str(self.results_path), "model_path": str(self.default_model_path), "target": None if self.model_type == 'clustering' else self.target, "results_on_test_data": eval_results } if self.model_type == 'clustering': clustering_res = { "cluster_centers": self.model.cluster_centers_, "cluster_labels": self.model.labels_ } fit_description['clustering_results'] = clustering_res if cv_params: cv_res = { "fit_time": cv_results['fit_time'].tolist(), "score_time": cv_results['score_time'].tolist(), "test_score": cv_results['test_score'].tolist() } fit_description['cross_validation_params'] = cv_params fit_description['cross_validation_results'] = cv_res try: logger.info(f"saving fit description to {self.description_file}") with open(self.description_file, 'w', encoding='utf-8') as f: json.dump(fit_description, f, ensure_ascii=False, indent=4) except Exception as e: logger.exception( f"Error while storing the fit description file: {e}") def evaluate(self, **kwargs): """ evaluate a pre-fitted model and save results to a evaluation.json @return: None """ x_val = None y_true = None eval_results = None try: model = self._load_model() if self.model_type != 'clustering': x_val, y_true = self._prepare_eval_data() y_pred = model.predict(x_val) eval_results = self.get_evaluation(model=model, x_test=x_val, y_true=y_true, y_pred=y_pred, **kwargs) else: x_val = self._prepare_clustering_data() y_pred = model.predict(x_val) eval_results = model.score(x_val, y_pred) logger.info(f"saving fit description to {self.evaluation_file}") with open(self.evaluation_file, 'w', encoding='utf-8') as f: json.dump(eval_results, f, ensure_ascii=False, indent=4) except Exception as e: logger.exception(f"error occured during evaluation: {e}") def predict(self): """ use a pre-fitted model to make predictions and save them as csv @return: None """ try: model = self._load_model(f=self.model_path) x_val = 
self._prepare_predict_data( ) # the same is used for clustering y_pred = model.predict(x_val) y_pred = _reshape(y_pred) logger.info( f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}" ) logger.info(f"predict on targets: {self.target}") df_pred = pd.DataFrame.from_dict({ self.target[i]: y_pred[:, i] if len(y_pred.shape) > 1 else y_pred for i in range(len(self.target)) }) logger.info(f"saving the predictions to {self.prediction_file}") df_pred.to_csv(self.prediction_file) except Exception as e: logger.exception(f"Error while preparing predictions: {e}") @staticmethod def create_init_mock_file(model_type=None, model_name=None, target=None, *args, **kwargs): path = configs.get('init_file_path', None) if not path: raise Exception("You need to provide a path for the init file") dataset_props = Igel.default_dataset_props model_props = Igel.default_model_props if model_type: logger.info(f"user selected model type = {model_type}") model_props['type'] = model_type if model_name: logger.info(f"user selected algorithm = {model_name}") model_props['algorithm'] = model_name logger.info(f"initalizing a default igel.yaml in {path}") default_data = { "dataset": dataset_props, "model": model_props, "target": ['provide your target(s) here'] if not target else [tg for tg in target.split()] } created = create_yaml(default_data, path) if created: logger.info( f"a default igel.yaml is created for you in {path}. " f"you just need to overwrite the values to meet your expectations" ) else: logger.warning( f"something went wrong while initializing a default file")
                                         train_interval[i], test_interval[i])
    x_train[i], y_train[i], x_test[i], y_test[i] = (
        x_train_ud, y_train_ud, x_test_ud, y_test_ud)

print('[data helper -ud ] costs:' + str(time.time() - t_start) + 'secs')
t_start = time.time()

highest_acc = 0
acc = 0
for i in range(1, 100):
    for l in range(len(x_train)):
        nusvc = NuSVC(nu=float(i) / 100.0)
        multi_target_nusvc = MultiOutputClassifier(nusvc, n_jobs=-1)
        multi_target_nusvc.fit(x_train[l], y_train[l])
        pred_ud = multi_target_nusvc.predict(x_test[l])

        class_names = ['down', 'balance', 'up']

        # Compute confusion matrix
        cnf_matrix = confusion_matrix(y_test[l].flatten(), pred_ud.flatten())
        np.set_printoptions(precision=2)
        acc += np.trace(cnf_matrix, dtype='float32') / np.sum(
            np.sum(cnf_matrix, dtype='float32'))
    acc /= len(x_train)
    if acc >= highest_acc:
        highest_acc = acc
        print('acc->' + str(acc))
        print('nu->' + str(float(i) / 100.0))