Example #1
def test_multi_output_classification_partial_fit():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict

    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), first_predictions.shape)

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), second_predictions.shape)

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
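Example #1 relies on module-level fixtures (X, y, classes, n_samples, n_outputs) from scikit-learn's test suite. Below is a minimal, self-contained sketch of the same partial_fit pattern; the synthetic data and variable names are illustrative assumptions, not part of the original test.

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
Y = np.column_stack([rng.randint(0, 3, 100), rng.randint(0, 2, 100)])

clf = MultiOutputClassifier(SGDClassifier(random_state=1))
# On the first call, classes must be a list with one array per output column.
clf.partial_fit(X[:50], Y[:50], classes=[np.unique(Y[:, 0]), np.unique(Y[:, 1])])
clf.partial_fit(X[50:], Y[50:])  # later calls omit classes
print(clf.predict(X).shape)      # (100, 2): one predicted column per output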
Example #2
def test_multi_output_classification():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict, predict_proba and score

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)

    predictions = multi_target_forest.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    predict_proba = multi_target_forest.predict_proba(X)

    assert len(predict_proba) == n_outputs
    for class_probabilities in predict_proba:
        assert_equal((n_samples, n_classes), class_probabilities.shape)

    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1),
                       predictions)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, i])
        assert_equal(list(forest_.predict(X)), list(predictions[:, i]))
        assert_array_equal(list(forest_.predict_proba(X)),
                           list(predict_proba[i]))
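The predict_proba contract checked in Example #2 is easy to miss: the meta-estimator returns a Python list with one (n_samples, n_classes_i) array per output, not a single array. A short sketch with assumed synthetic data:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

rng = np.random.RandomState(1)
X = rng.randn(60, 4)
Y = np.column_stack([rng.randint(0, 3, 60), rng.randint(0, 3, 60)])

clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=10)).fit(X, Y)
proba = clf.predict_proba(X)
print(type(proba), len(proba))  # <class 'list'> 2 -- one entry per output
print(proba[0].shape)           # (60, 3): probabilities over the first output's classes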
Example #3
def test_multi_output_classification_partial_fit_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    w = np.asarray([2., 1., 1.])
    sgd_linear_clf = SGDClassifier(random_state=1)
    clf_w = MultiOutputClassifier(sgd_linear_clf)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [3, 2], [2, 3], [3, 2]]
    sgd_linear_clf = SGDClassifier(random_state=1)
    clf = MultiOutputClassifier(sgd_linear_clf)
    clf.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
Example #4
def test_multi_output_classification_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3, 2], [2, 3]]
    w = np.asarray([2., 1.])
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf_w = MultiOutputClassifier(forest)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3, 2], [3, 2], [2, 3]]
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf = MultiOutputClassifier(forest)
    clf.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
Example #5
def test_multiclass_multioutput_estimator():
    # test to check meta of meta estimators
    svc = LinearSVC(random_state=0)
    multi_class_svc = OneVsRestClassifier(svc)
    multi_target_svc = MultiOutputClassifier(multi_class_svc)

    multi_target_svc.fit(X, y)

    predictions = multi_target_svc.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, i])
        assert_equal(list(multi_class_svc_.predict(X)), list(predictions[:, i]))
Example #7
class GOClassifier:
    def __init__(self, X, y, random_seed=11, test_size=0.25, *args, **kwargs):
        ind = np.arange(X.shape[0])
        np.random.seed(random_seed)
        np.random.shuffle(ind)
        self.X = X[ind]
        self.y = y[ind]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_seed)
        self.random_seed = random_seed
        self.args = args
        self.kwargs = kwargs
        self.clf = None

    def fit(self, X=None, y=None):
        X_ = X if X is not None else self.X_train
        y_ = y if y is not None else self.y_train
        self.clf = MultiOutputClassifier(
            SGDClassifier(alpha=0.0001,
                          max_iter=1000,
                          tol=1e-3,
                          random_state=self.random_seed,
                          *self.args,
                          **self.kwargs))
        self.clf.fit(X_, y_)
        return self.clf

    def predict(self, X=None):
        assert self.clf is not None
        X_ = X if X is not None else self.X
        return self.clf.predict(X_)

    def test_predict(self):
        return self.predict(X=self.X_test)

    def score(self, X, y):
        assert self.clf is not None
        return self.clf.score(X, y)

    def test_score(self):
        assert self.clf is not None
        return self.clf.score(self.X_test, self.y_test)

    def train_score(self):
        assert self.clf is not None
        return self.clf.score(self.X_train, self.y_train)
Example #8
def classify(method, h_features, h_labels, val_iter=10):
    print('X shape:', h_features.shape)
    print('y shape:', h_labels.shape)

    print('Training -> {0} - Classifier will run {1} times'.format(
        str(method), val_iter))

    accuracy = []
    class_stats = []
    f1_micro, f1_macro = [], []
    recall_micro, recall_macro = [], []
    precision_micro, precision_macro = [], []

    for iter_idx in range(val_iter):
        print('run  - - - - - - - - -  {0} at: {1} '.format(
            iter_idx + 1, datetime.now()))

        X_train, X_test, y_train, y_test = train_test_split(h_features,
                                                            h_labels,
                                                            test_size=0.2)
        classifier = MultiOutputClassifier(method)
        classifier.fit(X_train, y_train)
        y_hat = classifier.predict(X_test)

        accuracy.append(accuracy_score(y_test, y_hat))
        f1_micro.append(f1_score(y_test, y_hat, average='micro'))
        f1_macro.append(f1_score(y_test, y_hat, average='macro'))
        recall_micro.append(recall_score(y_test, y_hat, average='micro'))
        recall_macro.append(recall_score(y_test, y_hat, average='macro'))
        precision_micro.append(precision_score(y_test, y_hat, average='micro'))
        precision_macro.append(precision_score(y_test, y_hat, average='macro'))

        class_stats.append(class_metrics(y_hat, y_test))

    return {
        "classwise_stats": class_stats,
        "accuracy": accuracy,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "recall_micro": recall_micro,
        "recall_macro": recall_macro,
        "precision_micro": precision_micro,
        "precision_macro": precision_macro,
    }
Example #9
class analyze_text:
    def __init__(self):
        pass

    def _tfidftransformation(self, df):
        '''
        TFIDF transformation of the training DS
        :param df: the entire dataset
        :return: X_train, the vectorized and transformed input training ds.
        '''
        pd.options.mode.chained_assignment = None
        nlp = spacy.load("en_core_web_sm")
        X = df["Speech"]
        self.vectoriser = TfidfVectorizer()
        X_train = self.vectoriser.fit_transform(X)
        print(X_train.shape)
        print(self.vectoriser.get_feature_names())
        print(X_train)
        return X_train

    def train_model(self, df):
        '''
        Train the model using linear SVC
        :param df: the entire ds
        :return: None
        '''
        training_ds = self._tfidftransformation(df)
        y = df[["app","options"]]
        y.fillna('', inplace=True)
        print(y)
        self.clf = MultiOutputClassifier(LinearSVC())
        self.clf.fit(training_ds, y)

    def predict(self, speech):
        '''
        predict using the trained model
        :param speech: the verbal command
        :return: [app, options] predictions
        '''
        test = [speech]
        test = self.vectoriser.transform(test)
        preds = self.clf.predict(test)
        return preds[0]
Example #10
 def test_multi_output_classifier_fallback(self):
     X, y = make_multilabel_classification(n_classes=3, random_state=0)
     X = X.astype(numpy.float32)
     clf = MultiOutputClassifier(LogisticRegression()).fit(X, y)
     del clf.classes_
     onx = to_onnx(clf,
                   X[:1],
                   target_opset=TARGET_OPSET,
                   options={
                       'zipmap': False,
                       'output_class_labels': True
                   })
     sess = InferenceSession(onx.SerializeToString())
     res = sess.run(None, {'X': X})
     exp_lab = clf.predict(X)
     exp_prb = clf.predict_proba(X)
     assert_almost_equal(exp_lab, res[0])
     self.assertEqual(len(exp_prb), len(res[1]))
     for e, g in zip(exp_prb, res[1]):
         assert_almost_equal(e, g, decimal=5)
Example #11
class Recommender():
    clf = ""
    classLabels = "Teknoloji-1 Teknoloji-2 Teknoloji-3 Gıda İnşaat Danışmanlık Giyim Online-Alışveriş Medya Banka-Sigorta Mobilya-Ev Eğitim Yemek Sanayi Otomobil Holding Market İçecek Kariyer-Planlama Kitap-Kırtasiye Kar-Amacı-Gütmeyen-Kuruluşlar Seyahat-Tatil Temizlik-Bakım Eskişehir-Yerel Düzce-Yerel Samsun-Yerel Osmaniye-Yerel Antalya-Yerel İstanbul-Yerel Ankara-Yerel Bursa-Yerel"

    def __init__(self, **kwargs):
        super(Recommender, self).__init__(**kwargs)
        file_path = os.path.join(settings.FILES_DIR, 'data_encoded.csv')
        X_train = pd.read_csv(file_path)
        file_path = os.path.join(settings.FILES_DIR, 'y.csv')
        y_train = pd.read_csv(file_path)
        self.classLabels = self.classLabels.split(" ")
        self.clf = MultiOutputClassifier(DecisionTreeClassifier()).fit(
            X_train, y_train)

    def recommend(self, X):
        preds = self.clf.predict([X])
        recommends = [
            item for item, pred in zip(self.classLabels, preds[0]) if pred == 1
        ]
        return recommends
Example #12
def knn_multi_output(x, y):
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

    from sklearn.multioutput import MultiOutputClassifier
    clf = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=1)).fit(
        x_train, y_train)
    y_pred = clf.predict(x_test)

    for i in range(len(BASE_GENRES)):
        auc = roc_auc_score(y_test[:, i], y_pred[:, i])
        print("AUC %s: %.4f" % (BASE_GENRES[i], auc))

    f1s = []
    tprs = []
    for genre in range(len(BASE_GENRES)):
        TP = 0
        FP = 0
        TN = 0
        FN = 0
        genre = BASE_GENRES[genre]
        for i in range(len(y_test)):
            truth_genres = true_false_to_genres(y_test[i])
            pred_genres = true_false_to_genres(y_pred[i])
            if genre in truth_genres:
                if genre in pred_genres:
                    TP += 1
                else:
                    FN += 1
            else:
                if genre in pred_genres:
                    FP += 1
                else:
                    TN += 1
        print("Confusion Matrix of ", genre)
        get_confusion_matrix(TP, FP, TN, FN)
        f1s.append(get_f1(TP, FP, FN))
        tprs.append(get_tpr(TP, FN))

    print("av f1", np.array(f1s).mean())
    print("av tpr", np.array(tprs).mean())
Example #13
 def return_metrics2(X, y, classifier):
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         test_size=0.3)
     if classifier == "KNN":
         clf = MultiOutputClassifier(KNeighborsClassifier()).fit(
             X_train, y_train)
     elif classifier == "DTC":
         clf = MultiOutputClassifier(DecisionTreeClassifier()).fit(
             X_train, y_train)
     elif classifier == "ETC":
         clf = MultiOutputClassifier(ExtraTreeClassifier()).fit(
             X_train, y_train)
     elif classifier == "RFC":
         clf = MultiOutputClassifier(RandomForestClassifier()).fit(
             X_train, y_train)
     else:
         clf = MultiOutputClassifier(KNeighborsClassifier()).fit(
             X_train, y_train)
     y_pred = clf.predict(X_test)
     accuracy = 0
     shape = y_pred.shape
     for idxRow in range(shape[0]):
         trueValCount = 0
         size = 0
         for idxCol in range(shape[1]):
             if y_test[idxRow][idxCol] == y_pred[idxRow][idxCol]:
                 trueValCount += 1
         lineAccuracy = trueValCount / shape[1] if (trueValCount > 0) else 0
         accuracy += lineAccuracy
         # print('{0} -> {1} -> {2}/{3}'.format(idxRow,lineAccuracy,trueValCount,size))
         # print(y_test[idxRow])
         # print(y_pred[idxRow])
     print("AVG Accuracy")
     print(accuracy / shape[0] if accuracy > 0 else 0)
     print("Hamming Loss")
     print(hamming_loss(y_test, y_pred))
     return hamming_loss(y_test, y_pred) * 100
Example #14
def run_classifier(data,seed,include_bigrams=True):
    vec = CountVectorizer(ngram_range=(1, 1), lowercase=True)
    if include_bigrams:
        vec = CountVectorizer(ngram_range=(1, 2), lowercase=True)
    X_train = vec.fit_transform(data['X_train'])
    X_eval = vec.transform(data['X_eval'])
    y_train = data['y_train']
    y_eval = data['y_eval']
    label_names = list(y_eval.columns)

    clf = MultiOutputClassifier(LogisticRegression(solver='saga',random_state=seed))
    clf.fit(X_train,y_train)

    y_pred = clf.predict(X_eval)
    # restored so the values returned below are defined
    support_train = y_train.sum(axis=1)
    support_eval = y_eval.sum(axis=1)
    macro_f1 = f1_score(y_eval, y_pred, average='macro')
    all_f1 = f1_score(y_eval, y_pred, average=None)
    print(classification_report(y_eval, y_pred, target_names=label_names))
    lrap = label_ranking_average_precision_score(y_eval, y_pred)
    print(macro_f1)
    print(all_f1)
    return macro_f1, all_f1, lrap, support_train, support_eval
Example #15
                             leaf_size=10,
                             n_jobs=-1))
else:
    model = MultiOutputRegressor(
        KNeighborsRegressor(n_neighbors=3,
                            weights="uniform",
                            leaf_size=10,
                            n_jobs=-1))

# train the model
model.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])

# In[2]: Collect the predictions

# predict training and testing data
train_predict = pd.DataFrame(model.predict(X.iloc[train_idx, :]),
                             columns=Y.columns)
test_predict = pd.DataFrame(model.predict(X.iloc[test_idx, :]),
                            columns=Y.columns)

# reshape all of the predictions into a single table
predictions = pd.DataFrame()
for j in range(outputs):
    # collect training data
    predict_j = np.array(train_predict.iloc[:, j])
    actual_j = np.array(Y.iloc[train_idx, j])
    name_j = Y.columns[j]
    data_j = "Train"
    predictions = pd.concat([
        predictions,
        pd.DataFrame({
Example #16
mo_X_train_A, mo_X_test_A, mo_t_train_A, mo_t_test_A = train_test_split(
    mo_training_set_A_features,
    mo_training_set_A_labels,
    test_size=0.25,
    random_state=42)

mo_X_train_B, mo_X_test_B, mo_t_train_B, mo_t_test_B = train_test_split(
    mo_training_set_B_features,
    mo_training_set_B_labels,
    test_size=0.25,
    random_state=42)

multi_output_class_A = MultiOutputClassifier(KNeighborsClassifier()).fit(
    mo_X_train_A, mo_t_train_A)
multi_output_class_A_pred = multi_output_class_A.predict(mo_X_test_A)
err_multi_output_class_A = mean_squared_error(mo_t_test_A,
                                              multi_output_class_A_pred)

multi_output_class_B = MultiOutputClassifier(KNeighborsClassifier()).fit(
    mo_X_train_B, mo_t_train_B)
multi_output_class_B_pred = multi_output_class_B.predict(mo_X_test_B)
err_multi_output_class_B = mean_squared_error(mo_t_test_B,
                                              multi_output_class_B_pred)

print("Multi Output classification error - System A: ",
      err_multi_output_class_A)
print("Multi Output classification error - System B: ",
      err_multi_output_class_B)

df_A = pd.DataFrame(multi_output_class_A_pred,
Example #17
from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(random_state=42)  # random forest classifier
rf.fit(X_train, y_train)  # fit the random forest model on the train data
# rf_predictions = rf.predict(X_test) # feed X_test into the fitted model to generate y_test predictions
# print(rf_predictions)

# evaluate the classifier - rf model before gridsearchcv
from sklearn.multioutput import MultiOutputClassifier
# package that enables multi-output classification

rf_classifier = MultiOutputClassifier(rf, n_jobs=1)
rf_classifier.fit(X_train, y_train)  # fit the multi-output model on the train data

rf_predictions2 = rf_classifier.predict(X_test)  # predict y_test from X_test with the fitted model
print(rf_predictions2)

print(rf_classifier.score(X_train, y_train))  # training set accuracy: 94.91%

# GridSearchCV: runs cross-validation and the parameter search at the same time
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 200, 300], 'max_depth': [5, 10, 20]}

forest_reg = RandomForestClassifier()
grid_search = GridSearchCV(forest_reg,
                           param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error')
print(grid_search.fit(X_train, y_train))
Example #18
class MultilabelTraining:

    X_COLUMN_NAME = "page_text_extract"

    DEFAULT_TARGET_THEMES = [
        5,
        6,
        26,
        33,
        139,
        163,
        232,
        313,
        339,
        350,
        406,
        409,
        555,
        589,
        597,
        634,
        660,
        695,
        729,
        766,
        773,
        793,
        800,
        810,
        852,
        895,
        951,
        975,
    ]

    OTHER_THEMES_VALUE = 4242

    def __init__(
        self,
        df=pd.DataFrame(),
        x_column_name=X_COLUMN_NAME,
        group_processes=True,
        classifier=PassiveAggressiveClassifier(random_state=42),
        vectorizer=HashingVectorizer(n_features=2**14),
        target_themes=DEFAULT_TARGET_THEMES,
        other_themes_value=OTHER_THEMES_VALUE,
        remove_processes_without_theme=True,
        is_incremental_training=False,
        vocab_path="",
    ):
        self.is_incremental_training = is_incremental_training
        self.vocab_path = vocab_path
        self.remove_processes_without_theme = remove_processes_without_theme
        self.mo_classifier = MultiOutputClassifier(classifier, n_jobs=-1)
        self.classifier = classifier
        self.vectorizer = vectorizer
        self.target_themes = target_themes
        self.other_themes_value = other_themes_value
        self.group_processes = group_processes
        self.x_column_name = x_column_name
        self._initialize_dataframe(df)

    def _initialize_dataframe(self, df):
        if not df.empty:
            self.dp = DataframePreprocessing(
                df.copy(),
                group_processes=self.group_processes,
                x_column_name=self.x_column_name,
                target_themes=self.target_themes,
                other_themes_value=self.other_themes_value,
                is_incremental_training=self.is_incremental_training,
                remove_processes_without_theme=self.remove_processes_without_theme,
                vocab_path=self.vocab_path,
            )
            self.y_columns_names = self.dp.distinct_themes
            self.df = self.dp.processed_df
        else:
            self.df = df

    def _split(self, X, y):
        print("Splitting dataset...")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, stratify=y, test_size=0.2, random_state=42)

    def _vectorize(self, X_train):
        print("Vectorizing...")
        return self.vectorizer.fit_transform(X_train)

    def train(self, split_df=False):
        print("Training...")
        self.X_train, self.y_train = (
            self.df[self.x_column_name],
            self.df[self.y_columns_names],
        )
        if split_df:
            self._split(self.X_train, self.y_train)
        vector = self._vectorize(self.X_train)
        self.mo_classifier.fit(vector, self.y_train)
        if split_df:
            vector_test = self._vectorize(self.X_test)
            self.y_pred = self.mo_classifier.predict(vector_test)
            metrics = get_multilabel_metrics(self.y_test, self.y_pred)
            return metrics
        return None

    def _update_dataframe(self,
                          df,
                          is_incremental_training=True,
                          is_parquet=False,
                          labels_freq={}):
        self.dp = DataframePreprocessing(
            df.copy(),
            x_column_name=self.x_column_name,
            group_processes=self.group_processes,
            target_themes=self.target_themes,
            other_themes_value=self.other_themes_value,
            is_incremental_training=is_incremental_training,
            remove_processes_without_theme=self.remove_processes_without_theme,
            is_parquet=is_parquet,
            vocab_path=self.vocab_path,
            labels_freq=labels_freq,
        )
        self.df = self.dp.processed_df

    def incremental_train(self, df_path, nrows=5000):
        print("Training incrementally...")
        columns_names = pd.read_csv(df_path, nrows=1).columns.tolist()
        skiprows = 1
        classes, labels_freq = DataframePreprocessing(
            target_themes=self.target_themes).get_unique_binarized_labels(
                df_path, "tema")
        while True:
            df = pd.read_csv(
                df_path,
                nrows=nrows,
                skiprows=skiprows,
                header=None,
                names=columns_names,
            )
            if df.empty:
                break
            self._update_dataframe(df, labels_freq=labels_freq)
            X_train, y_train = (
                self.df[self.x_column_name],
                self.df[self.target_themes + [self.other_themes_value]],
            )
            vector = self._vectorize(X_train)
            self.mo_classifier.partial_fit(vector, y_train, classes=classes)
            skiprows += nrows
            print("{} rows already trained\n".format(skiprows - 1))

    def incremental_train_with_parquet(self, parquet_path):
        print("Training incrementally with parquet...")
        nrows = 0
        pf = ParquetFile(parquet_path)
        classes, labels_freq = DataframePreprocessing(
            target_themes=self.target_themes).get_unique_binarized_labels(
                parquet_path, "tema", True)
        for df in pf.iter_row_groups():
            df = df.reset_index()
            self._update_dataframe(df,
                                   is_parquet=True,
                                   labels_freq=labels_freq)
            X_train, y_train = (
                self.df[self.x_column_name],
                self.df[self.target_themes + [self.other_themes_value]],
            )
            vector = self._vectorize(X_train)
            self.mo_classifier.partial_fit(vector.toarray(),
                                           y_train,
                                           classes=classes)
            nrows += len(self.df)
            print("{} rows already trained\n".format(nrows))
            clear_output(wait=True)

    def predict(self):
        return self.mo_classifier.predict(
            self._vectorize(self.X_test).todense())

    def set_X_test(self, X):
        self.X_test = X

    def set_y_test(self, y):
        self.y_test = y

    def get_pickle(self):
        return pickle.dumps(self.mo_classifier)
Example #19
X_train_embeds, X_val_embeds = [
    WE.get_sentence_vector(tokenized_sentence(x), vector_dict, stopwords=STOPWORDS)
    for x in raw_X_train
], [
    WE.get_sentence_vector(tokenized_sentence(x), vector_dict, stopwords=STOPWORDS)
    for x in raw_X_val
]


lr_embed_clf = MultiOutputClassifier(
    LogisticRegression(
        max_iter=300, multi_class="multinomial", penalty="none", solver="lbfgs"
    )
).fit(X_train_embeds, y_train)
print(hamming_loss(y_val, lr_embed_clf.predict(X_val_embeds)))
print(classification_report(y_val, lr_embed_clf.predict(X_val_embeds)))
## Seeing where no prediction was made
null_predictions = len(
    [i for i in lr_embed_clf.predict(X_val_embeds) if not np.any(np.nonzero(i))]
)
print(f"{null_predictions} out of {len(y_val)} predictions were null.")

dub_ref_model = lr_embed_clf.estimators_[4]
vocab, id2tok, tok2id = get_vocab(train_dataset)
target_label = "dubious reference"
BATCH_SIZE = 1
pred = []
actual = []
vectors = []
for batch, targets, lengths, raw_data in create_dataset(
Example #20
        pass

########################################
############ CLASSIFICATION ############
########################################

########################################
# LOGISTIC REGRESSION
########################################
if ML_option == "Logistic Regression":
    # Fit the model and predict X_test. Show some analysis.

    try:
        logReg = MultiOutputClassifier(LogisticRegression())
        logReg.fit(X_train, y_train)
        pred = logReg.predict(X_test)
        st.write('Mean Absolute Error (MAE):',
                 round(metrics.mean_absolute_error(y_test, pred), 4))
        st.write('Mean Squared Error (MSE):',
                 round(metrics.mean_squared_error(y_test, pred), 4))
        st.write('Root Mean Squared Error (RMSE):',
                 round(np.sqrt(metrics.mean_squared_error(y_test, pred)), 4))
        st.write('Accuracy of Logistic Regression on training set: ',
                 round(logReg.score(X_train, y_train), 4))
        st.write('Accuracy of Logistic Regression on test set: ',
                 round(logReg.score(X_test, y_test), 4))

        st.subheader("Classification Report")
        st.text(classification_report(y_test, pred))

        try:
Example #21
class ReedMullerMultiClass(ClassifierMixin):
    @_deprecate_positional_args
    def __init__(self, estimator, *, n_jobs=None):
        # FIXME: Check estimator has predict_proba method
        self.multi_output = MultiOutputClassifier(estimator, n_jobs=n_jobs)

    def fit(self, X, Y, sample_weight=None, **fit_params):
        self.classes_ = np.unique(Y)
        n_classes = len(self.classes_)
        if n_classes < 3:
            pass  # Fixme: Raise warning? Exception?
        self.class_to_index = dict((c, i) for i, c in enumerate(self.classes_))
        # Choose Reed Muller parameters in function of n_classes
        r, m = self._rm_policy(n_classes)
        self.rm = ReedMullerCodec(r, m, limit=n_classes)
        Y = self.encode_labels(Y)
        self.multi_output.fit(X, Y, sample_weight, **fit_params)
        return self

    def decision_function(self, X):
        check_is_fitted(self)

        Y = self.multi_output.predict(X)
        return self.decode_log_proba(Y)

    def predict_proba(self, X):
        check_is_fitted(self)

        Y = self.multi_output.predict(X)
        Y = np.exp(self.decode_log_proba(Y))
        Y = Y / Y.sum(axis=1, keepdims=True)  # keepdims so the row-wise normalization broadcasts
        return Y

    def predict(self, X):
        check_is_fitted(self)

        Y = self.multi_output.predict(X)
        Y = self.decode_log_proba(Y).argmax(axis=1)
        return np.array([self.classes_[i] for i in Y])

    def encode_labels(self, Y):
        Y = (self.class_to_index[c] for c in Y)  # Encode classes as integers
        Y = np.array([self.rm.encode(i)
                      for i in Y])  # Encode integers as an RM ECC
        return Y

    def decode_log_proba(self, Y):
        Z = np.empty((len(Y), len(self.classes_)))
        for i, bits in enumerate(Y):
            Z[i] = self.rm.decode_log_proba(bits)
        return Z

    @staticmethod
    def _rm_options():
        m = 0
        # For a small number of classes, give order 1 RM codes
        for m in range(1, 4):
            yield 1, m, m + 1
        # For a larger number of classes, give order 2 RM codes
        m = 3
        while True:
            yield 2, m, int((m * (m - 1)) / 2) + m + 1
            m += 1

    @classmethod
    def _rm_policy(cls, n_classes):
        for r, m, rows in cls._rm_options():
            if 2**rows >= n_classes:
                return r, m
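Example #21 hand-rolls an error-correcting output code on top of MultiOutputClassifier. For comparison only, scikit-learn ships a generic ECOC meta-estimator with random (not Reed-Muller) code books; this sketch illustrates that built-in, not the class above:

from sklearn.datasets import load_iris
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
# code_size=2 uses 2 * n_classes code bits, each fit by one binary LinearSVC
ecoc = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
print(ecoc.fit(X, y).predict(X[:5]))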
Example #22
    highest_acc = 0
    acc = 0
    for i in range(1, 100):
        for j in range(1, 100):
            for k in range(50, 100):
                for l in range(len(x_train)):
                    i, j, k = 6, 11, 59  # NOTE: overwrites the loop indices, pinning the search to one setting
                    forest = ensemble.RandomForestClassifier(
                        n_estimators=i,
                        random_state=42,
                        max_features=j,
                        min_samples_leaf=k)
                    multi_target_forest = MultiOutputClassifier(forest,
                                                                n_jobs=-1)
                    multi_target_forest.fit(x_train[l], y_train[l])
                    pred_ud = multi_target_forest.predict(x_test[l])

                    class_names = ['down', 'balance', 'up']
                    # Compute confusion matrix
                    cnf_matrix = confusion_matrix(y_test[l].flatten(),
                                                  pred_ud.flatten())
                    np.set_printoptions(precision=2)
                    acc += np.trace(cnf_matrix, dtype='float32') / np.sum(
                        np.sum(cnf_matrix, dtype='float32'))

                acc /= len(x_train)
                if acc >= highest_acc:
                    highest_acc = acc
                    print('acc->' + str(acc))
                    print('n_estimators->' + str(i))
                    print('max_features->' + str(j))
Example #23
X_test = PCA(n_components=2).fit_transform(X_test)

ax2.set_title('Test labels')
ax2.scatter(X_test[:, 0],
            X_test[:, 1],
            c=np.sum(Y_test * np.array([1, 2, 3, 4, 5]), axis=1))
ax2.set_xlabel('Feature 0 count')

forest = RandomForestClassifier(n_estimators=100, random_state=1)
decision = DecisionTreeClassifier()

# training step
multi_target_R = MultiOutputClassifier(forest, n_jobs=-1)
result_R = multi_target_R.fit(X, Y)
result_R = multi_target_R.predict(X_test)
score_R = multi_target_R.score(X_test, Y_test)

multi_target_D = MultiOutputClassifier(decision, n_jobs=-1)
multi_target_D = multi_target_D.fit(X, Y)
result_D = multi_target_D.predict(X_test)
score_D = multi_target_D.score(X_test, Y_test)

# Plot classification result
ax3.scatter(X_test[:, 0],
            X_test[:, 1],
            c=np.sum(result_D * np.array([1, 2, 3, 4, 5]), axis=1))
ax3.set_title('Decision Tree labels %0.2f' % score_D)
ax3.set_ylabel('Feature 1 count')
ax3.set_xlabel('Feature 0 count')
X_w_D = []
Example #24
if __name__ == "__main__":

    mlb = MultiLabelBinarizer()

    # also consider data/all_remove.train
    outfile = "results/linear.txt"
    X_train,y_train,X_test,y_test,test_sents = prepare_data("data/all.train", "data/validation/all_validation", mlb)
    
    #clf = LogisticRegression(verbose=1, solver="sag", class_weight={0:0.1})
    
    clf = SGDClassifier(verbose=1, n_jobs=10, loss="log", class_weight={0:0.1})
    
    multi_clf = MultiOutputClassifier(clf, n_jobs=1)
    multi_clf.fit(X_train, y_train)
    
    preds = multi_clf.predict(X_test)
    print(preds.shape)

    y_preds = mlb.inverse_transform(preds)
    print(y_preds)
    
    with open(outfile, "w") as out:
        for sent,pred in zip(test_sents, y_preds):
            pred = list(pred)
            if len(pred) == 0:
                pred = ["unmatched"]
            if len(pred) > 1 and "unmatched" in pred:
                pred.remove("unmatched")
            print("{}\t{}".format(sent,",".join(pred)), file=out)

Example #25
x = np.load('./data/x_data.npy')
y = np.load('./data/y_data.npy')
x_pred = np.load('./data/x_pred.npy')

print("x.shape :", x.shape)
print("y.shape :", y.shape)
print("x_pred.shape :", x_pred.shape)

x = x.reshape(x.shape[0], 64 * 64 * 3)

x_pred = x_pred.reshape(x_pred.shape[0], 64 * 64 * 3)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=77,
                                                    shuffle=True)

# model = XGBClassifier()
model = MultiOutputClassifier(XGBRFClassifier())

# 3. training
model.fit(x_train, y_train)

# 4. evaluate, predict
acc = model.score(x_test, y_test)

print("acc :", acc)

y_pred = model.predict(x_pred)
Example #26
title_words = [word for title in dataset['title'] for word in title.split()]
normalized_titles = [normalize(title) for title in dataset['title']]

# VECTORIZE YOUR DATA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
feature_matrix = vectorizer.fit_transform(normalized_titles)

multiLabelBinarizer = MultiLabelBinarizer()
labels = multiLabelBinarizer.fit_transform(job_functions)

# BUILD THE MODEL
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(feature_matrix,
                                                    labels,
                                                    test_size=0.2)
estimator = SVC()
model = MultiOutputClassifier(estimator)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# EVALUATE YOUR ALGORITHM
from sklearn.metrics import f1_score

score = f1_score(y_test, y_pred, average='weighted')
Example #27
                return postostr(lignTOpos(j)) + ' => ' + postostr(lignTOpos(k))
    
for m in range(len(Matrix)):
    x = []
    y = []
    c = 0
    for i in range(8):
        for j in range(8):
            Mx[m][c] = Matrix[m][i][j] # flatten the position matrix into a single row
            My[m][c] = Matrix_Y[m][i][j]
            c += 1
            
List_coups = np.zeros((n_positions,1))
for k in range(len(Matrix)):
    List_coups[k,0] = detect_coup(Matrix[k],Matrix_Y[k])

DFx = pd.DataFrame(Mx)
DFy = pd.DataFrame(List_coups)


trainData,testData,trainY,testY = train_test_split(Mx,List_coups,test_size=0.1) # split into training and test data
kNN = KNeighborsClassifier(n_neighbors=3,algorithm='kd_tree',metric='minkowski',p=2,n_jobs=-1)
classifier = MultiOutputClassifier(kNN, n_jobs=-1)
classifier.fit(trainData,trainY) # train the classifier
trainPredictionsk = classifier.predict(trainData) # predict the outputs on the training data

trainCMk = confusion_matrix(y_pred=trainPredictionsk,y_true=trainY) # compute the confusion matrix
testpredict = classifier.predict(testData)
testCM = confusion_matrix(y_pred=testpredict,y_true=testY) # confusion matrix: correct predictions are counted on the diagonal, the rest elsewhere
print('Score = ' + str(testCM.trace()/sum(sum(testCM)))) # compute the score
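The trace-over-sum score at the end of Example #27 is exactly accuracy. A tiny sketch (illustrative arrays) showing the equivalence with scikit-learn's metric:

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

y_true = np.array([0, 1, 2, 1])
y_pred = np.array([0, 2, 2, 1])
cm = confusion_matrix(y_true, y_pred)
assert cm.trace() / cm.sum() == accuracy_score(y_true, y_pred)  # both 0.75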
Example #28
readings_train = df_train.iloc[:, :-3]
subj_train = df_train.iloc[:, -3]
activity_train = df_train.iloc[:, -2]
subj_activity_train = pd.DataFrame({
    'subject': subj_train,
    'activity_id': activity_train
})

# step 1.2 - fit the model to predict subject
print('Fitting model to predict subject ...')
clf = GaussianNB()
clf_multi = MultiOutputClassifier(clf)
time_bgn = time.time()
clf_multi.fit(readings_train, subj_activity_train)
dur_train_both = time.time() - time_bgn
predicted_subj_activity_train = clf_multi.predict(readings_train)
predicted_subj_activity_train = pd.DataFrame({
    'subject':
    predicted_subj_activity_train[:, 1],
    'activity_id':
    predicted_subj_activity_train[:, 0]
})
predicted_subj = predicted_subj_activity_train.iloc[:, 1]
predicted_activity = predicted_subj_activity_train.iloc[:, 0]
predicted_subj_activity_train = (100 * predicted_subj) + predicted_activity

# step 2.1 - get the readings data (from data stratified using subject)
readings_test = df_test.iloc[:, :-3]
subj_test = df_test.iloc[:, -3]
activity_test = df_test.iloc[:, -2]
subj_activity_test = pd.DataFrame({
Example #29
train_labels_oh = pd.get_dummies(train_labels,columns=["Unfallschwere"])

if validation:
    # one-hot encoding of the validation set
    test_val_labels = test_val_data["Unfallschwere"]
    test_val_data.drop(["Unfallschwere"],axis=1, inplace=True)
    test_labels_oh = pd.get_dummies(test_val_labels,columns=["Unfallschwere"])


    ### Model Training ###

    # random forest and decision tree classifiers as a comparison for the neural network
    forest = RandomForestClassifier(n_estimators=100)
    multi_target_forest = MultiOutputClassifier(forest)
    multi_target_forest.fit(train_data, train_labels_oh)
    Y_pred = multi_target_forest.predict(test_val_data)

    # Metrics
    print(np.round(accuracy_score(test_labels_oh.values.argmax(axis=1), Y_pred.argmax(axis=1)),2),"accuracy")
    print(np.round(f1_score(test_labels_oh.values.argmax(axis=1), Y_pred.argmax(axis=1),average=None),2),"f1 score")
    print(np.round(f1_score(test_labels_oh.values.argmax(axis=1), Y_pred.argmax(axis=1),average="weighted"),2),"f1 score weighted")
    print(np.round(f1_score(test_labels_oh.values.argmax(axis=1), Y_pred.argmax(axis=1),average="macro"),2),"f1 score macro")
    print(confusion_matrix(test_labels_oh.values.argmax(axis=1), Y_pred.argmax(axis=1)))


    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(train_data, train_labels_oh.values)
    Y_pred = decision_tree.predict(test_val_data)

    # Metrics
    print(np.round(accuracy_score(test_labels_oh.values.argmax(axis=1), Y_pred.argmax(axis=1)),2),"accuracy")
Example #30
audios = np.unique(mfcc_audio["Audio"])
train_audio, test_audio = train_test_split(
    audios, train_size=0.7, test_size=0.3, random_state=0)

X_train = mfcc_audio[mfcc_audio["Audio"].isin(train_audio)]
X_test = mfcc_audio[mfcc_audio["Audio"].isin(test_audio)]
y_train = X_train[columns]
y_test = X_test[columns]

X_train.drop(columns + ["Audio"], inplace=True, axis=1)
X_test.drop(columns + ["Audio"], inplace=True, axis=1)

mor = MultiOutputClassifier(
    RandomForestClassifier(random_state=0, n_estimators=1000), n_jobs=-1)
mor.fit(X_train, y_train)
mor_pred = mor.predict(X_test)

dummy = DummyClassifier()
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

estimators = mor.estimators_

for i, col in enumerate(columns):

    true = y_test[col]
    pred = mor_pred[:, i]
    d_p = dummy_pred[:, i]

    print(col)
    print("accuracy score")
Example #31
    vectorizer2.fit_transform(X), y, test_size=0.2, random_state=42)
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    vectorizer3.fit_transform(X), y, test_size=0.2, random_state=42)
classifiers = {
    "LinearSVC": LinearSVC(),
    "MultinomialNB": MultinomialNB(),
    "Perceptron": Perceptron(n_iter=50)
}

for i in classifiers.keys():
    clf = MultiOutputClassifier(classifiers[i]).fit(X_train, y_train)

    clf2 = MultiOutputClassifier(classifiers[i]).fit(X_train2, y_train2)
    clf3 = MultiOutputClassifier(classifiers[i]).fit(X_train3, y_train3)

    yhat = clf.predict(X_test)
    yhat2 = clf2.predict(X_test2)
    yhat3 = clf3.predict(X_test3)

    print i, "unigram"
    print "f1_score", f1_score(y_test, yhat, average='samples')
    print "jaccard_similarity_score", jaccard_similarity_score(y_test, yhat)
    print "accuracy_score", accuracy_score(y_test, yhat)
    print "precision_score", precision_score(y_test, yhat, average='samples')
    print "recall_score", recall_score(y_test, yhat, average='samples')
    print "********"

    print i, "bigram"
    print "f1_score", f1_score(y_test2, yhat2, average='samples')
    print "jaccard_similarity_score", jaccard_similarity_score(y_test2, yhat2)
    print "accuracy_score", accuracy_score(y_test2, yhat2)
Example #32
Y_train_multi = np.column_stack([train_EI, train_NS, train_FT, train_JP])

test_EI = np.zeros(len(Y_test), dtype=np.bool)
test_NS = np.zeros(len(Y_test), dtype=np.bool)
test_FT = np.zeros(len(Y_test), dtype=np.bool)
test_JP = np.zeros(len(Y_test), dtype=np.bool)

test_EI[np.isin(Y_test, [0,2,3,6,8,0,10,11])] = 1
test_NS[np.isin(Y_test, [8,9,10,11,12,13,14,15])] = 1
test_FT[np.isin(Y_test, [1,2,3,4,9,11,12,14])] = 1
test_JP[np.isin(Y_test, [1,2,6,7,8,9,12,13])] = 1

Y_test_multi = np.column_stack([test_EI, test_NS, test_FT, test_JP])


rfc_multi = RandomForestClassifier(n_estimators=100, max_features=100, class_weight="balanced", verbose=1, n_jobs=7)
rfc_multi_out = MultiOutputClassifier(rfc_multi)
rfc_multi_out.fit(X_train, Y_train_multi)

multi_predictions = rfc_multi_out.predict(X_test)

np.logical_and(multi_predictions, Y_test_multi)

#cm = confusion_matrix(Y_test_multi, multi_predictions)
#cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

#fig, ax = plt.subplots()
#ax.imshow(cm, interpolation='nearest')
#plt.xticks(range(len(unique_type_list)), unique_type_list)
#plt.yticks(range(len(unique_type_list)), unique_type_list)
Example #33
    model = MultiOutputRegressor(
        LassoCV(eps=1e-9,
                n_alphas=20,
                cv=3,
                tol=1e-4,
                max_iter=500,
                random_state=42,
                n_jobs=1))

# train the model
model.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])

# In[2]: Collect the predictions

# predict training and testing data
train_predict = pd.DataFrame(model.predict(X.iloc[train_idx, :]),
                             columns=Y.columns)
test_predict = pd.DataFrame(model.predict(X.iloc[test_idx, :]),
                            columns=Y.columns)

# reshape all of the predictions into a single table
predictions = pd.DataFrame()
for j in range(outputs):
    # collect training data
    predict_j = np.array(train_predict.iloc[:, j])
    actual_j = np.array(Y.iloc[train_idx, j])
    name_j = Y.columns[j]
    data_j = "Train"
    predictions = pd.concat([
        predictions,
        pd.DataFrame({
Example #34
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay

X_train = np.array(dfTrain.fabeec.to_list())
y_train = np.array(dfTrain.new_label.to_list())
# Create the SVM
svm = LinearSVC(random_state=42)
# Make it an Multilabel classifier
multilabel_classifier = MultiOutputClassifier(svm, n_jobs=-1)

# Fit the data to the Multilabel classifier
multilabel_classifier = multilabel_classifier.fit(X_train, y_train)


X_test = np.array(dfTest.fabeec.to_list())
# Get predictions for test data
y_test_pred = multilabel_classifier.predict(X_test)

predicted_res = multilabel_classifier.predict(X_train)
train_f1 = f1_score(dfTrain.new_label.to_list(), predicted_res, average='macro')

from joblib import dump, load
dump(multilabel_classifier, 'svm.joblib')
!cp 'svm.joblib' '/content/drive/MyDrive/svm.joblib'
print(train_f1)

import pickle
# pickle.dumps takes a protocol, not a filename; dump to open files instead
with open('decision_tree.joblib', 'wb') as f:
    pickle.dump(clf_1, f)  # clf_1 is defined elsewhere in the original notebook

with open('svm.joblib', 'wb') as f:
    pickle.dump(multilabel_classifier, f)

!cp 'decision_tree.joblib' '/content/drive/MyDrive/decision_tree.joblib'
Example #35
class Igel(object):
    """
    Igel is the base model to use the fit, evaluate and predict functions of the sklearn library
    """

    available_commands = ('fit', 'evaluate', 'predict', 'experiment')
    supported_types = ('regression', 'classification', 'clustering')
    results_path = configs.get('results_path')  # path to the results folder
    default_model_path = configs.get(
        'default_model_path')  # path to the pre-fitted model
    description_file = configs.get(
        'description_file')  # path to the description.json file
    evaluation_file = configs.get(
        'evaluation_file')  # path to the evaluation.json file
    prediction_file = configs.get(
        'prediction_file')  # path to the predictions.csv
    default_dataset_props = configs.get(
        'dataset_props'
    )  # dataset props that can be changed from the yaml file
    default_model_props = configs.get(
        'model_props')  # model props that can be changed from the yaml file
    model = None

    def __init__(self, **cli_args):
        logger.info(f"Entered CLI args: {cli_args}")
        logger.info(f"Executing command: {cli_args.get('cmd')} ...")
        self.data_path: str = cli_args.get('data_path')  # path to the dataset
        logger.info(f"reading data from {self.data_path}")
        self.command = cli_args.get('cmd', None)
        if not self.command or self.command not in self.available_commands:
            raise Exception(f"You must enter a valid command.\n"
                            f"available commands: {self.available_commands}")

        if self.command == "fit":
            self.yml_path = cli_args.get('yaml_path')
            file_ext = self.yml_path.split('.')[-1]
            logger.info(f"You passed the configurations as a {file_ext} file.")

            self.yaml_configs = read_yaml(
                self.yml_path) if file_ext == 'yaml' else read_json(
                    self.yml_path)
            logger.info(f"your chosen configuration: {self.yaml_configs}")

            # dataset options given by the user
            self.dataset_props: dict = self.yaml_configs.get(
                'dataset', self.default_dataset_props)
            # model options given by the user
            self.model_props: dict = self.yaml_configs.get(
                'model', self.default_model_props)
            # list of target(s) to predict
            self.target: list = self.yaml_configs.get('target')

            self.model_type: str = self.model_props.get('type')
            logger.info(f"dataset_props: {self.dataset_props} \n"
                        f"model_props: {self.model_props} \n "
                        f"target: {self.target} \n")

        # if entered command is evaluate or predict, then the pre-fitted model needs to be loaded and used
        else:
            self.model_path = cli_args.get('model_path',
                                           self.default_model_path)
            logger.info(f"path of the pre-fitted model => {self.model_path}")
            # load description file to read stored training parameters
            with open(self.description_file, 'r') as f:
                dic = json.load(f)
                self.target: list = dic.get(
                    "target")  # target to predict as a list
                self.model_type: str = dic.get(
                    "type"
                )  # type of the model -> regression or classification
                self.dataset_props: dict = dic.get(
                    'dataset_props')  # dataset props entered while fitting
        getattr(self, self.command)()

    def _create_model(self, **kwargs):
        """
        fetch a model depending on the provided type and algorithm by the user and return it
        @return: class of the chosen model
        """
        model_type: str = self.model_props.get('type')
        model_algorithm: str = self.model_props.get('algorithm')
        use_cv = self.model_props.get('use_cv_estimator', None)

        model_args = None
        if not model_type or not model_algorithm:
            raise Exception(f"model_type and algorithm cannot be None")
        algorithms: dict = models_dict.get(
            model_type)  # extract all algorithms as a dictionary
        model = algorithms.get(
            model_algorithm)  # extract model class depending on the algorithm
        logger.info(
            f"Solving a {model_type} problem using ===> {model_algorithm}")
        if not model:
            raise Exception("Model not found in the algorithms list")
        else:
            model_props_args = self.model_props.get('arguments', None)
            if model_props_args and type(model_props_args) == dict:
                model_args = model_props_args
            elif not model_props_args or model_props_args.lower() == "default":
                model_args = None

            if use_cv:
                model_class = model.get('cv_class', None)
                if model_class:
                    logger.info(
                        f"cross validation estimator detected. "
                        f"Switch to the CV version of the {model_algorithm} algorithm"
                    )
                else:
                    logger.info(
                        f"No CV class found for the {model_algorithm} algorithm"
                    )
            else:
                model_class = model.get('class')
            logger.info(f"model arguments: \n"
                        f"{self.model_props.get('arguments')}")
            model = model_class(**kwargs) if not model_args else model_class(
                **model_args)
            return model, model_args

    def _save_model(self, model):
        """
        save the model to a binary file
        @param model: model to save
        @return: bool
        """
        try:
            if not os.path.exists(self.results_path):
                logger.info(
                    f"creating model_results folder to save results...\n"
                    f"path of the results folder: {self.results_path}")
                os.mkdir(self.results_path)
            else:
                logger.info(f"Folder {self.results_path} already exists")
                logger.warning(
                    f"data in the {self.results_path} folder will be overridden. If you don't "
                    f"want this, then move the current {self.results_path} to another path"
                )

        except OSError:
            logger.exception(
                f"Creating the directory {self.results_path} failed ")
        else:
            logger.info(
                f"Successfully created the directory in {self.results_path} ")
            pickle.dump(model, open(self.default_model_path, 'wb'))
            return True

    def _load_model(self, f: str = ''):
        """
        load a saved model from file
        @param f: path to model
        @return: loaded model
        """
        try:
            if not f:
                logger.info(f"result path: {self.results_path} ")
                logger.info(f"loading model form {self.default_model_path} ")
                model = pickle.load(open(self.default_model_path, 'rb'))
            else:
                logger.info(f"loading from {f}")
                model = pickle.load(open(f, 'rb'))
            return model
        except FileNotFoundError:
            logger.error(f"File not found in {self.default_model_path} ")

    def _prepare_fit_data(self):
        return self._process_data(target='fit')

    def _prepare_eval_data(self):
        return self._process_data(target='evaluate')

    def _process_data(self, target='fit'):
        """
        read and return data as x and y
        @return: list of separate x and y
        """
        assert isinstance(self.target,
                          list), "provide target(s) as a list in the yaml file"
        if self.model_type != "clustering":
            assert len(
                self.target) > 0, "please provide at least a target to predict"

        try:
            read_data_options = self.dataset_props.get('read_data_options',
                                                       None)
            dataset = pd.read_csv(
                self.data_path) if not read_data_options else pd.read_csv(
                    self.data_path, **read_data_options)
            logger.info(f"dataset shape: {dataset.shape}")
            attributes = list(dataset.columns)
            logger.info(f"dataset attributes: {attributes}")

            # handle missing values in the dataset
            preprocess_props = self.dataset_props.get('preprocess', None)
            if preprocess_props:
                # handle encoding
                encoding = preprocess_props.get('encoding')
                if encoding:
                    encoding_type = encoding.get('type', None)
                    column = encoding.get('column', None)
                    if column in attributes:
                        dataset, classes_map = encode(
                            df=dataset,
                            encoding_type=encoding_type.lower(),
                            column=column)
                        if classes_map:
                            self.dataset_props[
                                'label_encoding_classes'] = classes_map
                            logger.info(
                                f"adding classes_map to dataset props: \n{classes_map}"
                            )
                        logger.info(
                            f"shape of the dataset after encoding => {dataset.shape}"
                        )

                # preprocessing strategy: mean, median, mode etc..
                strategy = preprocess_props.get('missing_values')
                if strategy:
                    dataset = handle_missing_values(dataset, strategy=strategy)
                    logger.info(
                        f"shape of the dataset after handling missing values => {dataset.shape}"
                    )

            if target == 'predict' or target == 'fit_cluster':
                x = _reshape(dataset.to_numpy())
                if not preprocess_props:
                    return x
                scaling_props = preprocess_props.get('scale', None)
                if not scaling_props:
                    return x
                else:
                    scaling_method = scaling_props.get('method', None)
                    return normalize(x, method=scaling_method)

            if any(col not in attributes for col in self.target):
                raise Exception(
                    "chosen target(s) to predict must exist in the dataset")

            y = pd.concat([dataset.pop(x) for x in self.target], axis=1)
            x = _reshape(dataset.to_numpy())
            y = _reshape(y.to_numpy())
            logger.info(f"y shape: {y.shape} and x shape: {x.shape}")

            # handle data scaling
            if preprocess_props:
                scaling_props = preprocess_props.get('scale', None)
                if scaling_props:
                    scaling_method = scaling_props.get('method', None)
                    scaling_target = scaling_props.get('target', None)
                    if scaling_target == 'all':
                        x = normalize(x, method=scaling_method)
                        y = normalize(y, method=scaling_method)
                    elif scaling_target == 'inputs':
                        x = normalize(x, method=scaling_method)
                    elif scaling_target == 'outputs':
                        y = normalize(y, method=scaling_method)

            if target == 'evaluate':
                return x, y

            # split into train/test portions only if a split configuration is
            # provided; otherwise return the full data as the training set
            split_options = self.dataset_props.get('split', None)
            if not split_options:
                return x, y, None, None
            test_size = split_options.get('test_size')
            shuffle = split_options.get('shuffle')
            stratify = split_options.get('stratify')
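            # note: sklearn's train_test_split expects `stratify` to be
            # array-like (e.g. the labels); "default" or an empty value
            # disables stratification here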
            x_train, x_test, y_train, y_test = train_test_split(
                x,
                y,
                test_size=test_size,
                shuffle=shuffle,
                stratify=None
                if not stratify or stratify.lower() == "default" else stratify)

            return x_train, y_train, x_test, y_test

        except Exception as e:
            logger.exception(
                f"error occurred while preparing the data: {e.args}")

    def _prepare_clustering_data(self):
        """
        preprocess data for the clustering algorithm
        """
        return self._process_data(target='fit_cluster')

    def _prepare_predict_data(self):
        """
        preprocess predict data to get similar data to the one used when training the model
        """
        return self._process_data(target='predict')

    def get_evaluation(self, model, x_test, y_true, y_pred, **kwargs):
        res = None
        try:
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 get_score_only=False,
                                 **kwargs)
        except Exception as e:
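            # fall back to a score-only evaluation if the detailed report
            # cannot be computed
            logger.warning(f"detailed evaluation failed ({e}); "
                           f"falling back to score only")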
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 get_score_only=True,
                                 **kwargs)
        return res

    def fit(self, **kwargs):
        """
        fit a machine learning model and save it to a file along with a description.json file
        @return: None
        """
        x_train = None
        x_test = None
        y_train = None
        y_test = None
        cv_results = None
        eval_results = None
        cv_params = None
        if self.model_type == 'clustering':
            x_train = self._prepare_clustering_data()
        else:
            x_train, y_train, x_test, y_test = self._prepare_fit_data()
        self.model, model_args = self._create_model(**kwargs)
        logger.info(
            f"executing a {self.model.__class__.__name__} algorithm...")

        # convert to multioutput if there is more than one target to predict:
        if self.model_type != 'clustering' and len(self.target) > 1:
            logger.info(
                "multiple targets detected; the model will be automatically "
                "wrapped in a multioutput meta-estimator")
            self.model = MultiOutputClassifier(self.model) \
                if self.model_type == 'classification' else MultiOutputRegressor(self.model)

        if self.model_type != 'clustering':
            cv_params = self.model_props.get('cross_validate', None)
            if not cv_params:
                logger.info(f"cross validation is not provided")
            else:
                cv_results = cross_validate(estimator=self.model,
                                            X=x_train,
                                            y=y_train,
                                            **cv_params)
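            # cross_validate fits internal clones only, so the model still
            # needs a final fit on the full training data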
            self.model.fit(x_train, y_train)
        else:
            self.model.fit(x_train)

        saved = self._save_model(self.model)
        if saved:
            logger.info(
                f"model saved successfully and can be found in the {self.results_path} folder"
            )

        if self.model_type == 'clustering':
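            # for KMeans-like models, score() returns the negative inertia
            # on the given data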
            eval_results = self.model.score(x_train)
        else:
            if x_test is None:
                logger.info(
                    "no split options were provided; the training score will "
                    "be calculated instead")
                eval_results = self.model.score(x_train, y_train)

            else:
                logger.info(
                    "split option detected. The performance will be "
                    "automatically evaluated using the test data portion")
                y_pred = self.model.predict(x_test)
                eval_results = self.get_evaluation(model=self.model,
                                                   x_test=x_test,
                                                   y_true=y_test,
                                                   y_pred=y_pred,
                                                   **kwargs)

        fit_description = {
            "model": self.model.__class__.__name__,
            "arguments": model_args if model_args else "default",
            "type": self.model_props['type'],
            "algorithm": self.model_props['algorithm'],
            "dataset_props": self.dataset_props,
            "model_props": self.model_props,
            "data_path": self.data_path,
            "train_data_shape": x_train.shape,
            "test_data_shape": None if x_test is None else x_test.shape,
            "train_data_size": x_train.shape[0],
            "test_data_size": None if x_test is None else x_test.shape[0],
            "results_path": str(self.results_path),
            "model_path": str(self.default_model_path),
            "target": None if self.model_type == 'clustering' else self.target,
            "results_on_test_data": eval_results
        }
        if self.model_type == 'clustering':
            # convert numpy arrays to lists so the description stays JSON
            # serializable
            clustering_res = {
                "cluster_centers": self.model.cluster_centers_.tolist(),
                "cluster_labels": self.model.labels_.tolist()
            }
            fit_description['clustering_results'] = clustering_res

        if cv_params:
            cv_res = {
                "fit_time": cv_results['fit_time'].tolist(),
                "score_time": cv_results['score_time'].tolist(),
                "test_score": cv_results['test_score'].tolist()
            }
            fit_description['cross_validation_params'] = cv_params
            fit_description['cross_validation_results'] = cv_res

        try:
            logger.info(f"saving fit description to {self.description_file}")
            with open(self.description_file, 'w', encoding='utf-8') as f:
                json.dump(fit_description, f, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(
                f"Error while storing the fit description file: {e}")

    def evaluate(self, **kwargs):
        """
        evaluate a pre-fitted model and save the results to an evaluation.json file
        @return: None
        """
        x_val = None
        y_true = None
        eval_results = None
        try:
            model = self._load_model()
            if self.model_type != 'clustering':
                x_val, y_true = self._prepare_eval_data()
                y_pred = model.predict(x_val)
                eval_results = self.get_evaluation(model=model,
                                                   x_test=x_val,
                                                   y_true=y_true,
                                                   y_pred=y_pred,
                                                   **kwargs)
            else:
                x_val = self._prepare_clustering_data()
                # clustering scorers such as KMeans.score ignore the y
                # argument, so score on the data alone
                eval_results = model.score(x_val)

            logger.info(f"saving fit description to {self.evaluation_file}")
            with open(self.evaluation_file, 'w', encoding='utf-8') as f:
                json.dump(eval_results, f, ensure_ascii=False, indent=4)

        except Exception as e:
            logger.exception(f"error occured during evaluation: {e}")

    def predict(self):
        """
        use a pre-fitted model to make predictions and save them as csv
        @return: None
        """
        try:
            model = self._load_model(f=self.model_path)
            # the same preparation path is used for clustering predictions
            x_val = self._prepare_predict_data()
            y_pred = model.predict(x_val)
            y_pred = _reshape(y_pred)
            logger.info(
                f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}"
            )
            logger.info(f"predict on targets: {self.target}")
            df_pred = pd.DataFrame({
                target: y_pred[:, i] if y_pred.ndim > 1 else y_pred
                for i, target in enumerate(self.target)
            })

            logger.info(f"saving the predictions to {self.prediction_file}")
            df_pred.to_csv(self.prediction_file)

        except Exception as e:
            logger.exception(f"Error while preparing predictions: {e}")

    @staticmethod
    def create_init_mock_file(model_type=None,
                              model_name=None,
                              target=None,
                              *args,
                              **kwargs):
        path = configs.get('init_file_path', None)
        if not path:
            raise Exception("You need to provide a path for the init file")

        dataset_props = Igel.default_dataset_props
        model_props = Igel.default_model_props
        if model_type:
            logger.info(f"user selected model type = {model_type}")
            model_props['type'] = model_type
        if model_name:
            logger.info(f"user selected algorithm = {model_name}")
            model_props['algorithm'] = model_name

        logger.info(f"initalizing a default igel.yaml in {path}")
        default_data = {
            "dataset": dataset_props,
            "model": model_props,
            "target":
            target.split() if target else ['provide your target(s) here']
        }
        created = create_yaml(default_data, path)
        if created:
            logger.info(
                f"a default igel.yaml was created for you in {path}. "
                "You just need to overwrite the values to meet your needs")
        else:
            logger.warning(
                "something went wrong while initializing the default file")
Example #36
0
            train_interval[i], test_interval[i])
        x_train[i], y_train[i] = x_train_ud, y_train_ud
        x_test[i], y_test[i] = x_test_ud, y_test_ud

    print('[data helper - ud] costs: ' + str(time.time() - t_start) + ' secs')

    t_start = time.time()

    highest_acc = 0
    for i in range(1, 100):
        acc = 0  # reset the running accuracy for each candidate nu value
        for l in range(len(x_train)):
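            # nu is an upper bound on the fraction of margin errors and a
            # lower bound on the fraction of support vectors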
            nusvc = NuSVC(nu=float(i) / 100.0)
            multi_target_nusvc = MultiOutputClassifier(nusvc, n_jobs=-1)
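            # the multioutput wrapper fits one independent NuSVC per output
            # column, in parallel with n_jobs=-1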
            multi_target_nusvc.fit(x_train[l], y_train[l])
            pred_ud = multi_target_nusvc.predict(x_test[l])

            class_names = ['down', 'balance', 'up']
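            # class_names is presumably consumed by confusion-matrix plotting
            # code outside this excerpt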
            # Compute confusion matrix
            cnf_matrix = confusion_matrix(y_test[l].flatten(),
                                          pred_ud.flatten())
            np.set_printoptions(precision=2)
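            # trace / total of the confusion matrix gives the overall accuracy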
            acc += np.trace(cnf_matrix, dtype='float32') / np.sum(
                cnf_matrix, dtype='float32')

        acc /= len(x_train)
        if acc >= highest_acc:
            highest_acc = acc
            print('acc->' + str(acc))
            print('nu->' + str(float(i) / 100.0))