    def kfold_validate(self, splits, repeat):
        """
        Returns predictions from a repeated k-fold split
        Used for statistical tests
        """
        model_names = ModelFactory.get_models_list()
        result = pd.DataFrame(columns=["actual"] + model_names)
        for _ in range(repeat):
            kFold = KFold(n_splits=splits, shuffle=True, random_state=None)
            for train_index, test_index in kFold.split(self.data):
                train_data = self.data.iloc[train_index]
                test_data = self.data.iloc[test_index]
                y_train = train_data['non-information']
                y_test = test_data['non-information']
                features_test = self.extract_features(test_data)
                features_train = self.extract_features(train_data)
                features_test = self.combine_features(features_test,
                                                      comments_only=True)
                features_train = self.combine_features(features_train,
                                                       comments_only=True)

                data = {'actual': y_test.tolist()}
                for model_name in model_names:
                    y_pred = self.execute_model_data(model_name,
                                                     features_train, y_train,
                                                     features_test)
                    data[model_name] = y_pred.tolist()
                df = pd.DataFrame(data=data)
                result = pd.concat([result, df], ignore_index=True)
        return result

    def compare_models(self, features_train, features_test, y_train, y_test):
        """
        Trains and evaluates models of all types implemented in this project and prints their scores
        """
        x_train = features_train
        model_names = ModelFactory.get_models_list()
        score_df = pd.DataFrame(
            columns=['name', 'accuracy', 'precision', 'recall', 'f1'])
        if self.imbalance_sampling:
            x_train, y_train = ImbalanceSampling.get_sampled_data(
                self.imbalance_sampling, x_train, y_train)

        for name in model_names:
            # train an un-optimised model of each type and score it on the test set
            model = ModelFactory.get_model(name, optimised=False)
            model.fit_model(x_train, y_train)
            y_pred = model.predict(features_test)
            score = ScoreMetrics.get_scores(name, y_test, y_pred)
            print('-------')
            print(name)
            ScoreMetrics.print_scores(y_test, y_pred)
            score_df = pd.concat([score_df, score], ignore_index=True)
        return score_df
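
A minimal usage sketch for compare_models on a single hold-out split (not part of the original source). It assumes ModelExec loads its dataset into .data on construction, as kfold_validate above implies, and reuses extract_features / combine_features in the same way:

from sklearn.model_selection import train_test_split
from models.model_exec import ModelExec

model_exec = ModelExec(include_comments=False, include_long_code=True)
# 80/20 hold-out split of the loaded dataset (assumed to live in .data)
train_data, test_data = train_test_split(model_exec.data, test_size=0.2)

# build the same comment-only feature matrices that kfold_validate uses
features_train = model_exec.combine_features(
    model_exec.extract_features(train_data), comments_only=True)
features_test = model_exec.combine_features(
    model_exec.extract_features(test_data), comments_only=True)

scores = model_exec.compare_models(features_train, features_test,
                                   train_data['non-information'],
                                   test_data['non-information'])
print(scores)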
Example #3
import pandas as pd
from mlxtend.evaluate import paired_ttest_5x2cv
from statsmodels.stats.contingency_tables import mcnemar, cochrans_q

from models.model_exec import ModelExec
from models.model_factory import ModelFactory

model_exec = ModelExec(include_comments=False, include_long_code=True)

# Runs McNemar's test to check whether the predictions from repeated k-fold
# validation differ significantly between each pair of models

model_names = ModelFactory.get_models_list()
result = model_exec.kfold_validate(2, 10)  # 2 folds, repeated 10 times

data = []


for name1 in model_names:
    df = pd.DataFrame()
    for name2 in model_names:
        if name1 != name2:
            yes_yes = 0
            no_no = 0
            yes_no = 0
            no_yes = 0
            for index, row in result.iterrows():
                r1 = row[name1]
                r2 = row[name2]
                actual = row['actual']
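                # The original example is truncated here; the lines below are a
                # hedged completion sketch, not the author's code. They tally how
                # often each model is correct against the actual label and feed
                # the 2x2 table to the mcnemar test imported above.
                if r1 == actual and r2 == actual:
                    yes_yes += 1      # both models correct
                elif r1 == actual:
                    yes_no += 1       # only name1 correct
                elif r2 == actual:
                    no_yes += 1       # only name2 correct
                else:
                    no_no += 1        # both models wrong
            table = [[yes_yes, yes_no],
                     [no_yes, no_no]]
            test_result = mcnemar(table, exact=False, correction=True)
            # collecting the pairwise results into `data` is an assumption about
            # how the unseen remainder of the script aggregates them
            data.append({'model_1': name1, 'model_2': name2,
                         'statistic': test_result.statistic,
                         'pvalue': test_result.pvalue})
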
    def kfold_split(self, folds_split, w1, w2, data):
        """
        Executes k-fold procedure on models of all types and prints final averaged results for all models.
        """
        kFold = KFold(n_splits=folds_split, shuffle=True, random_state=None)
        model_list = ModelFactory.get_models_list()
        model_list.append('ensemble')
        # one row per model; each metric column holds a list that accumulates
        # the per-fold scores to be averaged by print_kfold_results
        vals = [[name, [], [], [], [], [], [], []] for name in model_list]
        result = pd.DataFrame(columns=[
            'name', 'accuracy', 'precision', 'recall', 'f1',
            'matthews_corrcoef', 'balanced_accuracy', 'confusion_matrix'
        ],
                              data=vals)

        for train_index, test_index in kFold.split(data):
            train_data = data.iloc[train_index]
            test_data = data.iloc[test_index]
            features_test = self.extract_features(test_data)
            features_train = self.extract_features(train_data)
            # element 4 of the extracted features holds the comment data,
            # which is vectorised separately from the other features
            comment_vectorised = self.vectorise_comment_data(
                features_train[4], features_test[4])
            comments_train = comment_vectorised[0]
            comments_test = comment_vectorised[1]
            features_test = self.combine_features(features_test,
                                                  comments_only=False)
            features_train = self.combine_features(features_train,
                                                   comments_only=False)
            y_train = train_data['non-information']
            y_test = test_data['non-information']
            ensemble_result = self.ensemble_model(features_train,
                                                  features_test,
                                                  comments_train,
                                                  comments_test, y_train,
                                                  y_test, w1, w2)
            # base models are compared on the vectorised comment features only
            res = self.compare_models(comments_train, comments_test,
                                      train_data['non-information'],
                                      test_data['non-information'])
            # accumulate this fold's per-model scores into the metric lists
            counter = 0
            for index, row in res.iterrows():
                result.iloc[counter]['accuracy'].append(row['accuracy'])
                result.iloc[counter]['precision'].append(row['precision'])
                result.iloc[counter]['recall'].append(row['recall'])
                result.iloc[counter]['f1'].append(row['f1'])
                result.iloc[counter]['matthews_corrcoef'].append(
                    row['matthews_corrcoef'])
                result.iloc[counter]['balanced_accuracy'].append(
                    row['balanced_accuracy'])
                result.iloc[counter]['confusion_matrix'].append(
                    row['confusion_matrix'])
                counter += 1
            # after the loop, counter points at the 'ensemble' row
            result.iloc[counter]['accuracy'].append(
                ensemble_result['accuracy'][0])
            result.iloc[counter]['precision'].append(
                ensemble_result['precision'][0])
            result.iloc[counter]['recall'].append(ensemble_result['recall'][0])
            result.iloc[counter]['f1'].append(ensemble_result['f1'][0])
            result.iloc[counter]['matthews_corrcoef'].append(
                ensemble_result['matthews_corrcoef'][0])
            result.iloc[counter]['balanced_accuracy'].append(
                ensemble_result['balanced_accuracy'][0])
            result.iloc[counter]['confusion_matrix'].append(
                ensemble_result['confusion_matrix'][0])

        print('K fold')
        self.print_kfold_results(result)
        return result
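
A minimal usage sketch for kfold_split (not part of the original source). It assumes the dataset lives in .data after construction, and that w1 and w2 are the two weights forwarded to ensemble_model:

from models.model_exec import ModelExec

model_exec = ModelExec(include_comments=False, include_long_code=True)
# 10 folds; 0.5 / 0.5 are assumed ensemble weights passed through as w1 and w2
kfold_result = model_exec.kfold_split(10, 0.5, 0.5, model_exec.data)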