Example #1
    def error_analysis(self):
        """
        Method for printing incorrect classifications for manual error inspection.
        """
        # Error Analysis is only possible in train or eval mode
        if self.mode not in ['train', 'eval']:
            print(
                "ERROR: Error analysis is only possible in train and eval mode."
            )
            sys.exit(1)

        # Loop through classes and print incorrect classifications
        for clf_class in self.classes:
            print(
                "Incorrect classifications for class {}:\n".format(clf_class))

            # Load training data for respective class to extract the test set
            data = get_training_set(self.train_path,
                                    self.Fe,
                                    label=clf_class,
                                    original_labels=True)
            X = data.iloc[:, :-1]
            y = np.ravel(data.iloc[:, -1])
            # Get the correct train-test split
            _, X_test, _, _ = train_test_split(
                X,
                y,
                test_size=self.split,
                random_state=self.random_state,
                stratify=y)

            # Get the original labels and text
            y_orig_labels_test = X_test.iloc[:, -2:-1]
            y_orig_text_test = X_test.iloc[:, -1:]
            X_test = X_test.iloc[:, :-2]

            clf = self.clfs[clf_class]
            # Get the predictions
            y_pred = clf.predict(X_test)

            for orig, text, pred in zip(y_orig_labels_test.values,
                                        y_orig_text_test.values, y_pred):
                orig = orig[0]
                text = text[0]
                orig_split = orig.split(",")

                # Report instances where the binary prediction disagrees with
                # whether any original label belongs to clf_class
                if bool(pred) != any(
                        label.startswith(clf_class) for label in orig_split):
                    print("Text: {}, original label: {}, predicted label: {}".
                          format(text, orig, pred))

            print("\n")
Example #2
    def evaluate(self, span_detection=False, save=False):
        """
        Method for evaluating the classifiers on the held out test sets.

        :param span_detection: if True, apply the postprocessing method for better span detection.
        :param save: if True, the classifiers trained for evaluation are saved for further inspection.
        """
        # Evaluation is only possible in train or eval mode
        if self.mode not in ['train', 'eval']:
            print("ERROR: Evaluation is only possible in train and eval mode.")
            sys.exit(1)

        majority_classes = {
            'direct': None,
            'indirect': None,
            'free_indirect': None,
            'reported': None
        }

        # Loop through classes and evaluate
        for clf_class in self.classes:

            # Load training data for respective class to extract the train and test set,
            # as well as the original labels and texts to compute the accuracy on word level
            data = get_training_set(self.train_path,
                                    self.Fe,
                                    label=clf_class,
                                    original_labels=True)
            X = data.iloc[:, :-1]
            y = np.ravel(data.iloc[:, -1])
            # Get the correct train-test split
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=self.split,
                random_state=self.random_state,
                stratify=y)

            # Get the original labels and text for span evaluation after the split
            y_orig_train = X_train.iloc[:, -2:]
            X_train = X_train.iloc[:, :-2]
            y_orig_test = X_test.iloc[:, -2:]
            X_test = X_test.iloc[:, :-2]

            # For SVM the features should be scaled for efficiency reasons
            if self.model == "svm":
                scaling = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
                X_train = scaling.transform(X_train)
                X_train = pd.DataFrame(X_train)
                X_test = scaling.transform(X_test)
                X_test = pd.DataFrame(X_test)

            # Use a wrapper for training the classifier, in order to trigger methods for countering class imbalance
            clf = self.models[self.model]['classifier'](
                **self.models[self.model]['parameter'][clf_class])
            clf_wrapped = CLFWrapper(clf)

            print("Training the {} classifier for label {}.\n".format(
                self.model, clf_class))

            # Get method for countering class imbalance if respective parameter is given
            if self.augment_data in ['oversampling', 'SMOTE']:
                augment_method = DATA_AUGMENT[self.augment_data]

            elif self.augment_data == 'augmentation':
                augment_method = DATA_AUGMENT[self.augment_data][clf_class]

            else:
                augment_method = None

            # Fit classifier on training data
            clf_wrapped.fit(X_train, y_train, augment_method=augment_method)

            # Save the classifier if flag indicates this
            if save:
                # Save the trained classifier
                self.clfs[clf_class] = clf
                joblib.dump(clf,
                            'models/{}/{}.clf'.format(self.model, clf_class))

            y_pred = clf.predict(X_test)

            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            print(
                "Classifier for label {} on test set: Precision {}, Recall {}, F1 {}"
                .format(clf_class, precision, recall, f1))

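            # Speech/thought/writing evaluation: take the majority fine-grained
            # type from the gold training labels, annotate every test instance
            # the binary classifier marked positive, and score the annotation
            # against the gold type labels.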
            majority_classes[clf_class] = self.get_max_type(
                y_orig_train.iloc[:, -2:-1], clf_class)
            y_pred_stw = [
                self.annotate_stw(t[0],
                                  clf_class,
                                  majority_classes=majority_classes)
                for ind, t in enumerate(y_orig_test.iloc[:,
                                                         -1:].values.tolist())
                if y_pred[ind]
            ]
            y_true_s, y_true_t, y_true_w = get_labels_stw([
                y[0] for ind, y in enumerate(
                    y_orig_test.iloc[:, -2:-1].values.tolist()) if y_pred[ind]
            ], clf_class)

            for stw_type in ['speech', 'thought', 'writing']:
                if stw_type == 'speech':
                    y_true_stw = y_true_s
                elif stw_type == 'thought':
                    y_true_stw = y_true_t
                else:
                    y_true_stw = y_true_w

                y_pred_type = [int(y == stw_type) for y in y_pred_stw]
                precision = precision_score(y_true_stw, y_pred_type)
                recall = recall_score(y_true_stw, y_pred_type)
                f1 = f1_score(y_true_stw, y_pred_type)
                print(
                    "Classification for label {} on predictions for class {} on test set: Precision {}, Recall {}, F1 {} (Count instances: {})"
                    .format(stw_type, clf_class, precision, recall, f1,
                            sum(y_true_stw)))

            # Evaluate accuracy of span prediction
            y_pred_test = y_orig_test.copy()
            # Get full spans before postprocessing
            y_pred_test.iloc[:, 0] = [
                "{},0,{}".format(clf_class, str(len(y_orig_test.iloc[i, 1])))
                if int(y_hat) == 1 else "" for i, y_hat in enumerate(y_pred)
            ]

            # Do span detection if chosen
            if span_detection:
                y_pred_test.iloc[:, 0] = [
                    postprocess_spans(row, cl=clf_class)
                    for _, row in y_pred_test.iterrows()
                ]

            # Mark gold and predicted labeled words with different signs
            marked_text_gold = [
                mark_labeled_words(segment[1], segment[0], clf_class)
                for segment in y_orig_test.values
            ]
            marked_text_predicted = [
                mark_labeled_words(segment[1], segment[0], clf_class, mark='#')
                for segment in y_pred_test.values
            ]

            num_words_correctly_marked_total = 0
            num_words_incorrectly_marked_total = 0
            num_words_total = 0
            num_words_correctly_marked_correct_labels = 0
            num_words_incorrectly_marked_correct_labels = 0
            num_words_correct_labels = 0

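            # Gold-labeled words are marked with a trailing '$', predicted ones
            # with '#'; compare the two markings token by token.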
            for i, gold_segment in enumerate(marked_text_gold):
                tokens_pred = marked_text_predicted[i].split()

                tokens = gold_segment.split()
                num_words_total += len(tokens)
                len_gold = len(
                    [token for token in tokens if token.endswith('$')])

                # Correctly identified instances
                if len_gold > 0 and int(y_pred[i]) == 1:
                    num_words_correct_labels += len(tokens)

                for j, token in enumerate(tokens_pred):
                    if token.endswith('#'):
                        if tokens[j].endswith('$'):
                            num_words_correctly_marked_total += 1
                            num_words_correctly_marked_correct_labels += 1
                        else:
                            if len_gold > 0 and int(y_pred[i]) == 1:
                                num_words_incorrectly_marked_correct_labels += 1
                            num_words_incorrectly_marked_total += 1

                    else:
                        if tokens[j].endswith('$'):
                            num_words_incorrectly_marked_total += 1
                            if int(y_pred[i]) == 1:
                                num_words_incorrectly_marked_correct_labels += 1
                        else:
                            num_words_correctly_marked_total += 1
                            if len_gold > 0 and int(y_pred[i]) == 1:
                                num_words_correctly_marked_correct_labels += 1

            print(
                "Word-level accuracy all instances: {}% of total words correctly labeled, {}% of total words incorrectly labeled."
                .format(
                    round(
                        (num_words_correctly_marked_total / num_words_total) *
                        100, 2),
                    round(
                        (num_words_incorrectly_marked_total / num_words_total)
                        * 100, 2)))

            # Guard against division by zero when no instance was correctly identified
            if num_words_correct_labels > 0:
                print(
                    "Word-level accuracy within correctly identified instances: {}% of words within labeled instances correctly labeled, {}% of words within labeled instances incorrectly labeled.\n"
                    .format(
                        round((num_words_correctly_marked_correct_labels /
                               num_words_correct_labels) * 100, 2),
                        round((num_words_incorrectly_marked_correct_labels /
                               num_words_correct_labels) * 100, 2)))

        return
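
A minimal call sketch, reusing the hypothetical instance from Example #1 (the instance is an assumption; span_detection and save are the documented parameters):

    # Hypothetical instance from the sketch in Example #1: evaluate all binary
    # classifiers with span postprocessing and keep the models for inspection.
    clf.evaluate(span_detection=True, save=True)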
Example #3
    def train(self, clf_class, cross_val=False, augment_data='oversampling'):
        """
        Train a binary classifier with the given ML technique (model) to be used in classification
        of clf_class instances.

        :param clf_class: label of the positive class instances.
        :param cross_val: if True, print evaluation with stratified 10-fold cross validation.
        :param augment_data: keyword for optional method to counter class imbalance within the training data.
        :return: the trained classifier.
        """
        print("Training the {} classifier for label {}.\n".format(
            self.model, clf_class))

        # Load training data
        data = get_training_set(self.train_path,
                                self.Fe,
                                label=clf_class,
                                original_labels=True)
        X = data.iloc[:, :-1]
        y = np.ravel(data.iloc[:, -1:])

        # Get the original labels for speech, thought, writing classification
        y_orig = X.iloc[:, -2:-1]
        X = X.iloc[:, :-2]

        # For SVM the features should be scaled for efficiency reasons
        if self.model == "svm":
            scaling = MinMaxScaler(feature_range=(-1, 1)).fit(X)
            X = scaling.transform(X)
            X = pd.DataFrame(X)

        # Use a wrapper for training the classifier, in order to trigger methods for countering class imbalance
        clf = self.models[self.model]['classifier'](
            **self.models[self.model]['parameter'][clf_class])
        clf_wrapped = CLFWrapper(clf)

        print("\nTraining...")

        # Get method for countering class imbalance if respective parameter is given
        if augment_data in ['oversampling', 'SMOTE']:
            augment_method = DATA_AUGMENT[augment_data]

        elif augment_data == 'augmentation':
            augment_method = DATA_AUGMENT[augment_data][clf_class]

        else:
            augment_method = None

        if cross_val:
            # For cross validation, use stratified train-test split in order to have a hold-out test-set which is NOT used as a dev set in cross validation
            X_train, _, y_train, _ = train_test_split(
                X,
                y,
                test_size=self.split,
                random_state=self.random_state,
                stratify=y)

            # Stratified 10-fold cross validation, treatment of imbalanced data sets by oversampling, data augmentation etc. is triggered via the fit_params parameter
            recall = cross_val_score(
                clf_wrapped,
                X_train,
                y_train,
                cv=10,
                scoring='recall',
                fit_params={'augment_method': augment_method})
            precision = cross_val_score(
                clf_wrapped,
                X_train,
                y_train,
                cv=10,
                scoring='precision',
                fit_params={'augment_method': augment_method})
            f1 = cross_val_score(clf_wrapped,
                                 X_train,
                                 y_train,
                                 cv=10,
                                 scoring='f1',
                                 fit_params={'augment_method': augment_method})

            # Precision, Recall, F1
            print(
                "Scores on training set with 10-fold cross validation for class {}: Precision {}, Recall {}, F1 {}"
                .format(clf_class, precision.mean(), recall.mean(), f1.mean()))

        # After evaluation fit classifier on all available training data
        clf_wrapped.fit(X, y, augment_method=augment_method)

        # Get the trained classifier
        clf = clf_wrapped.clf

        # Make sure the directory for this model type exists, since the
        # classifier is saved under models/<model>/
        directory = os.path.join(os.getcwd(), "models", self.model)
        if not os.path.exists(directory):
            os.makedirs(directory)

        print("Saving the trained classifier...")
        # Save the trained classifier
        joblib.dump(clf, 'models/{}/{}.clf'.format(self.model, clf_class))

        # Get the majority class (one of speech, thought, writing) from data for clf_class
        self.majority_classes[clf_class] = self.get_max_type(y_orig, clf_class)

        print("Done.\n")

        return clf
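
A minimal call sketch for training a single binary classifier; the instance is hypothetical, while 'direct' and the augment_data keywords are values the code above actually handles:

    # Hypothetical instance; 'direct', cross_val and 'SMOTE' are all handled
    # by the method above.
    trained_clf = clf.train('direct', cross_val=True, augment_data='SMOTE')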