Example #1
    def _fit(self, X, y):
        """
        Fit the provided Pipeline

        :param X: Array, The document features
        :param y: Array, The list of categories that belong to the documents
        :return pipeline: the fitted pipeline
        """
        logging.info('Transforming and fitting pipeline...')
        return self.pipeline.fit(X, y)
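For context, a minimal sketch of how a pipeline like this might be assembled and handed to _fit; the TfidfTransformer/LinearSVC combination and the sample data are assumptions, not taken from the project:

    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.svm import LinearSVC

    # Hypothetical two-step pipeline; the real steps live in self.pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfTransformer()),
        ('classifier', LinearSVC()),
    ])

    X = [[1.0, 0.0, 1.0], [0.0, 1.0, 0.0]]   # already-vectorized documents (see Example #2)
    y = ['invoice', 'letter']
    pipeline.fit(X, y)   # what self.pipeline.fit(X, y) does inside _fit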
Example #2
    def vectorize(self, dataset_features) -> list:
        """
        Vectorize each key:value dict within the dataset_features

        :param dataset_features: List of dictionaries that contain the feature:value pairs for each token
        :return Array: A 2-dimensional array
        """
        logging.info('Vectorizing examples...')
        X = self.vectorizer.fit_transform(dataset_features)

        return X
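Assuming self.vectorizer is something like scikit-learn's DictVectorizer (its input format matches the feature:value dicts built in Example #7), the call behaves roughly like this:

    from sklearn.feature_extraction import DictVectorizer

    dataset_features = [
        {'hello': 1.0, 'world': 1.0},
        {'hello': 1.0, 'invoice': 1.0},
    ]

    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(dataset_features)   # sparse matrix of shape (2, 3)
    print(vectorizer.get_feature_names_out())        # ['hello' 'invoice' 'world'] (scikit-learn >= 1.0)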
Example #3
    def _over_sample(self, X, y) -> list:
        """
        Apply an oversampling technique to the provided documents

        :param X: Array, The document features array
        :param y: Array, The list of categories that belong to the documents
        :return Array: The oversampled features and categories, in that order
        """
        logging.info('Over sampling...')
        X_over_sample, y_over_sample = self.over_sampler.fit_sample(
            X.toarray(), np.array(y))

        return X_over_sample, y_over_sample
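fit_sample is the old imbalanced-learn spelling; recent releases expose the same behaviour as fit_resample. A minimal sketch with RandomOverSampler (the concrete sampler class used by the project is an assumption):

    import numpy as np
    from imblearn.over_sampling import RandomOverSampler

    X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
    y = np.array(['invoice', 'invoice', 'letter'])

    over_sampler = RandomOverSampler(random_state=42)
    X_over_sample, y_over_sample = over_sampler.fit_resample(X, y)

    # The minority class is duplicated until both classes are balanced
    print(X_over_sample.shape)                           # (4, 2)
    print(np.unique(y_over_sample, return_counts=True))  # both classes now have 2 samples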
Example #4
    def _report(self, X_test, y_test) -> None:
        """
        Show an accuracy report in case a Reporter was provided

        :param X_test: Array, The test document features
        :param y_test: Array, The list of test categories that belong to the test documents
        """
        if self.reporter:
            logging.info('Reporting...')
            y_predict = self.pipeline.predict(X_test)

            self.reporter.show_accuracy_score(y_test, y_predict)
            self.reporter.show_precision_recall_and_f1_score(y_test, y_predict)
            self.reporter.show_confusion_matrix(y_test, y_predict)
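In rough terms, _report predicts on the held-out fold and hands the result to the three reporter methods shown in Examples #5, #8 and #9. A stand-in using the scikit-learn metrics directly, with made-up labels in place of pipeline.predict(X_test):

    from sklearn.metrics import accuracy_score, classification_report

    y_test    = ['invoice', 'letter', 'invoice', 'letter']
    y_predict = ['invoice', 'invoice', 'invoice', 'letter']   # pretend pipeline output

    print(accuracy_score(y_test, y_predict))      # 0.75
    print(classification_report(y_test, y_predict))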
Example #5
    def show_confusion_matrix(self, y_test, y_predict):
        # Assumes a binary problem: exactly two distinct labels appear in y_test
        labels = sorted(list(set(y_test)))
        cm = self.confusion_matrix(y_test, y_predict, labels=labels)

        logging.info('  Confusion Matrix')
        logging.info('       ' + str(labels[0]) + ' ' + str(labels[1]))
        logging.info('  ' + str(labels[0]) + '   ' + str(cm[0][0]) + ' ' +
                     str(cm[0][1]))
        logging.info('  ' + str(labels[1]) + '   ' + str(cm[1][0]) + ' ' +
                     str(cm[1][1]))
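A small usage sketch, assuming self.confusion_matrix is sklearn.metrics.confusion_matrix and the label set is binary (the method indexes only labels[0] and labels[1]):

    from sklearn.metrics import confusion_matrix

    y_test    = [0, 0, 1, 1, 1]
    y_predict = [0, 1, 1, 1, 0]

    labels = sorted(set(y_test))                       # [0, 1]
    cm = confusion_matrix(y_test, y_predict, labels=labels)
    # cm[i][j] counts samples with true label labels[i] predicted as labels[j]:
    # [[1 1]
    #  [1 2]]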
Example #6
    def split_train_test_dataset(self, X, y, train_size, test_size, random_state=config.DEFAULT_RANDOM_STATE) -> list:
        """
        Split the dataset into training and test data

        :param X: Array of examples
        :param y: Array of the categories for each example
        :param train_size: Training size portion desired to split the X dataset
        :param test_size: Test size portion desired to split the X dataset
        :param random_state: Optional random state to initialize the dataset splitter
        :return Array: The array with: training examples, test examples,
                       training categories, test categories, in that order
        """
        logging.info('Splitting dataset...')
        X_train, X_test, y_train, y_test = self.dataset_splitter(X, y,
            train_size=train_size,
            test_size=test_size,
            random_state=random_state)

        return X_train, X_test, y_train, y_test
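Assuming self.dataset_splitter is scikit-learn's train_test_split (the keyword arguments line up with its signature), a minimal sketch:

    from sklearn.model_selection import train_test_split

    X = [[i] for i in range(10)]
    y = [i % 2 for i in range(10)]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=0.8,
        test_size=0.2,
        random_state=42)

    print(len(X_train), len(X_test))   # 8 2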
Example #7
    def get_features_and_labels(self) -> list:
        """
        Get the dataset features and the category labels by examples

        :return Array: First position the list of feature dicts and the second
                       an array with the categories associated with the examples
        """
        logging.info('Getting features from dataset...')
        document_examples = self._read_documents_examples()
        dataset_features = []
        labels_by_document_example = []
        for document in document_examples:
            features_name_and_score = {}
            tokens = document.get_document_example_tokens_array()

            for token in tokens:
                # This just indicates that the feature is present
                features_name_and_score[token] = 1.0

            dataset_features.append(features_name_and_score)
            labels_by_document_example.append(document.get_document_type())

        return [dataset_features, labels_by_document_example]
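For two hypothetical documents, the structures returned here would look like this; they feed straight into vectorize() from Example #2:

    # Every token of a document becomes a binary "feature is present" entry
    dataset_features = [
        {'total': 1.0, 'amount': 1.0, 'due': 1.0},   # tokens of an invoice-like document
        {'dear': 1.0, 'regards': 1.0},               # tokens of a letter-like document
    ]
    labels_by_document_example = ['invoice', 'letter']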
Example #8
    def show_precision_recall_and_f1_score(self, y_test, y_predict):
        logging.info(self.classification_report(y_test, y_predict))
Example #9
    def show_accuracy_score(self, y_test, y_predict):
        accuracy = self.accuracy_score(y_test, y_predict)
        logging.info("Accuracy score: {}".format(accuracy))