def _fit(self, X, y):
    """
    Train the configured pipeline on the given examples.

    :param X: Array, the document features
    :param y: Array, the categories that belong to the documents
    :return pipeline: the value returned by ``self.pipeline.fit`` (the
        fitted pipeline)
    """
    logging.info('Transforming and fitting pipeline...')
    fitted_pipeline = self.pipeline.fit(X, y)
    return fitted_pipeline
def vectorize(self, dataset_features) -> list:
    """
    Turn the per-example feature dictionaries into a feature matrix.

    The work is delegated to ``self.vectorizer.fit_transform``, so the
    vectorizer is (re)fitted on every call.

    :param dataset_features: List of dictionaries holding the
        feature:value pairs of each example
    :return Array: A 2 dimensional array, exactly as produced by the
        vectorizer
    """
    logging.info('Vectorizing examples...')
    return self.vectorizer.fit_transform(dataset_features)
def _over_sample(self, X, y) -> list:
    """
    Perform the oversampling technique on the provided documents.

    :param X: Array, the document features. May be a sparse matrix
        (anything exposing ``toarray()``) or an already dense array-like
    :param y: Array, the list of categories that belong to the documents
    :return Array: The oversampled features and categories, in that order
    """
    logging.info('Over sampling...')
    sampler = self.over_sampler

    # Presumably an imbalanced-learn sampler: newer releases expose the
    # operation as ``fit_resample`` while older ones only had
    # ``fit_sample``. Prefer the new name and fall back to the legacy one
    # so both generations of samplers keep working.
    resample = getattr(sampler, 'fit_resample', None)
    if resample is None:
        resample = sampler.fit_sample

    # Densify sparse input; accept dense input as-is instead of failing on
    # the missing ``toarray`` attribute.
    dense_X = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    X_over_sample, y_over_sample = resample(dense_X, np.array(y))
    return X_over_sample, y_over_sample
def _report(self, X_test, y_test) -> None:
    """
    Show an accuracy report, but only when a reporter was provided.

    Predictions for ``X_test`` are obtained from ``self.pipeline`` and
    handed to the reporter together with the expected categories.

    :param X_test: Array, the test document features
    :param y_test: Array, the list of test categories that belong to the
        test documents
    """
    # Nothing to do without a reporter.
    if not self.reporter:
        return

    logging.info('Reporting...')
    predictions = self.pipeline.predict(X_test)
    self.reporter.show_accuracy_score(y_test, predictions)
    self.reporter.show_precision_recall_and_f1_score(y_test, predictions)
    self.reporter.show_confusion_matrix(y_test, predictions)
def show_confusion_matrix(self, y_test, y_predict):
    """
    Log the confusion matrix of the predictions, one line per category.

    The categories are the distinct values of ``y_test`` in sorted order.
    The first logged line is a title, the second lists the categories as
    column headers, and every following line holds one category followed
    by its row of counts. Any number of categories is supported.

    :param y_test: Array, the expected categories
    :param y_predict: Array, the predicted categories
    """
    labels = sorted(set(y_test))
    cm = self.confusion_matrix(y_test, y_predict, labels=labels)

    logging.info(' Confusion Matrix')
    logging.info(' ' + ' '.join(str(label) for label in labels))
    # One row per label instead of a hard-coded 2x2 layout, so single-class
    # and multi-class inputs no longer fail or get truncated.
    for label, row in zip(labels, cm):
        counts = ' '.join(str(count) for count in row)
        logging.info(' ' + str(label) + ' ' + counts)
def split_train_test_dataset(self, X, y, train_size, test_size,
                             random_state=config.DEFAULT_RANDOM_STATE) -> list:
    """
    Divide the dataset into a training part and a test part.

    The actual split is performed by ``self.dataset_splitter``, which must
    return exactly four values.

    :param X: Array of examples
    :param y: Array of the categories for each example
    :param train_size: Training size portion desired to split the X dataset
    :param test_size: Test size portion desired to split the X dataset
    :param random_state: Optional random state to initialize the dataset
        splitter
    :return Array: training examples, test examples, training categories,
        testing categories, in that order
    """
    logging.info('Splitting dataset...')
    split_result = self.dataset_splitter(
        X,
        y,
        train_size=train_size,
        test_size=test_size,
        random_state=random_state,
    )
    # Unpack explicitly so a splitter returning the wrong number of parts
    # fails here rather than at the caller.
    train_examples, test_examples, train_labels, test_labels = split_result
    return train_examples, test_examples, train_labels, test_labels
def get_features_and_labels(self) -> list:
    """
    Build the feature dictionaries and the category labels of the dataset.

    Every document example obtained from ``self._read_documents_examples``
    contributes one dictionary that maps each of its tokens to ``1.0`` and
    one label taken from ``get_document_type()``.

    :return Array: First position the list of feature dictionaries (one per
        example), second position the list of categories associated to
        those examples
    """
    logging.info('Getting features from dataset...')
    features_per_example = []
    label_per_example = []
    for example in self._read_documents_examples():
        example_tokens = example.get_document_example_tokens_array()
        # The value carries no weight: 1.0 just indicates that the feature
        # is present.
        features_per_example.append(dict.fromkeys(example_tokens, 1.0))
        label_per_example.append(example.get_document_type())
    return [features_per_example, label_per_example]
def show_precision_recall_and_f1_score(self, y_test, y_predict):
    """
    Log the report produced by ``self.classification_report``.

    :param y_test: Array, the expected categories
    :param y_predict: Array, the predicted categories
    """
    report = self.classification_report(y_test, y_predict)
    logging.info(report)
def show_accuracy_score(self, y_test, y_predict):
    """
    Log the accuracy computed by ``self.accuracy_score``.

    :param y_test: Array, the expected categories
    :param y_predict: Array, the predicted categories
    """
    message = "Accuracy score: {}".format(
        self.accuracy_score(y_test, y_predict))
    logging.info(message)