# Example #1
# 0
class LabelClassifier:
    """Ensemble label classifier with an additional stacking stage.

    A hard-voting ``VotingClassifier`` is built from five base estimators
    (multinomial naive Bayes, SGD, sigmoid-kernel SVM, random forest and
    logistic regression).  The output of the ensemble's ``transform`` is then
    passed through an ``RBFSampler`` and used to train a final
    ``SGDClassifier`` (the "stacking" estimator).

    Attributes set by the constructor:
        category: The categories this classifier distinguishes (2 or 3).
        estimators: ``(name, estimator)`` pairs handed to the voting ensemble.
        trainedEstimator: The fitted (or pretrained) voting ensemble, or None.
        fileLocation: Path used to dump/load the voting ensemble.
        stackingEstimator: Final SGD classifier, None until trained.
        rbfKernel: Fitted ``RBFSampler``, None until trained.
    """

    def __init__(self, categoryToClassify: list, pretrained=None):
        """Create the classifier and derive its persistence filename.

        Args:
            categoryToClassify (list): Categories to distinguish; must contain
                two or three entries (enforced by ``generateFilename``).
            pretrained (optional): An already trained voting estimator.
                Defaults to None.

        Raises:
            ValueError: If no categories are given, if their number is not
                2 or 3, or if no save folder is configured.
        """
        if not categoryToClassify:
            raise ValueError("no categories to classify have been provided")
        self.category: list = categoryToClassify
        self.estimators = [
            ('MultinomialNB', MultinomialNB()),
            ('SGDClassifier',
             SGDClassifier(loss='modified_huber',
                           penalty='l2',
                           alpha=1e-3,
                           random_state=100,
                           max_iter=200)),
            ('sigmoidSVM', SVC(kernel='sigmoid', gamma=1.0)),
            ('RandomForest', RandomForestClassifier(200, bootstrap=False)),
            ('LogisticRegression',
             LogisticRegression(solver='sag', random_state=100)),
        ]
        self.trainedEstimator = pretrained
        self.fileLocation: str = self.generateFilename()
        self.stackingEstimator = None
        self.rbfKernel = None

    def trainingClassifier(self, X_train: numpy.ndarray,
                           y_train: numpy.ndarray):
        """Train (or load) the voting ensemble, then train the stacking stage.

        Depending on the ``classifier loadClassifier`` config value the voting
        ensemble is either loaded from ``fileLocation`` or fitted on the given
        data (and optionally dumped when ``classifier saveClassifier`` is
        set).  In both cases the stacking estimator is trained afterwards.

        Args:
            X_train (numpy.ndarray): Training documents.
            y_train (numpy.ndarray): Labels for the training documents.

        Raises:
            ValueError: If ``X_train`` or ``y_train`` is empty.
            RuntimeError: If loading the persisted classifier fails.
        """
        if not X_train.size:
            raise ValueError("No X_train data was provided")
        if not y_train.size:
            raise ValueError("No y_train data was provided")
        logging.info("> training classifier")
        # `== True` is kept on purpose: the type returned by the config helper
        # is not visible here, so plain truthiness could change behaviour.
        if config.getValueFromConfig("classifier loadClassifier") == True:
            try:
                # SECURITY: joblib.load unpickles the file and can execute
                # arbitrary code; only load classifier dumps you trust.
                self.trainedEstimator = joblib.load(self.fileLocation)
                # NOTE(review): `load_classifier` is not defined in this
                # class; presumably a module-level helper - verify.
                voting = load_classifier.getVotingClassifier()
            except Exception as err:
                raise RuntimeError("load voting classifier failed") from err
        else:
            self.trainedEstimator = VotingClassifier(self.estimators,
                                                     voting='hard')
            # Fit the ensemble and keep its transformed training output,
            # which is the input of the stacking stage below.
            voting = self.trainedEstimator.fit_transform(X_train, y_train)
            if config.getValueFromConfig("classifier saveClassifier") == True:
                joblib.dump(self.trainedEstimator,
                            self.fileLocation,
                            compress=9)
                joblib.dump(
                    voting,
                    '../classifier/trained_classifiers/voting_classifier',
                    compress=9)
                logging.info("> dumped Classifier: %s", self.fileLocation)
        self.trainKernelApproxSvgOnVoting(voting, y_train)

    def predict(self, X_test: numpy.ndarray) -> numpy.ndarray:
        """Label documents with the voting ensemble.

        Args:
            X_test (numpy.ndarray): Documents to label.

        Returns:
            numpy.ndarray: Prediction of the trained voting estimator.

        Raises:
            ValueError: If ``X_test`` is empty.
            AssertionError: If the classifier has not been trained yet or the
                estimator returned an empty prediction.
        """
        if not X_test.size:
            raise ValueError("No test documents were provided")
        if self.trainedEstimator is None:
            raise AssertionError("Classifier has not been trained yet")
        logging.info("> predicting")
        prediction = self.trainedEstimator.predict(X_test)
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if not prediction.size:
            raise AssertionError("No documents were predicted")
        return prediction

    def generateFilename(self) -> str:
        """Build the filename under which the voting ensemble is persisted.

        The name is ``<folder>ensembleClassifier_<cat>-<cat>[-<cat>].joblib.pkl``
        where ``<folder>`` is the ``classifier path saveFolder`` config value.

        Returns:
            str: Filename as string.

        Raises:
            ValueError: If no folder is configured or the number of
                categories is not 2 or 3.
        """
        folder = config.getValueFromConfig("classifier path saveFolder")
        if folder is None:
            raise ValueError("No folder name was provided")
        if not 2 <= len(self.category) <= 3:
            raise ValueError("To few or many categories")
        joined = "-".join("{}".format(entry) for entry in self.category)
        return "{}ensembleClassifier_{}.joblib.pkl".format(folder, joined)

    def accuracy(self, X_test: numpy.ndarray, y_test: numpy.ndarray,
                 predicted: numpy.ndarray):
        """Log the ensemble accuracy and plot its confusion matrix.

        Args:
            X_test (numpy.ndarray): The test documents.
            y_test (numpy.ndarray): The true labels of the test documents.
            predicted (numpy.ndarray): The predicted labels.

        Raises:
            ValueError: If any of the three arrays is empty.
            AssertionError: If the classifier was not trained previously.
        """
        if not X_test.size:
            raise ValueError("X_test was empty")
        if not y_test.size:
            raise ValueError("y_test was empty")
        if not predicted.size:
            raise ValueError("predicted was empty")
        if self.trainedEstimator is None:
            raise AssertionError("Classifier has not been trained yet")
        logging.info("\n ->> ensemble-score:%s\n",
                     numpy.mean(predicted == y_test))
        # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
        # in favour of ConfusionMatrixDisplay.from_estimator - confirm the
        # pinned scikit-learn version.
        # All configured categories are passed so that the three-category
        # case gets one label per class as well.
        plot_confusion_matrix(
            self.trainedEstimator,
            X_test,
            y_test,
            normalize="all",
            display_labels=list(self.category))
        plt.show()

    def trainKernelApproxSvgOnVoting(self, X_predicted: numpy.ndarray,
                                     y: numpy.ndarray):
        """Train the stacking estimator on the ensemble's output.

        Fits an ``RBFSampler`` on ``X_predicted``, trains an ``SGDClassifier``
        on the resulting features and logs its score on the training data.

        Args:
            X_predicted (numpy.ndarray): The output of the voting ensemble.
            y (numpy.ndarray): The real labels.

        Raises:
            ValueError: If ``X_predicted`` or ``y`` is empty.
        """
        if not X_predicted.size:
            raise ValueError("No X_predicted data was orovided")
        if not y.size:
            raise ValueError("No y data was provided")
        logging.info("training stacking classifier")
        self.rbfKernel = RBFSampler(gamma=1, random_state=1)
        X_features = self.rbfKernel.fit_transform(X_predicted)
        self.stackingEstimator = SGDClassifier(
            max_iter=config.getValueFromConfig("SGDClassifierIterations"))
        self.stackingEstimator.fit(X_features, y)
        logging.info("stacking-classifier: %s",
                     self.stackingEstimator.score(X_features, y))

    def stackingPrediction(self, X_test: numpy.ndarray) -> numpy.ndarray:
        """Predict labels with the stacking estimator.

        The documents are transformed by the voting ensemble, mapped through
        the fitted RBF sampler and finally classified by the stacking
        estimator.

        Args:
            X_test (numpy.ndarray): The vectorized documents to test on.

        Returns:
            numpy.ndarray: The prediction for the labels using stacking.

        Raises:
            ValueError: If ``X_test`` is empty.
            AssertionError: If the ensemble or the stacking stage has not
                been trained yet, or the prediction is empty.
        """
        if not X_test.size:
            raise ValueError("No X_test data was provided")
        if (self.trainedEstimator is None or self.rbfKernel is None
                or self.stackingEstimator is None):
            raise AssertionError("Classifier has not been trained yet")
        voting = self.trainedEstimator.transform(X_test)
        influencedVoting = self.rbfKernel.transform(voting)
        prediction = self.stackingEstimator.predict(influencedVoting)
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if not prediction.size:
            raise AssertionError("No documents were predicted")
        return prediction