def load_autoencoder_topic_sklearnclassifier(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Load the classifier, a wrapper that uses a scikit-learn classifier with
    feature vectors given by an autoencoder topic model, from files.

    If `compact` is True, an empty wrapper is created around a fresh
    `AutoencodingTopicModeler` and populated by `load_compact_model(name)`.
    Otherwise the autoencoder is loaded with `load_autoencoder_topicmodel`
    using `name` as the path prefix, and the scikit-learn classifier is read
    with joblib from `name + '.pkl'`.

    In both cases the returned wrapper has its `trained` flag set to True.

    # Reference

    Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha,
    "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents,"
    *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011).

    Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web with Hidden Topics from Large-scale Data Collections,"
    WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACM
    <http://dl.acm.org/citation.cfm?id=1367510>`_]

    :param name: name (if compact==True) or prefix (if compact==False) of the paths of model files
    :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: a trained classifier
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: TopicVectorSkLearnClassifier
    """
    if compact:
        # The compact file carries both the topic model and the scikit-learn
        # classifier, so the wrapper starts with no classifier (None) and
        # fills itself in from the file.
        classifier = TopicVectorSkLearnClassifier(
            AutoencodingTopicModeler(preprocessor=preprocessor), None)
        classifier.load_compact_model(name)
        classifier.trained = True
        return classifier

    # Non-compact layout: `name` is a path prefix shared by several files.
    # Pass compact=False explicitly so the helper does not fall back to its
    # own default and try to read `name` as a compact model file.
    autoencoder = load_autoencoder_topicmodel(name,
                                              preprocessor=preprocessor,
                                              compact=False)

    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files that come from a trusted source.
    sklearn_classifier = joblib.load(name + '.pkl')

    # Wrap the topic model and the intermediate scikit-learn classifier.
    classifier = TopicVectorSkLearnClassifier(autoencoder,
                                              sklearn_classifier)
    classifier.trained = True
    return classifier
# Example no. 2
# 0
def load_autoencoder_cosineClassifier(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Load an autoencoder topic model from files and wrap it in a cosine classifier.

    The model is read by `load_autoencoder_topicmodel`, to which `name`,
    `preprocessor` and `compact` are forwarded unchanged. When a prefix is
    given, the model files are those whose names start with that prefix:
    files ending with "_encoder.json" and "_encoder.h5" (the JSON and HDF5
    files for the encoder respectively), together with a gensim dictionary
    (.gensimdict).

    :param name: name (if compact=True) or prefix (if compact=False) of the file paths
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: a classifier that scores the short text based on the autoencoder
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: TopicVecCosineDistanceClassifier
    """
    # Build the topic model and hand it straight to the classifier wrapper.
    return TopicVecCosineDistanceClassifier(
        load_autoencoder_topicmodel(
            name,
            preprocessor=preprocessor,
            compact=compact,
        )
    )