def load_gensimtopicmodel(
    nameprefix, preprocessor=textpreprocess.standard_text_preprocessor_1()):
    """ Create a gensim topic modeler and restore it from saved model files.

    A new :class:`GensimTopicModeler` is instantiated with the given
    preprocessor, and its `loadmodel` method is then called with `nameprefix`.

    :param nameprefix: prefix of the paths of the model files
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :return: the topic modeler after its model has been loaded
    :type nameprefix: str
    :type preprocessor: function
    :rtype: GensimTopicModeler
    """
    modeler = GensimTopicModeler(preprocessor=preprocessor)
    # populate the freshly created modeler from the files under this prefix
    modeler.loadmodel(nameprefix)
    return modeler
def load_autoencoder_topic(
    nameprefix, preprocessor=textpreprocess.standard_text_preprocessor_1()):
    """ Create an autoencoding topic modeler and restore it from saved model files.

    A new :class:`AutoencodingTopicModeler` is instantiated with the given
    preprocessor, and its `loadmodel` method is then called with `nameprefix`.

    :param nameprefix: prefix of the paths of the model files
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :return: the autoencoder, usable as a topic modeler, after loading
    :type nameprefix: str
    :type preprocessor: function
    :rtype: AutoencodingTopicModeler
    """
    topicmodel = AutoencodingTopicModeler(preprocessor=preprocessor)
    # populate the freshly created modeler from the files under this prefix
    topicmodel.loadmodel(nameprefix)
    return topicmodel
예제 #3
0
def load_autoencoder_topic_sklearnclassifier(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Load, from files, a classifier that wraps a scikit-learn classifier
    whose feature vectors are given by an autoencoder topic model.

    With `compact` true, a :class:`TopicVectorSkLearnClassifier` is built around
    a new :class:`AutoencodingTopicModeler` (with no scikit-learn classifier
    attached yet) and its `load_compact_model` method is called with `name`.
    Otherwise the autoencoder is loaded with `name` as the file prefix, the
    scikit-learn classifier is read from ``name + '.pkl'`` with `joblib`, and
    the two are wrapped together. In both cases the returned classifier is
    flagged as trained.

    # Reference

    Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha,
    "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents,"
    *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011).

    Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web with Hidden Topics from Large-scale Data Collections,"
    WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACM
    <http://dl.acm.org/citation.cfm?id=1367510>`_]

    :param name: name (if compact==True) or prefix (if compact==False) of the paths of model files
    :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: a trained classifier
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: TopicVectorSkLearnClassifier
    """
    if not compact:
        # non-compact layout: the topic model and the scikit-learn model
        # live in separate files sharing the same prefix
        topicmodel = load_autoencoder_topic(name, preprocessor=preprocessor)

        # NOTE(review): joblib.load unpickles the file, so only model files
        # from a trusted source should be loaded here.
        inner_classifier = joblib.load(name + '.pkl')

        wrapper = TopicVectorSkLearnClassifier(topicmodel, inner_classifier)
    else:
        # compact layout: start from an empty wrapper and let it load itself
        wrapper = TopicVectorSkLearnClassifier(
            AutoencodingTopicModeler(preprocessor=preprocessor), None)
        wrapper.load_compact_model(name)

    # a loaded model is ready for use without further training
    wrapper.trained = True
    return wrapper
예제 #4
0
def train_autoencoder_topic_sklearnclassifier(
        classdict,
        nb_topics,
        sklearn_classifier,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        normalize=True,
        keras_paramdict=None,
        sklearn_paramdict=None):
    """ Train the supervised learning classifier, with features given by topic vectors.

    It trains an autoencoder topic model, and with its encoded vector representation, trains a supervised
    learning classifier. The instantiated (not trained) scikit-learn classifier must be
    passed into the argument.

    # Reference

    Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha,
    "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents,"
    *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011).

    Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web with Hidden Topics from Large-scale Data Collections,"
    WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACM
    <http://dl.acm.org/citation.cfm?id=1367510>`_]

    :param classdict: training data
    :param nb_topics: number of topics, i.e., number of encoding dimensions
    :param sklearn_classifier: instantiated scikit-learn classifier
    :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param normalize: whether the retrieved topic vectors are normalized (Default: True)
    :param keras_paramdict: keyword arguments to be passed to keras for training the autoencoder (Default: None, meaning no extra arguments)
    :param sklearn_paramdict: keyword arguments to be passed to scikit-learn for fitting the classifier (Default: None, meaning no extra arguments)
    :return: a trained classifier
    :type classdict: dict
    :type nb_topics: int
    :type sklearn_classifier: sklearn.base.BaseEstimator
    :type preprocessor: function
    :type normalize: bool
    :type keras_paramdict: dict
    :type sklearn_paramdict: dict
    :rtype: TopicVectorSkLearnClassifier
    """
    # None stands in for "no extra arguments"; this avoids sharing one
    # mutable default dict object across all calls of this function.
    if keras_paramdict is None:
        keras_paramdict = {}
    if sklearn_paramdict is None:
        sklearn_paramdict = {}

    # train the autoencoder
    autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor,
                                           normalize=normalize)
    autoencoder.train(classdict, nb_topics, **keras_paramdict)

    # intermediate classification training
    classifier = TopicVectorSkLearnClassifier(autoencoder, sklearn_classifier)
    classifier.train(classdict, **sklearn_paramdict)

    return classifier
예제 #5
0
def load_autoencoder_cosineClassifier(
    nameprefix, preprocessor=textpreprocess.standard_text_preprocessor_1()):
    """ Load an autoencoder topic model from files and wrap it in a cosine classifier.

    The autoencoder is loaded through `load_autoencoder_topic` using the given
    prefix of the file paths. The model files are expected to include ones with
    names ending with "_encoder.json" and "_encoder.h5", which are the JSON and
    HDF5 files for the encoder respectively, as well as a gensim dictionary
    (.gensimdict).

    :param nameprefix: prefix of the paths of the model files
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :return: a classifier that scores the short text based on the autoencoder
    :type nameprefix: str
    :type preprocessor: function
    :rtype: TopicVecCosineDistanceClassifier
    """
    # the loaded autoencoder serves as the topic modeler behind the classifier
    return TopicVecCosineDistanceClassifier(
        load_autoencoder_topic(nameprefix, preprocessor=preprocessor))
예제 #6
0
def load_gensimtopicvec_cosineClassifier(
    nameprefix, preprocessor=textpreprocess.standard_text_preprocessor_1()):
    """ Load a gensim topic model from files and wrap it in a cosine distance classifier.

    The topic model is loaded through `load_gensimtopicmodel` using the given
    prefix, and the result is used to build a
    :class:`TopicVecCosineDistanceClassifier`.

    The files include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict),
    and a topic model (.gensimmodel). If weighing is applied, the tf-idf model (.gensimtfidf) is loaded as well.

    :param nameprefix: prefix of the file paths
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :return: a classifier that scores the short text based on the topic model
    :type nameprefix: str
    :type preprocessor: function
    :rtype: TopicVecCosineDistanceClassifier
    """
    # the loaded gensim model serves as the topic modeler behind the classifier
    return TopicVecCosineDistanceClassifier(
        load_gensimtopicmodel(nameprefix, preprocessor=preprocessor))
예제 #7
0
    def __init__(self,
                 preprocessor=textpreprocess.standard_text_preprocessor_1(),
                 algorithm='lda',
                 toweigh=True,
                 normalize=True):
        """ Set up the topic modeler with its preprocessing and modeling options.

        The preprocessor and the normalization flag are forwarded to the
        initializer of `LatentTopicModeler`; the algorithm name and the
        weighing flag are stored on the instance.

        :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
        :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda)
        :param toweigh: whether to weigh the words using tf-idf. (Default: True)
        :param normalize: whether the retrieved topic vectors are normalized. (Default: True)
        :type preprocessor: function
        :type algorithm: str
        :type toweigh: bool
        :type normalize: bool
        """
        # delegate the shared setup to LatentTopicModeler's initializer
        LatentTopicModeler.__init__(
            self, preprocessor=preprocessor, normalize=normalize)

        # options specific to this modeler, stored as given
        self.algorithm, self.toweigh = algorithm, toweigh
def load_autoencoder_topicmodel(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Create an autoencoding topic modeler and restore it from saved model files.

    A new :class:`AutoencodingTopicModeler` is instantiated with the given
    preprocessor. Its `load_compact_model` method is called with `name` when
    `compact` is true, and its `loadmodel` method otherwise.

    :param name: name (if compact=True) or prefix (if compact=False) of the paths of the model files
    :param preprocessor: function that preprocesses the text. (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: an autoencoder as a topic modeler
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: generators.bow.AutoEncodingTopicModeling.AutoencodingTopicModeler
    """
    topicmodel = AutoencodingTopicModeler(preprocessor=preprocessor)
    # choose the loading routine that matches the storage layout
    restore = topicmodel.load_compact_model if compact else topicmodel.loadmodel
    restore(name)
    return topicmodel
예제 #9
0
def load_gensimtopicmodel(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Create a gensim topic modeler and restore it from saved model files.

    A new :class:`GensimTopicModeler` is instantiated with the given
    preprocessor. Its `load_compact_model` method is called with `name` when
    `compact` is true, and its `loadmodel` method otherwise.

    :param name: name (if compact=True) or prefix (if compact=False) of the file path
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: a topic modeler
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: GensimTopicModeler
    """
    modeler = GensimTopicModeler(preprocessor=preprocessor)
    # choose the loading routine that matches the storage layout
    restore = modeler.load_compact_model if compact else modeler.loadmodel
    restore(name)
    return modeler
예제 #10
0
def train_gensimtopicvec_cosineClassifier(
        classdict,
        nb_topics,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        algorithm='lda',
        toweigh=True,
        normalize=True,
        *args,
        **kwargs):
    """ Train a gensim topic model and return a cosine distance classifier,
    i.e., :class:`TopicVecCosineDistanceClassifier`, built on top of it.

    A :class:`GensimTopicModeler` is created with the given options and
    trained on `classdict` with `nb_topics` topics; any extra positional and
    keyword arguments are forwarded to its `train` method.

    :param classdict: training data
    :param nb_topics: number of latent topics
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda)
    :param toweigh: whether to weigh the words using tf-idf. (Default: True)
    :param normalize: whether the retrieved topic vectors are normalized. (Default: True)
    :param args: arguments to pass to the `train` method for gensim topic models
    :param kwargs: arguments to pass to the `train` method for gensim topic models
    :return: a classifier that scores the short text based on the topic model
    :type classdict: dict
    :type nb_topics: int
    :type preprocessor: function
    :type algorithm: str
    :type toweigh: bool
    :type normalize: bool
    :rtype: TopicVecCosineDistanceClassifier
    """
    # build the topic modeler with the requested configuration
    modeler = GensimTopicModeler(
        preprocessor=preprocessor,
        algorithm=algorithm,
        toweigh=toweigh,
        normalize=normalize,
    )

    # fit it on the training data, forwarding any extra training options
    modeler.train(classdict, nb_topics, *args, **kwargs)

    # the trained modeler backs the cosine distance classifier
    return TopicVecCosineDistanceClassifier(modeler)