def load_gensimtopicmodel(
        nameprefix,
        preprocessor=textpreprocess.standard_text_preprocessor_1()):
    """ Load a gensim topic modeler from model files.

    A :class:`GensimTopicModeler` is created with the given preprocessor,
    and its `loadmodel` method is called with the given prefix.

    Note that the default preprocessor is built once, when this function
    is defined, and is shared by all calls that omit the argument.

    :param nameprefix: prefix of the paths of the model files
    :param preprocessor: function that preprocesses the text.
        (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :return: a topic modeler
    :type nameprefix: str
    :type preprocessor: function
    :rtype: GensimTopicModeler
    """
    modeler = GensimTopicModeler(preprocessor=preprocessor)
    modeler.loadmodel(nameprefix)
    return modeler
def load_autoencoder_topic(
        nameprefix,
        preprocessor=textpreprocess.standard_text_preprocessor_1()):
    """ Load an autoencoding topic model from model files.

    An :class:`AutoencodingTopicModeler` is created with the given
    preprocessor, and its `loadmodel` method is called with the given prefix.

    :param nameprefix: prefix of the paths of the model files
    :param preprocessor: function that preprocesses the text.
        (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :return: an autoencoder as a topic modeler
    :type nameprefix: str
    :type preprocessor: function
    :rtype: AutoencodingTopicModeler
    """
    topic_model = AutoencodingTopicModeler(preprocessor=preprocessor)
    topic_model.loadmodel(nameprefix)
    return topic_model
def load_autoencoder_topic_sklearnclassifier(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Load a scikit-learn classifier wrapper backed by an autoencoder topic model.

    The returned classifier wraps a scikit-learn classifier whose feature
    vectors are given by an autoencoder topic model. It is marked as trained
    before being returned.

    If `compact` is True, `name` is passed to the wrapper's
    `load_compact_model`. Otherwise `name` is used as a prefix: the
    autoencoder is loaded with :func:`load_autoencoder_topic`, and the
    scikit-learn classifier is read from the file `name + '.pkl'`.

    # Reference

    Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha,
    "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents,"
    *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011).

    Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web with
    Hidden Topics from Large-scale Data Collections," WWW '08 Proceedings of the 17th international conference
    on World Wide Web. (2008) [`ACM <http://dl.acm.org/citation.cfm?id=1367510>`_]

    :param name: name (if compact==True) or prefix (if compact==False) of the paths of model files
    :param preprocessor: function that preprocesses the text
        (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: a trained classifier
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: TopicVectorSkLearnClassifier
    """
    if not compact:
        # the autoencoder files and the scikit-learn pickle share the same prefix
        topic_model = load_autoencoder_topic(name, preprocessor=preprocessor)
        # NOTE(review): joblib.load unpickles the file, so only model files
        # from trusted sources should be loaded here.
        fitted_estimator = joblib.load(name + '.pkl')
        wrapper = TopicVectorSkLearnClassifier(topic_model, fitted_estimator)
    else:
        # compact model: start from an empty wrapper and let it restore itself
        wrapper = TopicVectorSkLearnClassifier(
            AutoencodingTopicModeler(preprocessor=preprocessor), None)
        wrapper.load_compact_model(name)

    # in both cases the loaded model is ready for use
    wrapper.trained = True
    return wrapper
def train_autoencoder_topic_sklearnclassifier(
        classdict,
        nb_topics,
        sklearn_classifier,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        normalize=True,
        keras_paramdict=None,
        sklearn_paramdict=None):
    """ Train the supervised learning classifier, with features given by topic vectors.

    It trains an autoencoder topic model, and with its encoded vector
    representation, trains a supervised learning classifier. The instantiated
    (not trained) scikit-learn classifier must be passed into the argument.

    # Reference

    Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha,
    "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents,"
    *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011).

    Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web with
    Hidden Topics from Large-scale Data Collections," WWW '08 Proceedings of the 17th international conference
    on World Wide Web. (2008) [`ACM <http://dl.acm.org/citation.cfm?id=1367510>`_]

    :param classdict: training data
    :param nb_topics: number topics, i.e., number of encoding dimensions
    :param sklearn_classifier: instantiated scikit-learn classifier
    :param preprocessor: function that preprocesses the text
        (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param normalize: whether the retrieved topic vectors are normalized (Default: True)
    :param keras_paramdict: arguments to be passed to keras for training autoencoder
        (Default: None, meaning no extra arguments)
    :param sklearn_paramdict: arguments to be passed to scikit-learn for fitting the classifier
        (Default: None, meaning no extra arguments)
    :return: a trained classifier
    :type classdict: dict
    :type nb_topics: int
    :type sklearn_classifier: sklearn.base.BaseEstimator
    :type preprocessor: function
    :type normalize: bool
    :type keras_paramdict: dict
    :type sklearn_paramdict: dict
    :rtype: TopicVectorSkLearnClassifier
    """
    # None stands in for "no extra arguments", so that no mutable dict object
    # is shared between calls through the default values.
    keras_paramdict = {} if keras_paramdict is None else keras_paramdict
    sklearn_paramdict = {} if sklearn_paramdict is None else sklearn_paramdict

    # train the autoencoder
    autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor, normalize=normalize)
    autoencoder.train(classdict, nb_topics, **keras_paramdict)

    # intermediate classification training
    classifier = TopicVectorSkLearnClassifier(autoencoder, sklearn_classifier)
    classifier.train(classdict, **sklearn_paramdict)

    return classifier
def load_autoencoder_cosineClassifier(
        nameprefix,
        preprocessor=textpreprocess.standard_text_preprocessor_1()):
    """ Load an autoencoder topic model from files and wrap it in a cosine classifier.

    Given the prefix of the file paths, the autoencoder is loaded with
    :func:`load_autoencoder_topic`. There are files with names ending with
    "_encoder.json" and "_encoder.h5", which are the JSON and HDF5 files for
    the encoder respectively. They also include a gensim dictionary
    (.gensimdict).

    :param nameprefix: prefix of the paths of the model files
    :param preprocessor: function that preprocesses the text.
        (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :return: a classifier that scores the short text based on the autoencoder
    :type nameprefix: str
    :type preprocessor: function
    :rtype: TopicVecCosineDistanceClassifier
    """
    return TopicVecCosineDistanceClassifier(
        load_autoencoder_topic(nameprefix, preprocessor=preprocessor)
    )
def load_gensimtopicvec_cosineClassifier(
        nameprefix,
        preprocessor=textpreprocess.standard_text_preprocessor_1()):
    """ Load a gensim topic model from files and wrap it in a cosine distance classifier.

    The topic model is loaded with :func:`load_gensimtopicmodel` from the
    given prefix, and a :class:`TopicVecCosineDistanceClassifier` based on
    this model is returned.

    The files include a JSON (.json) file that specifies various parameters,
    a gensim dictionary (.gensimdict), and a topic model (.gensimmodel). If
    weighing is applied, the tf-idf model (.gensimtfidf) is loaded as well.

    :param nameprefix: prefix of the file paths
    :param preprocessor: function that preprocesses the text.
        (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :return: a classifier that scores the short text based on the topic model
    :type nameprefix: str
    :type preprocessor: function
    :rtype: TopicVecCosineDistanceClassifier
    """
    return TopicVecCosineDistanceClassifier(
        load_gensimtopicmodel(nameprefix, preprocessor=preprocessor)
    )
def __init__(self,
             preprocessor=textpreprocess.standard_text_preprocessor_1(),
             algorithm='lda',
             toweigh=True,
             normalize=True):
    """ Initialize the topic modeler.

    The preprocessor and the normalization flag are handed to
    `LatentTopicModeler.__init__`; the algorithm name and the weighing
    flag are stored on the instance.

    :param preprocessor: function that preprocesses the text.
        (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda)
    :param toweigh: whether to weigh the words using tf-idf. (Default: True)
    :param normalize: whether the retrieved topic vectors are normalized. (Default: True)
    :type preprocessor: function
    :type algorithm: str
    :type toweigh: bool
    :type normalize: bool
    """
    # the base class is initialized first, then the settings of this class
    LatentTopicModeler.__init__(
        self,
        preprocessor=preprocessor,
        normalize=normalize,
    )
    self.algorithm, self.toweigh = algorithm, toweigh
def load_autoencoder_topicmodel(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Load an autoencoding topic model from files.

    An :class:`AutoencodingTopicModeler` is created with the given
    preprocessor. Its `load_compact_model` method is called with `name` if
    `compact` is True, and its `loadmodel` method otherwise.

    :param name: name (if compact=True) or prefix (if compact=False) of the paths of the model files
    :param preprocessor: function that preprocesses the text.
        (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: an autoencoder as a topic modeler
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: generators.bow.AutoEncodingTopicModeling.AutoencodingTopicModeler
    """
    topic_model = AutoencodingTopicModeler(preprocessor=preprocessor)
    # choose the loading routine that matches the storage format
    restore = topic_model.load_compact_model if compact else topic_model.loadmodel
    restore(name)
    return topic_model
def load_gensimtopicmodel(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Load a gensim topic modeler from files.

    A :class:`GensimTopicModeler` is created with the given preprocessor.
    Its `load_compact_model` method is called with `name` if `compact` is
    True, and its `loadmodel` method otherwise.

    :param name: name (if compact=True) or prefix (if compact=False) of the file path
    :param preprocessor: function that preprocesses the text.
        (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: a topic modeler
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: GensimTopicModeler
    """
    modeler = GensimTopicModeler(preprocessor=preprocessor)
    # choose the loading routine that matches the storage format
    restore = modeler.load_compact_model if compact else modeler.loadmodel
    restore(name)
    return modeler
def train_gensimtopicvec_cosineClassifier(
        classdict,
        nb_topics,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        algorithm='lda',
        toweigh=True,
        normalize=True,
        *args, **kwargs):
    """ Train a gensim topic model and return a cosine distance classifier built on it.

    A :class:`GensimTopicModeler` is created from the given settings and
    trained on `classdict`; the result is wrapped in a
    :class:`TopicVecCosineDistanceClassifier`.

    :param classdict: training data
    :param nb_topics: number of latent topics
    :param preprocessor: function that preprocesses the text.
        (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda)
    :param toweigh: whether to weigh the words using tf-idf. (Default: True)
    :param normalize: whether the retrieved topic vectors are normalized. (Default: True)
    :param args: arguments to pass to the `train` method for gensim topic models
    :param kwargs: arguments to pass to the `train` method for gensim topic models
    :return: a classifier that scores the short text based on the topic model
    :type classdict: dict
    :type nb_topics: int
    :type preprocessor: function
    :type algorithm: str
    :type toweigh: bool
    :type normalize: bool
    :rtype: TopicVecCosineDistanceClassifier
    """
    # build and fit the topic model
    modeler = GensimTopicModeler(
        preprocessor=preprocessor,
        algorithm=algorithm,
        toweigh=toweigh,
        normalize=normalize,
    )
    modeler.train(classdict, nb_topics, *args, **kwargs)

    # hand the trained model to the cosine distance classifier
    cosine_classifier = TopicVecCosineDistanceClassifier(modeler)
    return cosine_classifier