def convert_traindata_embedvecs(self, classdict):
        """ Convert the training text data into embedded matrix.

        Convert the training text data into embedded matrix, where each short sentence
        is a normalized sum of the embedded vectors of all its words. Sentences whose
        summed vector has zero norm (e.g. every token out of vocabulary) are skipped.

        :param classdict: training data, mapping each class label to a list of short texts
        :return: tuple of class labels, matrix of embedded vectors, and one-hot output rows
        :type classdict: dict
        :rtype: (list, numpy.ndarray, numpy.ndarray)
        """
        # Materialize the labels as a list so the return value matches the
        # documented rtype and the label -> index mapping below is stable.
        classlabels = list(classdict.keys())
        lblidx_dict = {label: idx for idx, label in enumerate(classlabels)}

        indices = []
        embedvecs = []
        for classlabel in classlabels:
            for shorttext in classdict[classlabel]:
                # Sum the per-token embeddings of the sentence into one vector.
                embedvec = np.sum(np.array([
                    self.word_to_embedvec(token)
                    for token in tokenize(shorttext)
                ]),
                                  axis=0)
                norm = np.linalg.norm(embedvec)
                if norm == 0:
                    # A zero vector cannot be normalized; drop this sentence.
                    continue
                embedvec /= norm
                embedvecs.append(embedvec)
                # One-hot encode the class label of this sentence.
                category_bucket = [0] * len(classlabels)
                category_bucket[lblidx_dict[classlabel]] = 1
                indices.append(category_bucket)

        indices = np.array(indices)
        embedvecs = np.array(embedvecs)
        return classlabels, embedvecs, indices
# Example #2
 def update(self, additional_classdict):
     """ Update the model with additional data.

     Update the topic model with additional training data.

     Warning: no new class labels and no new words may be added; the
     dictionary stays untouched. Only the topic model — and hence the topic
     vector representation — is affected: the corpus is extended, but the
     words that go into computing the similarity matrix do not change.

     This is therefore meant as a fast, incremental update. For a
     comprehensive model, retraining from scratch is recommended.

     :param additional_classdict: additional training data
     :return: None
     :type additional_classdict: dict
     """
     # Update the existing corpus in place so that existing words are reused.
     preprocess = lambda sent: tokenize(self.preprocessor(sent))
     self.corpus, newcorpus = gc.update_corpus_labels(
         self.dictionary,
         self.corpus,
         additional_classdict,
         preprocess_and_tokenize=preprocess)
     self.topicmodel.update(newcorpus)
# Example #3
    def retrieve_bow(self, shorttext):
        """ Compute the gensim bag-of-words representation of the given short text.

        :param shorttext: text to be represented
        :return: corpus (bag-of-words) representation of the text
        :type shorttext: str
        :rtype: list
        """
        # Preprocess, tokenize, then look the tokens up in the gensim dictionary.
        preprocessed_text = self.preprocessor(shorttext)
        tokens = tokenize(preprocessed_text)
        return self.dictionary.doc2bow(tokens)
# Example #4
    def generate_corpus(self, classdict):
        """ Calculate the gensim dictionary and corpus, and extract the class labels
        from the training data. Called by :func:`~train`.

        :param classdict: training data
        :return: None
        :type classdict: dict
        """
        # Named helper instead of an inline lambda; same preprocessing pipeline.
        def preprocess_and_tokenize(sent):
            return tokenize(self.preprocessor(sent))

        self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(
            classdict, preprocess_and_tokenize=preprocess_and_tokenize)
    def shorttext_to_embedvec(self, shorttext):
        """ Convert the short text into a normalized summed embedded vector representation.

        Given a short sentence, it converts all the tokens into embedded vectors according to
        the given word-embedding model, sums
        them up, and normalizes the resulting vector. It returns the resulting vector
        that represents this short sentence.

        Tokens absent from the word-embedding model's vocabulary are ignored;
        if no token is known, the (scalar-zero) result of an empty sum is returned.

        :param shorttext: a short sentence
        :return: an embedded vector that represents the short sentence
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        # axis=0 is essential: without it np.sum collapses all token vectors
        # into a single scalar float instead of an element-wise vector sum.
        vec = np.sum([
            self.wvmodel[token] for token in tokenize(shorttext)
            if token in self.wvmodel
        ],
                     axis=0)
        norm = np.linalg.norm(vec)
        if norm != 0:
            # Reuse the norm already computed instead of recomputing it.
            vec /= norm
        return vec