Example #1
    def word2vec(self, x, n_grams) -> np.ndarray:
        """
        word2vec embedding

        :param x: list of texts
        :param n_grams: n-grams parameters
        :return: encoded vectors
        :rtype: numpy.ndarray
        """
        if self.word_vectors is None:
            if not os.path.exists(self.model_path):
                raise FileNotFoundError(
                    f'Could not find word2vec model at {self.model_path}')

            import gensim
            from distutils.version import LooseVersion

            if LooseVersion(gensim.__version__) >= LooseVersion('1.0.1'):
                from gensim.models import KeyedVectors
                self.word_vectors = KeyedVectors.load_word2vec_format(
                    self.model_path, binary=True)
            else:
                from gensim.models import Word2Vec
                self.word_vectors = Word2Vec.load_word2vec_format(
                    self.model_path, binary=True)

        _x = None

        for item in tqdm(x, desc='Word2Vec Text Encoder'):
            __x = None
            items = TextProcessor.n_gram_split(item, n_grams)
            for token in items:
                try:
                    vector = self.word_vectors.get_vector(token)
                except KeyError:
                    # out-of-vocabulary token: fall back to the 'unk' vector
                    vector = self.word_vectors.get_vector('unk')
                vector = np.expand_dims(vector, axis=0)
                __x = vector if __x is None else np.concatenate((__x, vector))

            if __x is None:
                # text produced no tokens: encode as an all-zero sequence
                __x = np.zeros((self.max_length, self.word_vectors.vector_size))
            elif __x.shape[0] < self.max_length:
                adjust_size = self.max_length - __x.shape[0]
                # pad width must match the embedding dimension
                adjust_array = np.zeros((adjust_size, self.word_vectors.vector_size))
                __x = np.concatenate((__x, adjust_array))

            __x = np.expand_dims(__x, axis=0)
            _x = __x if _x is None else np.concatenate((_x, __x))

        return _x
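
To make the zero-padding step concrete, here is a minimal standalone sketch; the sequence length and embedding dimension are illustrative assumptions, and in the method above the pad width is the word2vec model's embedding size.

import numpy as np

max_length, dim = 5, 4                   # illustrative sizes, not from the source
tokens = np.random.rand(3, dim)          # pretend 3 tokens were embedded
pad = np.zeros((max_length - tokens.shape[0], dim))
padded = np.concatenate((tokens, pad))   # zero-pad the sequence to max_length
print(padded.shape)                      # (5, 4)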
Example #2
    def bow(self, x, vocab, n_grams) -> np.ndarray:
        """
        Bag-of-Words encoder

        :param x: list of texts
        :param vocab: corpus vocabulary
        :param n_grams: n-grams parameters
        :return: Bag-of-Words vectors
        :rtype: numpy.ndarray
        """
        _x = np.zeros((len(x), len(vocab)))
        for i, item in enumerate(tqdm(x, desc='Bag Of Words Text Encoder')):
            items = TextProcessor.n_gram_split(item, n_grams)
            for token in items:
                if token in vocab:
                    j = vocab.index(token)
                    _x[i, j] += 1

        return _x
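
As a usage-level sketch of the same counting idea, the snippet below swaps the O(|vocab|) `vocab.index` lookup for a precomputed token-to-index dict; the toy vocabulary and documents are illustrative assumptions.

import numpy as np

vocab = ['phim', 'hay', 'rất']                          # toy vocabulary
token_to_idx = {tok: j for j, tok in enumerate(vocab)}  # O(1) lookups
docs = [['phim', 'rất', 'hay', 'hay'], ['phim', 'phim']]

bow = np.zeros((len(docs), len(vocab)))
for i, doc in enumerate(docs):
    for tok in doc:
        if tok in token_to_idx:                         # ignore OOV tokens
            bow[i, token_to_idx[tok]] += 1
print(bow)  # [[1. 2. 1.]
            #  [2. 0. 0.]]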
Example #3
    def one_hot(self, x, vocab, n_grams) -> np.ndarray:
        """
        Convert corpus into batch of one-hot vectors.

        :param x: list of texts
        :param vocab: corpus vocabulary
        :param n_grams: n-grams parameters
        :return: one-hot vectors
        :rtype: numpy.ndarray
        """
        _x = np.zeros((len(x), self.max_length, len(vocab)))
        for i, item in enumerate(tqdm(x, desc='One Hot Text Encoder')):
            items = TextProcessor.n_gram_split(item, n_grams)
            for j, token in enumerate(items):
                if j >= self.max_length:
                    break
                if token in vocab:
                    idx = vocab.index(token)
                    _x[i, j, idx] = 1

        return _x
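
A tiny standalone instance of the tensor this produces, with a toy vocabulary and document (both illustrative assumptions):

import numpy as np

vocab = ['a', 'b', 'c']
max_length = 4
doc = ['b', 'a', 'b']

one_hot = np.zeros((max_length, len(vocab)))
for j, tok in enumerate(doc[:max_length]):  # positions past max_length are dropped
    one_hot[j, vocab.index(tok)] = 1
print(one_hot)
# [[0. 1. 0.]   -> 'b'
#  [1. 0. 0.]   -> 'a'
#  [0. 1. 0.]   -> 'b'
#  [0. 0. 0.]]  -> all-zero row = padding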
Example #4
    def tf_idf(self, x, vocab, n_grams) -> np.ndarray:
        """
        Simple TF-IDF vectors

        :param x: list of texts
        :param vocab: corpus vocabulary
        :param n_grams: n-grams parameters
        :return: encoded vectors
        :rtype: numpy.ndarray
        """
        import math

        docs = [TextProcessor.n_gram_split(item, n_grams) for item in x]

        # document frequency: count each vocabulary token at most once per doc
        appearances_in_doc = {k: 0 for k in vocab}
        for doc in docs:
            seen = set()
            for token in doc:
                if token in appearances_in_doc and token not in seen:
                    appearances_in_doc[token] += 1
                    seen.add(token)

        _x = np.zeros((len(x), len(vocab)))
        for i, doc in enumerate(tqdm(docs, desc='TF-IDF Text Encoder')):
            # term frequency within the current document
            appearances_in_here = dict()
            for token in doc:
                appearances_in_here[token] = appearances_in_here.get(token, 0) + 1

            for token in doc:
                if token in vocab:
                    j = vocab.index(token)
                    _x[i, j] = math.log(1 + appearances_in_here[token]) \
                        * math.log(len(x) / appearances_in_doc[token])

        return _x
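
The weighting above is the log(1 + tf) * log(N / df) variant of TF-IDF, using natural logarithms. A worked instance with made-up counts:

import math

N = 4     # documents in the corpus, i.e. len(x)
tf = 2    # occurrences of the token in this document
df = 1    # documents that contain the token
weight = math.log(1 + tf) * math.log(N / df)
print(round(weight, 3))  # ln(3) * ln(4) ≈ 1.523

Note that a token appearing in every document gets weight 0, since log(N / df) = log(1) = 0.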
Example #5
from sentivi.text_processor import TextProcessor

if __name__ == '__main__':
    text_processor = TextProcessor(
        methods=['remove_punctuation', 'word_segmentation'])
    print(text_processor('Trường đại học,   Tôn Đức Thắng, Hồ; Chí Minh.'))
    print(TextProcessor.n_gram_split('bài tập phân tích cảm xúc', 3))