class Corpus(BaseTextModel):
    """Text model using only words.

    Tokenization is delegated to a word-level :class:`TextModel`
    (``token_list=[-1]``); this class only maintains the vocabulary,
    a mapping ``token -> [index, frequency]`` built while fitting.
    """

    def __init__(self, corpus=None, **kwargs):
        # Key used to pull the text field out of dict records;
        # overridable through the TEXT environment variable.
        self._text = os.getenv('TEXT', default='text')
        self._m = {}            # token -> [index, frequency]
        self._num_terms = 0     # current vocabulary size
        self._training = True   # while True, unseen tokens grow the vocabulary
        self._textModel = TextModel([''], token_list=[-1])
        # corpus is now optional so an empty model can be built and
        # fitted later (consistent with the sklearn-style fit() API).
        if corpus is not None:
            self.fit(corpus)

    def get_text(self, text):
        """Return the text field of a dict record."""
        return text[self._text]

    def fit(self, c):
        """Build the vocabulary from corpus *c* and return *self*.

        The previous version accumulated and returned every document's
        tokenization, which wasted memory and broke the fluent
        ``Corpus().fit(corpus)`` idiom used by the rest of the project.
        """
        for x in c:
            self.__getitem__(x)
        self._training = False
        return self

    @property
    def num_terms(self):
        """Number of distinct tokens seen during fitting."""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize *text*, which may be a str, a dict, or a list of texts."""
        if isinstance(text, dict):
            text = self.get_text(text)
        if isinstance(text, (list, tuple)):
            tokens = []
            for _text in text:
                tokens.extend(self._textModel.tokenize(_text))
            return tokens
        return self._textModel.tokenize(text)

    def __getitem__(self, d):
        """Map document *d* to a list of ``[index, frequency]`` pairs.

        While training, unseen tokens are appended to the vocabulary and
        counts are incremented; afterwards unseen tokens are skipped.
        The frequency reported is the count *before* the increment.
        """
        tokens = []
        for t in self.tokenize(d):
            try:
                index, k = self._m[t]
                if self._training:
                    self._m[t] = [index, k + 1]
            except KeyError:
                if not self._training:
                    continue
                index, k = self._num_terms, 1
                self._m[t] = [index, k]
                self._num_terms += 1
            tokens.append([index, k])
        return tokens
def test_lang():
    """Spanish pipeline: grouped stopwords/emoticons, negation, stemming."""
    from b4msa.textmodel import TextModel
    corpus = ["Hi :) :P XD", "excelente dia xc", "el alma de la fiesta XD"]
    params = dict(del_dup=True,
                  emo_option="group",
                  lc=True,
                  negation=True,
                  num_option="group",
                  stemming=True,
                  stopwords="group",
                  del_diac=False,
                  token_list=[-1],
                  url_option="group",
                  usr_option="group",
                  lang="spanish")
    model = TextModel(corpus, **params)
    text = ("El alma de la fiesta :) conociendo la maquinaria "
            "@user bebiendo nunca manches que onda")
    print(model.tokenize)
    got = model.tokenize(text)
    expected = ['_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc', '_sw',
                'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda']
    print(text)
    assert got == expected, "got: {0}, expected: {1}".format(got, expected)
def test_negations_italian():
    """Italian negation handling prefixes negated stems with ``no_``."""
    from b4msa.textmodel import TextModel
    conf = {'num_option': 'group',
            'del_diac': False,
            'stopwords': 'delete',
            'negation': True,
            'stemming': True,
            'lc': False,
            'token_list': [-1],
            'usr_option': 'group',
            'del_dup': False,
            'emo_option': 'group',
            'lang': 'italian',
            'url_option': 'delete'}
    model = TextModel(["XD"], **conf)
    text = """@User Come non condividere; me ne frega niente"""
    tokens = model.tokenize(text)
    print("Input:", text)
    print("Output:", tokens)
    expected = ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
    assert tokens == expected
def test_negations():
    """Spanish negation handling prefixes negated stems with ``no_``."""
    from b4msa.textmodel import TextModel
    conf = {'num_option': 'group',
            'del_diac': False,
            'stopwords': 'delete',
            'negation': True,
            'stemming': True,
            'lc': False,
            'token_list': [-1],
            'usr_option': 'group',
            'del_dup': False,
            'emo_option': 'group',
            'lang': 'spanish',
            'url_option': 'delete'}
    model = TextModel(["el alma de la fiesta XD"], **conf)
    text = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    tokens = model.tokenize(text)
    expected = ['_usr', 'poll', 'vac', 'hub', 'no_permit', 'hub',
                'no_hech', 'no_respect']
    print(tokens, expected)
    assert tokens == expected
def test_lang():
    """Spanish tokenization end-to-end: the model built from a tiny corpus
    must reproduce the recorded token sequence."""
    from b4msa.textmodel import TextModel
    training = ["Hi :) :P XD",
                "excelente dia xc",
                "el alma de la fiesta XD"]
    model = TextModel(training,
                      del_dup=True,
                      emo_option="group",
                      lc=True,
                      negation=True,
                      num_option="group",
                      stemming=True,
                      stopwords="group",
                      del_diac=False,
                      token_list=[-1],
                      url_option="group",
                      usr_option="group",
                      lang="spanish")
    text = ("El alma de la fiesta :) conociendo la maquinaria "
            "@user bebiendo nunca manches que onda")
    print(model.tokenize)
    a = model.tokenize(text)
    b = ['_sw', 'alma', '_sw', '_sw', 'fiest', '_pos', 'conoc',
         '_sw', 'maquinari', '_usr', 'beb', 'no_manch', '_sw', 'onda']
    print(text)
    assert a == b, "got: {0}, expected: {1}".format(a, b)
def test_negations_italian():
    """Italian negation: ``non``/``ne`` attach ``no_`` to the following stem."""
    from b4msa.textmodel import TextModel
    model = TextModel(["XD"],
                      num_option='group',
                      del_diac=False,
                      stopwords='delete',
                      negation=True,
                      stemming=True,
                      lc=False,
                      token_list=[-1],
                      usr_option='group',
                      del_dup=False,
                      emo_option='group',
                      lang='italian',
                      url_option='delete')
    text = """@User Come non condividere; me ne frega niente"""
    a = model.tokenize(text)
    print("Input:", text)
    print("Output:", a)
    assert a == ['_usr', 'com', 'no_condividere', 'me', 'no_freg', 'nient']
def tokenize(self, text):
    """Tokenize a text

    :param text: Text
    :type text: dict or str
    """
    # Dict records carry the text in a named field.
    if isinstance(text, dict):
        text = self.get_text(text)
    # Plain string: delegate straight to the base tokenizer.
    if not isinstance(text, (list, tuple)):
        return TextModel.tokenize(self, text)
    # Sequence of texts: concatenate the per-text token streams in order.
    return [tok for piece in text
            for tok in TextModel.tokenize(self, piece)]
def test_negations():
    """Spanish negation: ``nunca``/``no``/``nada`` mark the following stem."""
    from b4msa.textmodel import TextModel
    model = TextModel(["el alma de la fiesta XD"],
                      num_option='group',
                      del_diac=False,
                      stopwords='delete',
                      negation=True,
                      stemming=True,
                      lc=False,
                      token_list=[-1],
                      usr_option='group',
                      del_dup=False,
                      emo_option='group',
                      lang='spanish',
                      url_option='delete')
    text = """@usuario los pollos y las vacas nunca hubiesen permitido que no se hubiese hecho nada al respecto"""
    a = model.tokenize(text)
    b = ['_usr', 'poll', 'vac', 'hub', 'no_permit', 'hub', 'no_hech',
         'no_respect']
    print(a, b)
    assert a == b
def test_lang():
    """Portuguese pipeline smoke test.

    NOTE(review): no expected output has been recorded for the
    Portuguese tokenizer yet, so this only checks that tokenization
    runs without raising and prints the result.  TODO: capture the
    real expected token list and assert against it.
    """
    from b4msa.textmodel import TextModel
    text = ["vish, nada desse carro! tomar Eguaa! vai toooomar no cuuuuu pq "
            "jamais vou deixar d lado! Realmente sem pe nem cabbbecaaa!! :("]
    # NOTE(review): 'del_dup1' and 'strip_diac' look like older option
    # names; the sibling tests use 'del_dup' and 'del_diac' — confirm
    # which spelling the installed b4msa version accepts.
    model = TextModel(
        text,
        **{
            "del_dup1": True,
            "emo_option": "group",
            "lc": True,
            "negation": True,
            "num_option": "group",
            "stemming": True,
            "stopwords": "group",
            "strip_diac": True,
            "token_list": [-1],
            "url_option": "group",
            "usr_option": "group",
            "lang": "portuguese",
        })
    # Previously `print a` (Python 2 syntax) made this file a
    # SyntaxError under Python 3; the stale Spanish expected list that
    # was never asserted has been dropped.
    a = model.tokenize(text[0])
    print(a)
# Visualize the SemEval-2017 English training set as word clouds
# (IDF-style term weights, then raw term frequencies), and collect the
# algorithm/dataset performance table into a matrix.
# (A stray 'ci' token at the start of this cell — extraction junk that
# made it a SyntaxError — has been removed.)
D = list(tweet_iterator('../../../datasets/semeval/semeval2017_En_train.json'))
tm = TextModel(token_list=[-1]).fit(D)
# Map token ids back to strings so the word cloud can label them.
id2word = {v: k for k, v in tm.model.word2id.items()}
weights = {id2word[k]: v for k, v in tm.model.wordWeight.items()}
wc = WC().generate_from_frequencies(weights)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('semeval2017_idf.png', dpi=300)

# Raw token frequencies over the same corpus.
cnt = Counter()
for x in D:  # plain loop: update() is a side effect, not a mapping
    cnt.update(tm.tokenize(x))
wc = WC().generate_from_frequencies(cnt)
plt.imshow(wc)
plt.axis('off')
plt.tight_layout()
plt.savefig('semeval2017_tf.png', dpi=300)

# Performance matrix: one row per algorithm, one column per dataset.
perf = load_model('dataset/performance.gz')
algs = list(perf.keys())
datasets = list(perf[algs[0]][1].keys())
data = np.array([[perf[alg][1][dataset] for dataset in datasets]
                 for alg in algs])
class Corpus(BaseTextModel):
    """Text model using only words."""

    def __init__(self, corpus=None, **kwargs):
        # Name of the text field for dict records; configurable via $TEXT.
        self._text = os.getenv('TEXT', default='text')
        self._m = {}            # token -> [index, frequency]
        self._num_terms = 0     # vocabulary size so far
        self._training = True   # unseen tokens extend the vocabulary only while True
        self._textModel = TextModel([''], token_list=[-1])
        if corpus is not None:
            self.fit(corpus)

    def get_text(self, text):
        """Extract the text field from a dict record."""
        return text[self._text]

    def fit(self, c):
        """Learn the vocabulary from corpus *c*; returns *self*."""
        for doc in c:
            self.__getitem__(doc)
        self._training = False
        return self

    @property
    def num_terms(self):
        """Number of distinct tokens learned during fit."""
        return self._num_terms

    def tokenize(self, text):
        """Tokenize a str, a dict record, or a list/tuple of texts."""
        if isinstance(text, dict):
            text = self.get_text(text)
        if isinstance(text, (list, tuple)):
            return [tok for part in text
                    for tok in self._textModel.tokenize(part)]
        return self._textModel.tokenize(text)

    def transform(self, texts):
        """Convert test into a vector

        :param texts: List of text to be transformed
        :type text: list

        :rtype: list

        Example:

        >>> from microtc.textmodel import TextModel
        >>> corpus = ['buenos dias catedras', 'catedras conacyt']
        >>> textmodel = TextModel().fit(corpus)
        >>> X = textmodel.transform(corpus)
        """
        return self._textModel.tonp([self.__getitem__(x) for x in texts])

    def __getitem__(self, d):
        """Return ``[index, frequency]`` pairs for the tokens of *d*.

        The reported frequency is the count *before* this occurrence's
        increment; out-of-vocabulary tokens are skipped after training.
        """
        pairs = []
        for tok in self.tokenize(d):
            entry = self._m.get(tok)
            if entry is None:
                # Unknown token: only training extends the vocabulary.
                if not self._training:
                    continue
                index, k = self._num_terms, 1
                self._m[tok] = [index, k]
                self._num_terms += 1
            else:
                index, k = entry
                if self._training:
                    self._m[tok] = [index, k + 1]
            pairs.append([index, k])
        return pairs
# Stem every word of `text` (both `stemmer` and `text` are defined in an
# earlier cell) and echo the joined result.
output = " ".join(stemmer.stem(word) for word in text.split())
output

# Word n-grams: slide a window of n words and join with '~'.
text = 'I like playing football on Saturday'
words = text.split()
n = 3
n_grams = ["~".join(gram) for gram in zip(*[words[i:] for i in range(n)])]
n_grams

# Character q-grams: same sliding-window trick over characters.
text = 'I like playing'
q = 4
q_grams = ["".join(gram) for gram in zip(*[text[i:] for i in range(q)])]
q_grams

# Full TextModel pipeline on a tweet-like text: transformations, then tokens.
text = 'I like playing football with @mgraffg'
tm = TextModel(token_list=[-1, 5], lang='english',
               usr_option=OPTION_GROUP, stemming=True)
tm.text_transformations(text)
tm.tokenize(text)