Example no. 1
def test_word_tokenize_longest(self):
    self.assertEqual(longest.segment(None), [])
    self.assertEqual(longest.segment(""), [])
    self.assertIsInstance(longest.segment("กรุงเทพฯมากๆเพราโพาง BKKฯ"),
                          list)
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"),
        ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
    )
    longest_tokenizer = Tokenizer(["ปวด", "เฉียบ", "พลัน", "เฉียบพลัน"])
    self.assertEqual(
        longest_tokenizer.word_tokenize("ปวดเฉียบพลัน"),
        ["ปวด", "เฉียบพลัน"],
    )
    self.assertEqual(
        longest_tokenizer.word_tokenize("เฉียบพลัน"),
        ["เฉียบพลัน"],
    )
Example no. 2
def tokenize(request):
    # assumes module-level imports not shown in this excerpt,
    # e.g. from django.shortcuts import render
    import csv

    # load the Kam Mueang (Northern Thai) word list from a CSV file
    KammuangDB = list()
    with open('./KammuangDB.csv', 'rt') as f:
        data = csv.reader(f)
        for row in data:
            KammuangDB.append(row)

    from pythainlp.tokenize import Tokenizer

    text = "ขอน้ำบะดาย อู้บ่าดาย อู้เล่นบะได้ก๋า จะไปบึงกาฬ"
    PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txt'
    _tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY)
    text_af = _tokenizer.word_tokenize(text)

    return render(
        request,
        "rrddisplay/tokenize.html",
        {'text': text, 'text_af': text_af, 'KammuangDB': KammuangDB},
    )
Example no. 3
    "ห้า": 5,
    "หก": 6,
    "เจ็ด": 7,
    "แปด": 8,
    "เก้า": 9,
}
_powers_of_10 = {
    "สิบ": 10,
    "ร้อย": 100,
    "พัน": 1000,
    "หมื่น": 10000,
    "แสน": 100000,
    # "ล้าน" was excluded as a special case
}
_valid_tokens = set(_digits.keys()) | set(_powers_of_10.keys()) | {"ล้าน"}
_tokenizer = Tokenizer(custom_dict=_valid_tokens)


def thaiword_to_num(word: str) -> int:
    """
    Converts a number spelled out in Thai script into an integer.

    :param str word: number spelled out in Thai script
    :return: corresponding integer value of the input
    :rtype: int

    :Example:
    ::

        from pythainlp.util import thaiword_to_num
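The :Example: block above is cut off in this listing. A minimal usage sketch follows; the expected values in the comments come from ordinary Thai numeral readings, not from the original example:

from pythainlp.util import thaiword_to_num

print(thaiword_to_num("ห้าสิบ"))         # expected 50
print(thaiword_to_num("สองพันสามร้อย"))   # expected 2300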
Example no. 4
    'ท๊อปส์', 'แมคโคร', 'แม็คโคร', 'โลตัส', 'บิ๊กซี', 'bigc', 'golden place',
    'big c', 'ขายไม่ดี', 'แพคคู่', 'ค่าจัดส่ง', 'shelf life', 'พนักงานขายนม',
    'ซื้อประจำ', 'หายาก', 'หาซื้อ', 'ของแถม', 'ราคาสูง', 'น้ำนมโค', 'นมโคแท้',
    'นมแพะ', 'นมโรงเรียน', 'แพ้นม', 'แพ้นมวัว', 'นมอัดเม็ด', 'เล่นเวท',
    'นำ้หนัก', 'คุณแม่มือใหม่', 'นมอุ่น', 'ชานม', 'กินนม', 'ดื่มนม',
    'ท้องเสีย', 'ขี้แตก', 'คุมอาหาร', 'นักวิ่ง', 'ร้านนมสด', 'ดูแลสุขภาพ',
    'คนท้อง', 'มวลกระดูก', 'คีเฟอร์นม', 'พันทิป', 'ร้านนม', 'เหมียวน้อย',
    'ลูกสุนัข', 'ลูกหมา', 'คายทิ้ง', 'เจมส์ จิ', 'เจมส์จิ', 'ณเดช', 'ณเดชน์',
    'สตอรี่', 'อยากสูง', 'ส่วนสูง', 'สูงขึ้น', 'รักษามะเร็ง', 'รักษาเบาหวาน',
    'ไม่มี', 'ไม่ชอบ', 'ไม่ได้', 'ไม่อร่อย', 'ชาไข่มุก', 'ชานมไข่มุก', 'นมข้น',
    'อเมซอน', 'นมเมจิสีฟ้า', 'ทำฟอง', 'ตีฟอง', 'โฟมนม', 'มื้อเช้า',
    'ไขมันทรานส์', 'ดาราเดลี่', 'แดรี่ฟาร์ม', 'แดรี่ควีน'
]
words = set(thai_words()).union(set(custom_list))
_trie = dict_trie(dict_source=words)
_tokenizer = Tokenizer(custom_dict=_trie, engine=_TOKENIZER_ENGINE)

########################################################


def _is_stopword(word: str) -> bool:  # check whether the word is a stopword (filler word)
    return word in thai_stopwords()


def _doc2features(doc, i) -> dict:
    word = doc[i][0]
    postag = doc[i][1]

    # Features from current word
    features = {
        "word.word": word,
Example no. 5
    "เอ็ด": 1,
    # values for the time-unit words
    "โมงเช้า": 6,  # counting starts at 7:00
    "โมงเย็น": 13,
    "บ่าย": 13,
    "บ่ายโมง": 13,
    "ตี": 0,
    "เที่ยงวัน": 12,
    "เที่ยงคืน": 0,
    "เที่ยง": 12,
    "ทุ่ม": 18,
    "นาฬิกา": 0,
    "ครึ่ง": 30,
}
_THAI_TIME_CUT = Tokenizer(
    custom_dict=list(_DICT_THAI_TIME.keys()), engine="newmm"
)


def _format_6h(h: int) -> str:
    """Thai time (6-hour clock)."""
    text = ""

    if h == 0:
        text += "เที่ยงคืน"
    elif h < 7:
        text += "ตี" + num_to_thaiword(h)
    elif h < 12:
        text += num_to_thaiword(h - 6) + "โมงเช้า"
    elif h == 12:
        text += "เที่ยง"
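The excerpt stops before the afternoon and evening branches. As a hypothetical spot-check of the branches that are visible, the same strings can be built directly with pythainlp's num_to_thaiword:

from pythainlp.util import num_to_thaiword

# the h < 7 branch for 3:00 builds "ตี" + num_to_thaiword(3)
print("ตี" + num_to_thaiword(3))           # ตีสาม
# the h < 12 branch for 9:00 builds num_to_thaiword(9 - 6) + "โมงเช้า"
print(num_to_thaiword(9 - 6) + "โมงเช้า")   # สามโมงเช้า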
Example no. 6
    for line in f:
        pos.append(line.rstrip())

with open("neg.txt", 'r') as f:
    for line in f:
        neg.append(line.rstrip())

url = '35213250'
opinions = []
with open(url + ".txt", 'r') as f:
    for line in f:
        opinions.append(line.rstrip())

# the custom dictionary is the union of the positive and negative word lists
mydict = pos + neg

tokenizer = Tokenizer(custom_dict=mydict, engine='newmm')

# count positive and negative dictionary hits in each opinion
for opinion in opinions:
    neg_count = 0
    pos_count = 0
    print(opinion)
    text = tokenizer.word_tokenize(opinion)
    for word in text:
        if word in pos:
            pos_count += 1
        if word in neg:
            neg_count += 1

    if pos_count > neg_count:
        print('Positive')
    elif neg_count > pos_count:
Example no. 7
    "merge_wgts",
    "pre_rules_th",
    "post_rules_th",
    "pre_rules_th_sparse",
    "post_rules_th_sparse",
    "process_thai",
    "_THWIKI_LSTM",
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

_MODEL_NAME_LSTM = "wiki_lm_lstm"
_ITOS_NAME_LSTM = "wiki_itos_lstm"

_THAI2FIT_WORDS = get_corpus("words_th_thai2fit_201810.txt")
_pythainlp_tokenizer = Tokenizer(custom_dict=_THAI2FIT_WORDS, engine="newmm")


# Download pretrained models
def _get_path(fname: str) -> str:
    """
    Get the local path of a file from pythainlp-corpus,
    downloading it first if it is not already present.

    :param str fname: file name
    :return: path to the downloaded file
    """
    path = get_corpus_path(fname)
    if not path:
        download(fname)
        path = get_corpus_path(fname)
    return path
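A short, hypothetical usage sketch for the helper above, resolving (and downloading if necessary) the pretrained thai2fit files named earlier in this snippet:

# hypothetical usage; actual filenames and locations depend on the corpus data
model_path = _get_path(_MODEL_NAME_LSTM)
itos_path = _get_path(_ITOS_NAME_LSTM)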
Example no. 8
:See Also:

Inrut, Jeeragone, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and
Para Limmaneepraserth. "Thai word segmentation using combination of forward
and backward longest matching techniques." In International Symposium on
Communications and Information Technology (ISCIT), pp. 37-40. 2001.
"""
import re
from typing import List

from pythainlp import thai_follow_vowels
from pythainlp.corpus import get_corpus
from pythainlp.tokenize import Tokenizer

_cut_etcc = Tokenizer(get_corpus("etcc.txt"), engine="longest")
_PAT_ENDING_CHAR = f"[{thai_follow_vowels}ๆฯ]"
_RE_ENDING_CHAR = re.compile(_PAT_ENDING_CHAR)


def _cut_subword(tokens: List[str]) -> List[str]:
    # Merge stray single-character ending marks (follow vowels, "ๆ", "ฯ")
    # back into the token that precedes them.
    len_tokens = len(tokens)
    i = 0
    while True:
        if i >= len_tokens:  # ">=" guards against i moving past the end after a deletion
            break
        if _RE_ENDING_CHAR.search(tokens[i]) and i > 0 and len(tokens[i]) == 1:
            tokens[i - 1] += tokens[i]
            del tokens[i]
            len_tokens -= 1
        i += 1
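A small, hypothetical trace of the merge performed by _cut_subword (the function is cut off above, so only the in-place effect on the list is shown):

tokens = ["มาก", "ๆ", "ไป"]
_cut_subword(tokens)
print(tokens)  # ['มากๆ', 'ไป']; the stray "ๆ" was merged into the preceding token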
Example no. 9
# -*- coding: utf-8 -*-
from typing import List
from pythainlp.tokenize import Tokenizer
from laonlp.corpus import lao_words

_word = Tokenizer(lao_words(), engine="mm")


def word_tokenize(sent: str) -> List[str]:
    """
    Lao word tokenizer.

    :param str sent: Lao text
    :return: a list of Lao words
    :rtype: list
    """
    return _word.word_tokenize(sent)


def sent_tokenize(txt: str) -> List[str]:
    """
    Sentence tokenizer.

    Splits Lao text into sentences, naively, on "." characters.

    :param str txt: Lao text
    :return: a list of Lao sentences
    :rtype: list
    """
    return txt.split(".")
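A quick usage sketch for the two helpers above. The word boundaries depend entirely on the laonlp word list, so the word_tokenize output is illustrative only; the sent_tokenize output follows directly from str.split:

print(word_tokenize("ພາສາລາວ"))                # e.g. ['ພາສາ', 'ລາວ'] (depends on lao_words())
print(sent_tokenize("ສະບາຍດີ. ຂ້ອຍຮັກພາສາລາວ."))  # ['ສະບາຍດີ', ' ຂ້ອຍຮັກພາສາລາວ', '']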
Example no. 10
def __init__(self, lang='th'):
    self.lang = lang
    self.pyengine = PyThaiTokenizer(
        os.path.join(github_path, 'words_modified.txt'))
Example no. 11
def test_Tokenizer(self):
    t_test = Tokenizer()
    self.assertEqual(t_test.word_tokenize(""), [])
Example no. 12
    "หก": 6,
    "เจ็ด": 7,
    "แปด": 8,
    "เก้า": 9,
}
_powers_of_10 = {
    "สิบ": 10,
    "ร้อย": 100,
    "พัน": 1000,
    "หมื่น": 10000,
    "แสน": 100000,
    # "ล้าน" was excluded as a special case
}
_valid_tokens = (set(_digits.keys()) | set(_powers_of_10.keys())
                 | {"ล้าน", "ลบ"})
_tokenizer = Tokenizer(custom_dict=_valid_tokens)


def _check_is_thainum(word: str):
    # Classify a word as part of a Thai numeral: 'num' if it contains a digit
    # word, 'unit' if it contains a power-of-ten, decimal-point, or minus word.
    for j in list(_digits.keys()):
        if j in word:
            return (True, 'num')
    for j in ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด", "ลบ"]:
        if j in word:
            return (True, 'unit')
    return (False, None)


_dict_words = [i for i in list(thai_words()) if not _check_is_thainum(i)[0]]
_dict_words += list(_digits.keys())
_dict_words += ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด"]
Example no. 13
def test_Tokenizer(self):
    t_test = Tokenizer(FROZEN_DICT_TRIE)
    self.assertEqual(t_test.word_tokenize(""), [])
    t_test.set_tokenize_engine("longest")
    self.assertEqual(t_test.word_tokenize(None), [])