Example #1
 def test_word_tokenize_longest(self):
     self.assertEqual(longest.segment(None), [])
     self.assertEqual(longest.segment(""), [])
     self.assertIsInstance(longest.segment("กรุงเทพฯมากๆเพราโพาง BKKฯ"),
                           list)
     self.assertEqual(
         word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"),
         ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
     )
     longest_tokenizer = Tokenizer(["ปวด", "เฉียบ", "พลัน", "เฉียบพลัน"])
     self.assertEqual(
         longest_tokenizer.word_tokenize("ปวดเฉียบพลัน"),
         ["ปวด", "เฉียบพลัน"],
     )
     self.assertEqual(
         longest_tokenizer.word_tokenize("เฉียบพลัน"),
         ["เฉียบพลัน"],
     )
Example #2
def tokenize(request):
    import csv
    KammuangDB = list()
    with open('./KammuangDB.csv', 'rt') as f:
        data = csv.reader(f)
        for row in data:
            KammuangDB.append(row)
    # return KammuangDB
    
    from pythainlp.corpus.common import thai_words
    from pythainlp.tokenize import Tokenizer

    text = "ขอน้ำบะดาย อู้บ่าดาย อู้เล่นบะได้ก๋า จะไปบึงกาฬ"
    PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txt'
    _tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY)
    text_af = _tokenizer.word_tokenize(text)
    # return HttpResponse("E %s" %_tokenizer.word_tokenize(text))
    # def index(request):
    # testvar = 'value'
    # return render(request, 'template.html', {'testvar': testvar})
    
    return render(
        request,
        "rrddisplay/tokenize.html",
        {'text': text, 'text_af': text_af, 'KammuangDB': KammuangDB},
    )
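The view above loads KammuangDB.csv and imports thai_words, but the tokenizer is built only from custom_dictionary.txt. If the CSV's first column holds the dialect words, they could be merged with the standard word list instead; a minimal sketch under that assumption (the view name tokenize_csv is illustrative, and Tokenizer also accepts an iterable of words as custom_dict):

import csv

from django.shortcuts import render
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import Tokenizer


def tokenize_csv(request):
    # Sketch only: assumes one dialect word per CSV row, in the first column.
    with open('./KammuangDB.csv', 'rt') as f:
        kammuang_words = {row[0] for row in csv.reader(f) if row}

    # Merge the dialect words with pythainlp's default word list so that
    # standard Thai words still segment normally.
    _tokenizer = Tokenizer(custom_dict=set(thai_words()) | kammuang_words)
    text = "ขอน้ำบะดาย อู้บ่าดาย อู้เล่นบะได้ก๋า จะไปบึงกาฬ"
    text_af = _tokenizer.word_tokenize(text)
    return render(request, "rrddisplay/tokenize.html",
                  {'text': text, 'text_af': text_af,
                   'KammuangDB': sorted(kammuang_words)})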
Example #3
    "ห้า": 5,
    "หก": 6,
    "เจ็ด": 7,
    "แปด": 8,
    "เก้า": 9,
}
_powers_of_10 = {
    "สิบ": 10,
    "ร้อย": 100,
    "พัน": 1000,
    "หมื่น": 10000,
    "แสน": 100000,
    # "ล้าน" was excluded as a special case
}
_valid_tokens = set(_digits.keys()) | set(_powers_of_10.keys()) | {"ล้าน"}
_tokenizer = Tokenizer(custom_dict=_valid_tokens)
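# A minimal sketch of what the tokenizer above is for, assuming the default
# newmm engine: a spelled-out number is split into the digit and power-of-ten
# tokens that the conversion below combines, e.g.
#
#     _tokenizer.word_tokenize("ห้าสิบ")   # -> ["ห้า", "สิบ"]   (5 * 10 = 50)
#     _tokenizer.word_tokenize("แปดร้อย")  # -> ["แปด", "ร้อย"]  (8 * 100 = 800)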


def thaiword_to_num(word: str) -> int:
    """
    Converts spelled-out numerals in Thai script into an integer.

    :param str word: spelled-out numerals in Thai script
    :return: Corresponding integer value of the input
    :rtype: int

    :Example:
    ::

        from pythainlp.util import thaiword_to_num
Example #4
    'ท๊อปส์', 'แมคโคร', 'แม็คโคร', 'โลตัส', 'บิ๊กซี', 'bigc', 'golden place',
    'big c', 'ขายไม่ดี', 'แพคคู่', 'ค่าจัดส่ง', 'shelf life', 'พนักงานขายนม',
    'ซื้อประจำ', 'หายาก', 'หาซื้อ', 'ของแถม', 'ราคาสูง', 'น้ำนมโค', 'นมโคแท้',
    'นมแพะ', 'นมโรงเรียน', 'แพ้นม', 'แพ้นมวัว', 'นมอัดเม็ด', 'เล่นเวท',
    'นำ้หนัก', 'คุณแม่มือใหม่', 'นมอุ่น', 'ชานม', 'กินนม', 'ดื่มนม',
    'ท้องเสีย', 'ขี้แตก', 'คุมอาหาร', 'นักวิ่ง', 'ร้านนมสด', 'ดูแลสุขภาพ',
    'คนท้อง', 'มวลกระดูก', 'คีเฟอร์นม', 'พันทิป', 'ร้านนม', 'เหมียวน้อย',
    'ลูกสุนัข', 'ลูกหมา', 'คายทิ้ง', 'เจมส์ จิ', 'เจมส์จิ', 'ณเดช', 'ณเดชน์',
    'สตอรี่', 'อยากสูง', 'ส่วนสูง', 'สูงขึ้น', 'รักษามะเร็ง', 'รักษาเบาหวาน',
    'ไม่มี', 'ไม่ชอบ', 'ไม่ได้', 'ไม่อร่อย', 'ชาไข่มุก', 'ชานมไข่มุก', 'นมข้น',
    'อเมซอน', 'นมเมจิสีฟ้า', 'ทำฟอง', 'ตีฟอง', 'โฟมนม', 'มื้อเช้า',
    'ไขมันทรานส์', 'ดาราเดลี่', 'แดรี่ฟาร์ม', 'แดรี่ควีน'
]
words = set(thai_words()).union(set(custom_list))
_trie = dict_trie(dict_source=words)
_tokenizer = Tokenizer(custom_dict=_trie, engine=_TOKENIZER_ENGINE)

########################################################


def _is_stopword(word: str) -> bool:  # check whether the word is a Thai stopword
    return word in thai_stopwords()
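# Illustrative checks, assuming pythainlp's stopword list contains common
# function words such as "และ" ("and"):
#
#     _is_stopword("และ")      # -> True
#     _is_stopword("ภาษาไทย")  # -> False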


def _doc2features(doc, i) -> dict:
    word = doc[i][0]
    postag = doc[i][1]

    # Features from current word
    features = {
        "word.word": word,
Example #5
    "เอ็ด": 1,
    # base hour values for the time-of-day unit words
    "โมงเช้า": 6,  # counting starts at 7:00
    "โมงเย็น": 13,
    "บ่าย": 13,
    "บ่ายโมง": 13,
    "ตี": 0,
    "เที่ยงวัน": 12,
    "เที่ยงคืน": 0,
    "เที่ยง": 12,
    "ทุ่ม": 18,
    "นาฬิกา": 0,
    "ครึ่ง": 30,
}
_THAI_TIME_CUT = Tokenizer(
    custom_dict=list(_DICT_THAI_TIME.keys()), engine="newmm"
)


def _format_6h(h: int) -> str:
    """Thai time (6-hour clock)."""
    text = ""

    if h == 0:
        text += "เที่ยงคืน"
    elif h < 7:
        text += "ตี" + num_to_thaiword(h)
    elif h < 12:
        text += num_to_thaiword(h - 6) + "โมงเช้า"
    elif h == 12:
        text += "เที่ยง"
Example #6
    for line in f:
        pos.append(line.rstrip())

with open("neg.txt", 'r') as f:
    for line in f:
        neg.append(line.rstrip())

url = '35213250'
opinions = []
with open(url + ".txt", 'r') as f:
    for line in f:
        opinions.append(line.rstrip())

mydict = pos + neg

tokenizer = Tokenizer(custom_dict=mydict, engine='newmm')

for opinion in opinions:
    neg_count = 0
    pos_count = 0
    print(opinion)
    text = tokenizer.word_tokenize(opinion)
    for word in text:
        if word in pos:
            pos_count = pos_count + 1
        if word in neg:
            neg_count = neg_count + 1

    if pos_count > neg_count:
        print('Positive')
    elif neg_count > pos_count:
        print('Negative')
Example #7
    "merge_wgts",
    "pre_rules_th",
    "post_rules_th",
    "pre_rules_th_sparse",
    "post_rules_th_sparse",
    "process_thai",
    "_THWIKI_LSTM",
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

_MODEL_NAME_LSTM = "wiki_lm_lstm"
_ITOS_NAME_LSTM = "wiki_itos_lstm"

_THAI2FIT_WORDS = get_corpus("words_th_thai2fit_201810.txt")
_pythainlp_tokenizer = Tokenizer(custom_dict=_THAI2FIT_WORDS, engine="newmm")


# Download pretrained models
def _get_path(fname: str) -> str:
    """
    Get the local path of a file from pythainlp-corpus,
    downloading the file first if it is not already cached.

    :param str fname: file name
    :return: path to the downloaded file
    """
    path = get_corpus_path(fname)
    if not path:
        download(fname)
        path = get_corpus_path(fname)
    return path
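A minimal usage sketch with the corpus names defined above; each file is downloaded on first use and read from the local cache afterwards:

model_path = _get_path(_MODEL_NAME_LSTM)  # thai2fit wiki language-model weights
itos_path = _get_path(_ITOS_NAME_LSTM)    # matching itos (index-to-string) vocabulary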
Example #8
File: etcc.py  Project: totaeza31/pythainlp
:See Also:

Inrut, Jeeragone, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and
Para Limmaneepraserth. "Thai word segmentation using combination of forward
and backward longest matching techniques." In International Symposium on
Communications and Information Technology (ISCIT), pp. 37-40. 2001.
"""
import re
from typing import List

from pythainlp import thai_follow_vowels
from pythainlp.corpus import get_corpus
from pythainlp.tokenize import Tokenizer

_cut_etcc = Tokenizer(get_corpus("etcc.txt"), engine="longest")
_PAT_ENDING_CHAR = f"[{thai_follow_vowels}ๆฯ]"
_RE_ENDING_CHAR = re.compile(_PAT_ENDING_CHAR)


def _cut_subword(tokens: List[str]) -> List[str]:
    len_tokens = len(tokens)
    i = 0
    while True:
        if i >= len_tokens:  # ">=": a merge at the end of the list can push i past len_tokens
            break
        if _RE_ENDING_CHAR.search(tokens[i]) and i > 0 and len(tokens[i]) == 1:
            tokens[i - 1] += tokens[i]
            del tokens[i]
            len_tokens -= 1
        i += 1
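With the loop above, a single-character "ๆ", "ฯ", or follow-vowel token is folded into the token before it, mutating the list in place; a small illustration:

tokens = ["กรุงเทพ", "ฯ", "มาก", "ๆ"]
_cut_subword(tokens)
# tokens is now ["กรุงเทพฯ", "มากๆ"]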
Example #9
# -*- coding: utf-8 -*-
from typing import List
from pythainlp.tokenize import Tokenizer
from laonlp.corpus import lao_words

_word = Tokenizer(lao_words(), engine="mm")


def word_tokenize(sent: str) -> List[str]:
    """
    Tokenize Lao text into words.

    :param str sent: Lao text
    :return: list of Lao words
    :rtype: list
    """
    return _word.word_tokenize(sent)


def sent_tokenize(txt: str) -> List[str]:
    """
    Sentence tokenizer.

    Split Lao text into sentences (splits on the "." character).

    :param str txt: Lao text
    :return: list of Lao sentences
    :rtype: list
    """
    return txt.split(".")
Example #10
 def __init__(self, lang='th'):
     self.lang = lang
     self.pyengine = PyThaiTokenizer(
         os.path.join(github_path, 'words_modified.txt'))
Example #11
 def test_Tokenizer(self):
     t_test = Tokenizer()
     self.assertEqual(t_test.word_tokenize(""), [])
Example #12
    "หก": 6,
    "เจ็ด": 7,
    "แปด": 8,
    "เก้า": 9,
}
_powers_of_10 = {
    "สิบ": 10,
    "ร้อย": 100,
    "พัน": 1000,
    "หมื่น": 10000,
    "แสน": 100000,
    # "ล้าน" was excluded as a special case
}
_valid_tokens = (set(_digits.keys()) | set(_powers_of_10.keys())
                 | {"ล้าน", "ลบ"})
_tokenizer = Tokenizer(custom_dict=_valid_tokens)


def _check_is_thainum(word: str):
    for j in list(_digits.keys()):
        if j in word:
            return (True, 'num')
    for j in ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด", "ลบ"]:
        if j in word:
            return (True, 'unit')
    return (False, None)


_dict_words = [i for i in list(thai_words()) if not _check_is_thainum(i)[0]]
_dict_words += list(_digits.keys())
_dict_words += ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด"]
Example #13
 def test_Tokenizer(self):
     t_test = Tokenizer(FROZEN_DICT_TRIE)
     self.assertEqual(t_test.word_tokenize(""), [])
     t_test.set_tokenize_engine("longest")
     self.assertEqual(t_test.word_tokenize(None), [])