Пример #1
0
def createNgram(word, gram):
    """Build n-gram strings from the segments of a cleaned word.

    The word is passed through ``word_cleaning`` and then ``segment``; each
    window produced by ``ngrams`` over that result is concatenated into one
    string.

    Args:
        word: Raw input text.
        gram: Window size handed to ``ngrams``.

    Returns:
        A list of joined n-gram strings, in the order ``ngrams`` yields them.
    """
    tokens = segment(word_cleaning(word))
    return [''.join(window) for window in ngrams(tokens, gram)]
Пример #2
0
def romanization(txt):
    """Map every segment of ``txt`` through ``consonant`` and concatenate.

    Args:
        txt: Text to convert.

    Returns:
        The concatenation of ``consonant(piece)`` for each piece returned by
        ``segment(txt)``; an empty string when there are no pieces.
    """
    pieces = segment(txt)  # split the text into a list
    return ''.join(consonant(piece) for piece in pieces)
Пример #3
0
def create_ngram_from_list_bynltk(word_arr, gram):
    """Collect the distinct n-gram strings found across a list of words.

    Each item is cleaned with ``word_cleaning`` and split with ``segment``;
    the windows produced by ``ngrams`` are joined into strings.

    Args:
        word_arr: Iterable of raw words.
        gram: Window size handed to ``ngrams``.

    Returns:
        A de-duplicated list of joined n-gram strings. The order is whatever
        iterating the intermediate ``set`` produces.
    """
    joined = []
    for raw in word_arr:
        tokens = segment(word_cleaning(raw))
        joined.extend(''.join(window) for window in ngrams(tokens, gram))
    return list(set(joined))
Пример #4
0
def nlp_segment(text):
    """Segment space-separated text, keeping unsegmentable chunks verbatim.

    Args:
        text: Input string; it is split on single spaces first.

    Returns:
        A flat list holding the ``segment`` output for every chunk. A chunk
        whose segmentation raises is appended unchanged instead.
    """
    tokens = []
    for chunk in text.split(" "):
        try:
            # The call and the extend both stay inside the ``try`` so that any
            # failure falls back to keeping the chunk as-is.
            tokens.extend(segment(chunk))
        except Exception:
            tokens.append(chunk)
    return tokens
Пример #5
0
 def cutSentence(self, sentence):
     '''Segment a Thai sentence into words.

     input:
         @sentence : text to segment
     output:
         the value returned by ``segment`` on success; ``False`` when
         segmentation or the success log call raises (the exception is
         passed to ``Log().write`` together with ``'Error'``)
     '''
     try:
         words = segment(sentence)
         Log().write('thai language cut words complete.')
         return words
     except Exception as err:
         Log().write(err, 'Error')
         return False
Пример #6
0
def review_to_wordlist(review):
    """Normalize a review/tweet and segment it into words.

    Steps, in order:
      1. Lower-case the text and fold ``é`` to ``e``.
      2. Replace the known Pokemon Go spellings with the single token ``ม่อน``.
      3. Remove the retweet marker ``rt`` and the characters ``!`` and ``?``.
      4. Drop whitespace-separated words containing ``http``, ``@`` or ``#``.
      5. Concatenate the remaining words without separators and hand the
         result to ``segment``.

    Args:
        review: Raw review text.

    Returns:
        Whatever ``segment`` returns for the cleaned text.
    """
    import re  # local import keeps this block self-contained

    text = review.lower().replace(u'é', 'e')
    for synonym in (u'pokemongo', u'โปเกม่อนโก', u'โปเกมอนโก', u'pokemon go'):
        text = text.replace(synonym, u'ม่อน')

    # Strip "rt" only when it is not embedded in an ASCII word. A plain
    # substring replace would also corrupt ordinary words such as "party" or
    # "start". The text is already lower-cased, so [a-z0-9] is sufficient.
    text = re.sub(r'(?<![a-z0-9])rt(?![a-z0-9])', '', text)
    for char in ('!', '?'):
        text = text.replace(char, '')

    # A word is discarded as soon as it contains any of these markers.
    blocked = ('http', '@', '#')
    kept = [word for word in text.split()
            if not any(marker in word for marker in blocked)]

    # Words are joined without spaces before segmentation, as before.
    return segment("".join(kept))
Пример #7
0
	def testSegment(self):
		# segment() must split the sample sentence into these nine words.
		sentence = 'ฉันรักภาษาไทยเพราะฉันเป็นคนไทย'
		actual = segment(sentence)
		expected = [u'ฉัน', u'รัก', u'ภาษา', u'ไทย', u'เพราะ', u'ฉัน', u'เป็น', u'คน', u'ไทย']
		self.assertEqual(actual, expected)
Пример #8
0
def tag(text):
    """Tag each word of a text with a unigram tagger.

    Takes the text as a ``str`` and returns a ``list`` of pairs, e.g.
    ``[('text', 'word class')]``.
    """
    words = segment(text)
    # The tagger is built from ``data1`` only; no backoff tagger is passed.
    unigram_tagger = nltk.tag.UnigramTagger(model=data1)
    return unigram_tagger.tag(words)
Пример #9
0
def tag(text):
    """Segment ``text`` and tag the resulting words.

    Accepts the text as a ``str`` and returns a ``list`` such as
    ``[('text', 'word class')]``.
    """
    tokens = segment(text)
    # Unigram tagger built from ``data1``; no backoff tagger is supplied.
    model_tagger = nltk.tag.UnigramTagger(model=data1)
    tagged = model_tagger.tag(tokens)
    return tagged
Пример #10
0
def word_tokenize(text):
    """Tokenize ``text`` by delegating to ``segment``.

    Returns:
        The value produced by ``segment(text)``, unchanged.
    """
    tokens = segment(text)
    return tokens
Пример #11
0
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Demo script: calls several pythainlp helpers in sequence and prints each
# result. Statement order matters because names are rebound along the way.
from pythainlp.segment import segment
a = 'ฉันรักภาษาไทยเพราะฉันเป็นคนไทย'
# Segment the sample sentence with pythainlp.segment.segment.
b = segment(a)
print(b)
# Rebinds the name `segment` to the implementation in pythainlp.segment.dict.
from pythainlp.segment.dict import segment
print(segment(a))
# `b` still holds the result of the first segment() call.
print(type(b))
from pythainlp.rank import rank
# rank() is given the segmentation result `b`, not the raw string.
aa = rank(b)
print(aa)
from pythainlp.romanization import romanization
# `b` is reused for the romanization result from here on.
b=romanization("แมว")
print(b)
# Star import: texttothai and texttoeng are not imported anywhere else in
# this script, so they presumably come from this module.
from pythainlp.change import *
# NOTE(review): this input looks like text typed with the wrong keyboard
# layout — confirm what texttothai/texttoeng are expected to return.
a="l;ylfu8iy["
a=texttothai(a)
b="นามรสนอำันี"
b=texttoeng(b)
print(a)
print(b)
# Re-import of the name imported earlier (a no-op unless the star import
# above rebound `segment`).
from pythainlp.segment.dict import segment
print(segment('ปีคริสต์ศักราช'))
from pythainlp.number import numtowords
# Print the literal first, then the numtowords() result for the same value.
print("5611116.50")
print(numtowords(5611116.50))

# `tag` is imported but not used in the visible part of this script.
from pythainlp.postaggers import tag
Пример #12
0
# Demo script: calls several pythainlp helpers in sequence and prints each
# result. Statement order matters because names are rebound along the way.
from pythainlp.segment import segment
a = 'ฉันรักภาษาไทยเพราะฉันเป็นคนไทย'
# Segment the sample sentence with pythainlp.segment.segment.
b = segment(a)
print(b)
# Rebinds the name `segment` to the implementation in pythainlp.segment.dict.
from pythainlp.segment.dict import segment
print(segment(a))
# `b` still holds the result of the first segment() call.
print(type(b))
from pythainlp.rank import rank
# NOTE(review): rank() receives the raw string `a` here, not the segmented
# result `b` — confirm which input rank() expects.
aa = rank(a)
print(aa)
from pythainlp.romanization import romanization
# `b` is reused for the romanization result from here on.
b=romanization("ต้นกก")
print(b)
# Star import: texttothai and texttoeng are not imported anywhere else in
# this script, so they presumably come from this module.
from pythainlp.change import *
# NOTE(review): this input looks like text typed with the wrong keyboard
# layout — confirm what texttothai/texttoeng are expected to return.
a="l;ylfu8iy["
a=texttothai(a)
b="นามรสนอำันี"
b=texttoeng(b)
print(a)
print(b)
# Re-import of the name imported earlier (a no-op unless the star import
# above rebound `segment`).
from pythainlp.segment.dict import segment
print(segment('ฉันรักคุณ'))
from pythainlp.number import numtowords
# Print the literal first, then the numtowords() result for the same value.
print("5611116.50")
print(numtowords(5611116.50))
Пример #13
0
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Demo script: calls several pythainlp helpers in sequence and prints each
# result. Statement order matters because names are rebound along the way.
from pythainlp.segment import segment
a = 'ฉันรักภาษาไทยเพราะฉันเป็นคนไทย'
# Segment the sample sentence with pythainlp.segment.segment.
b = segment(a)
print(b)
# Rebinds the name `segment` to the implementation in pythainlp.segment.dict.
from pythainlp.segment.dict import segment
print(segment(a))
# `b` still holds the result of the first segment() call.
print(type(b))
from pythainlp.rank import rank
# rank() is given the segmentation result `b`, not the raw string.
aa = rank(b)
print(aa)
from pythainlp.romanization import romanization
# `b` is reused for the romanization result from here on.
b = romanization("แมว")
print(b)
# Star import: texttothai and texttoeng are not imported anywhere else in
# this script, so they presumably come from this module.
from pythainlp.change import *
# NOTE(review): this input looks like text typed with the wrong keyboard
# layout — confirm what texttothai/texttoeng are expected to return.
a = "l;ylfu8iy["
a = texttothai(a)
b = "นามรสนอำันี"
b = texttoeng(b)
print(a)
print(b)
# Re-import of the name imported earlier (a no-op unless the star import
# above rebound `segment`).
from pythainlp.segment.dict import segment
print(segment('ปีคริสต์ศักราช'))
from pythainlp.number import numtowords
# Print the literal first, then the numtowords() result for the same value.
print("5611116.50")
print(numtowords(5611116.50))

# `tag` is imported but not used in the visible part of this script.
from pythainlp.postaggers import tag
Пример #14
0
# Minimal usage example: segment one Thai sentence and print the result.
from pythainlp.segment import segment
a = 'ฉันรักภาษาไทยเพราะฉันเป็นคนไทยและฉันใช้ภาษาไทย'
# `b` holds whatever segment() returns for the sentence.
b = segment(a)
print(b)