Example #1
    def test_simple_cases(self):
        sentence = u""
        actual = word_sent(sentence)
        expected = []
        self.assertEqual(actual, expected)

        actual = word_sent(sentence, format="text")
        expected = u""
        self.assertEqual(actual, expected)
Example #2
 def test_1(self):
     n_tokens = 0
     for text in self.texts:
         n_tokens += len(tokenize(text).split(" "))
     start = time.time()
     for text in self.texts:
         word_sent(text)
     end = time.time()
     duration = end - start  # in seconds
     speed = n_tokens / duration
     print("Speed: ", speed)
     self.assertGreater(speed, EXPECTED_SPEED)
Example #3
def question_answers(sents):
    """ Divide the dataset into two sets: questions and answers. """
    ques = []
    ans = []
    for i in range(len(sents) - 1):
        if sents[i][-1].endswith("?") and not sents[i+1][0].endswith("?"):
            try:
                ques.append(word_sent(sents[i][-1], format="text").encode("utf-8"))
                ans.append(word_sent(sents[i+1][0], format="text").encode("utf-8"))
            except Exception:
                print(sents[i][-1])
                print(sents[i+1][0])
    return ques, ans
Example #4
 def segmentation(self):
     """
     tách từ từ cơ sở dữ liệu đã được đưa lên
     :return:
     """
     for i in range(len(self.question)):
         result_question = uts.word_sent(self.question[i], format='text')
         self.question[i] = result_question
     for j in range(len(self.question)):
         result_answer = uts.word_sent(self.answer[j], format='text')
         self.answer[j] = result_answer
     for k in range(len(self.question)):
         result_date = uts.word_sent(self.date[k], format='text')
         self.date[k] = result_date
     return self
Example #5
 def tokenize(self, text):
     doc_bow = word_sent(text)
     result = []
     for token in doc_bow:
         result.append(token.lower())
     return result
Example #6
# imports assumed: basename/join from os.path, io, and the underthesea word_sent/pos_tag API used elsewhere in these examples
from os.path import basename, join
import io

from underthesea import pos_tag, word_sent


def auto_annotation(input_file, output_folder="."):
    file_id = basename(input_file).split(".")[0]
    texts = open(input_file).read().strip().decode("utf-8").split("\n")
    content = u"\n".join([u" ".join(word_sent(text)) for text in texts])
    output_text_file = join(output_folder, "%s.txt" % file_id)
    io.open(output_text_file, "w", encoding="utf-8",
            newline="\n").write(content)

    start = 0
    end = 0
    output_annotation_file = join(output_folder, "%s.ann" % file_id)
    ann_file = io.open(output_annotation_file,
                       "w",
                       encoding="utf-8",
                       newline="\n")
    token_id = 1
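    # write one standoff annotation line per token: T<id>\t<tag> <start> <end>\t<word> (brat-style .ann)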
    for text in texts:
        tokens = pos_tag(text)
        for token in tokens:
            word, tag = token
            end = start + len(word)
            ann_file.write(u"T%d\t%s %d %d\t%s\n" %
                           (token_id, tag, start, end, word))
            token_id += 1
            start = end + 1
Example #7
def pos_tag(sentence, format=None):
    """
    Vietnamese POS tagging

    Parameters
    ==========

    sentence: {unicode, str}
        Raw sentence

    Returns
    =======
    tokens: list of tuple with word, pos tag
        tagged sentence
    Examples
    --------
    >>> # -*- coding: utf-8 -*-
    >>> from underthesea import pos_tag
    >>> sentence = "Chợ thịt chó nổi tiếng ở TPHCM bị truy quét"
    >>> pos_tag(sentence)
    [('Chợ', 'N'),
    ('thịt', 'N'),
    ('chó', 'N'),
    ('nổi tiếng', 'A'),
    ('ở', 'E'),
    ('TPHCM', 'Np'),
    ('bị', 'V'),
    ('truy quét', 'V')]
    """
    sentence = word_sent(sentence)
    crf_model = CRFPOSTagPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
Example #8
def word_sent(request):
    result = {}
    try:
        text = json.loads(request.body.decode("utf-8"))["text"]
        tags = uts.word_sent(text)
        result["output"] = tags
    except Exception:
        result = {"error": "Bad request!"}
    return JsonResponse(result)
Example #9
def word_separation(s):
    """
    Word-segment a sentence.
    :return: List
    """
    text = uts.word_sent(s, format='text')  # word segmentation
    # keep the word tokens and return them as a list
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    return tokens  # ['đăng_kí', 'nguyện_vọng', '1', 'như', 'thế_nào', 'Em', 'cảm_ơn', 'ạ']
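
Note on the two word_sent output formats used throughout these examples: the default call returns a list in which a multi-word token keeps its internal space (e.g. 'nổi tiếng' in Example #7), while format="text" returns a single string in which multi-word tokens are joined by underscores (e.g. 'đăng_kí' in the comment above). A minimal sketch; the sample sentence and the outputs in the comments are illustrative only, since the exact segmentation depends on the model:

from underthesea import word_sent

sentence = u"đăng kí nguyện vọng 1 như thế nào"
print(word_sent(sentence))                 # list, e.g. [u'đăng kí', u'nguyện vọng', u'1', u'như', u'thế nào']
print(word_sent(sentence, format="text"))  # string, e.g. u'đăng_kí nguyện_vọng 1 như thế_nào'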
Example #10
  def process_data_json(self, input_file, out_file):
    with open(input_file) as data_file:
      data = json.load(data_file)

    # word-segment and lower-case every text field in place
    for element in data:
      element["text"] = word_sent(element["text"], format="text").lower()

    with open(out_file, 'w', encoding='utf-8') as outfile:
      json.dump(data, outfile, ensure_ascii=False)
Example #11
 def create_freq_words(self, string):
     freq = {}
     list_word = word_sent(string)
     for word in list_word:
         word_id = self.news_dict.load_word_id(word)
         if word_id in freq:
             freq[word_id] += 1
         else:
             freq[word_id] = 1
     return (freq, len(list_word))
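
The counting loop above can be written more compactly with collections.Counter; a sketch only, reusing the same word_sent call and the news_dict.load_word_id lookup this example already assumes:

 def create_freq_words(self, string):
     from collections import Counter
     list_word = word_sent(string)
     # map each word id to the number of times it occurs
     freq = Counter(self.news_dict.load_word_id(word) for word in list_word)
     return (dict(freq), len(list_word))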
Example #12
 def question_parse(self, text):
     tokens = vnltk.word_sent(text)
     result = []
     if '?' in tokens:
         if 'gì' in tokens or 'bao nhiêu' in tokens or 'thế nào' in tokens:
             if 'giá' in tokens or 'giá cả' in tokens:
                 result.append('price')
             if 'tên' in tokens:
                 result.append('name')
             result.append('null')
     return result
Example #13
def pos_tag(sentence, format=None):
    """
    part of speech tagging
    
    :param unicode|str sentence: raw sentence
    :return: tagged sentence 
    :rtype: list 
    """
    sentence = word_sent(sentence)
    crf_model = CRFModel.Instance()
    result = crf_model.predict(sentence, format)
    return result
Example #14
 def parse_string_to_arr_id(self, string_input, cnt):
     arr_id_ret = []
     items = word_sent(string_input)
     with open('./data/stop') as f:
         stop_word = f.readlines()
         stop_word = [x.strip() for x in stop_word]
     items = [word for word in items if word.lower() not in stop_word]
     # load or create word
     for item in items:
         arr_id_ret.append(self.load_or_create_word_id(item))
     print "File", cnt, "done"
     return arr_id_ret
Example #15
 def test_word_sent(self):
     test_dir = join(dirname(__file__), "samples", "accuracy")
     files = listdir(test_dir)
     ids = [f.split(".")[0] for f in files if ".in" in f]
     for id in ids:
         input_file = join(test_dir, "%s.in" % id)
         output_file = join(test_dir, "%s.out" % id)
         sentence = load_input(input_file)
         actual = word_sent(sentence)
         expected = load_output(output_file)
         if actual != expected:
             print("Fail {}".format(id))
             save_temp(id, actual)
         self.assertEqual(actual, expected)
Example #16
    def tokenize(self, text):
        from pyvi.pyvi import ViTokenizer
        from underthesea import word_sent
        # type: (Text) -> List[Token]
        # Vietnamese pyvi
        #tokenizer = ViTokenizer()
        #words = tokenizer.tokenize(text)

        # Vietnamese underthesea
        words = word_sent(text)
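        # note: text.find(word) only returns the first occurrence, so the offsets below can be wrong when a word repeats in the sentence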
        tokenized = [(word, text.find(word), text.find(word) + len(word))
                     for word in words]
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens
Example #17
def handle_mess_2(sender_id, message):
    if message is not None:
        print('mess is not none')
        word_list = word_sent(message)

        find_node(word_list)
        chosen_branch = find_branch(word_list)

        if check_a_perfect_branch(chosen_branch):
            answer = find_answer_in_a_perfect_branch(chosen_branch)
            cbtest.send(sender_id, answer)
        else:
            print('abc')
    else:
        print('message is none')
Example #18
def handle_faq_message(sender_id, message):
    if message is not None:
        print('message la: ', message)
        # check the user; if not already in the database, add them
        check_user = USER.find_one({'id_user': sender_id})
        if bool(check_user):
            # pass
            # page.send(sender_id, "user da co trong database")
            print('user da co trong database')
        else:
            user_profile = page.get_user_profile(sender_id)  # return dict
            if user_profile['first_name'] is not None:

                first = user_profile["first_name"]
                last = user_profile["last_name"]
                id_user = user_profile["id"]
                insert_new_user(first, last, id_user)

        # word segmentation
        word_dict = word_sent(message)
        print('Word Segmentation: ', word_dict)

        chosen_cat = find_cat(sender_id, word_dict, message)
        if chosen_cat != {}:
            print('da tim thay chosen_cat')
            chosen_subcat = find_subcat(sender_id, word_dict, chosen_cat)
            if chosen_subcat != {}:
                print('da tim thay chosen_subcat')
                chosen_qa = find_qa(sender_id, word_dict, chosen_subcat)

                if chosen_qa != {}:
                    print('da tim thay chosen_qa')
                else:
                    print(
                        'tim thay chosen_cat,tim thay chosen_subcat, khong tim thay chosen_qa')
            else:
                print('tim thay chosen_cat, khong tim thay chosen_subcat')
        else:
            print('khong tim thay chosen_cat')
    else:
        print('Message is None')
Example #19
def segment_words(corpus_dir, target_dir):
    try:
        mkdir(target_dir)
    except OSError:
        pass
    corpus = PlainTextCorpus()
    corpus.load(corpus_dir)
    existed_documents = listdir(target_dir)
    n_ignore = 0
    for document in corpus.documents:
        document_id = document.id
        if document_id not in existed_documents:
            print("Process %s" % document_id)
            sentences = document.sentences
            sentences = _.flatten(sentences)
            sentences = [tokenize(s).split(" . ") for s in sentences]
            sentences = _.flatten(sentences)
            segmented_sentences = [word_sent(s) for s in sentences if s not in [u""]]
            content = convert_to_text(segmented_sentences)
            filepath = join(target_dir, document.id)
            io.open(filepath, "w", encoding="utf8").write(content)
        else:
            n_ignore += 1
    print("Ignore %s documents" % n_ignore)
Example #20
 def split_word_sent(self):
     self.word_sent = word_sent(self.str)
     return self.word_sent
Example #21
 def test_wordsent(self):
     text = u"""Tổng thống Nga coi việc Mỹ không kích căn cứ quân sự của Syria là "sự gây hấn nhằm vào một quốc gia có chủ quyền", gây tổn hại đến quan hệ Moscow-Washington."""
     word_sent(text)
Example #22
def pos_tag(sentence, format=None):
    sentence = word_sent(sentence)
    crf_model = CRFPOSTagPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
Example #23
# -*- coding: utf-8 -*-

import underthesea

from Split_world import is_exist

print 'KIỂM LỖI CHÍNH TẢ\n\n'

sentence = u'Grab và xe ôm truyền thống\
 Mọi người cùng bình luận về giá cả, chất lượng 2 dịch vụ này'

_list = underthesea.word_sent(sentence)

for word in _list:
    print word.lower(), is_exist(word.lower().encode('utf-8'))


def get_word(dec):
    r_value = dec.split('\t')
    return r_value[0].strip()


def get_dictionary():

    input_file = open("VDic_uni.txt", "r")
    dic = []
    for line in input_file:
        dic.append(get_word(line))
    return dic

Example #24
 def __sentences_segmentation(self, sent):
     return word_sent(sent)
Example #25
from src.eval import whole_word_position
from underthesea import word_sent, pos_tag, chunk
test = "Đối với các chuyên khoa khác như : Phẩu thuật tổng quát ( nội trú 5 năm ) , sản ( nội trú 5 năm ) , chấn thương chỉnh hình ( nội trú 5 năm ) . Và chuyên sâu mỗi chuyên khoa tầm ( 1 - 3 năm tùy chuyên khoa ) . Nói chung cũng tầm 15 - 16 năm ( cho lầm sàn , chưa kể Ph.D )"
print(word_sent(test))
print(chunk(test))

from underthesea.word_sent.model_crf import CRFModel
Example #26
 def test_decomposed_from(self):
     text = u"yếu"
     actual = word_sent(text)
     expected = [u'yếu']
     self.assertEqual(actual, expected)
Example #27
 def test_special_cases_3(self):
     sentence = u"=))"
     actual = word_sent(sentence)
     expected = ["=))"]
     self.assertEqual(actual, expected)
Example #28
 def test_simple_cases_2(self):
     sentence = u"="
     actual = word_sent(sentence)
     expected = ["="]
     self.assertEqual(actual, expected)
Example #29
 def pre_process(self, sen):
     sen = word_sent(sen, format="text")
     sen = sen.lower()
     sen = self.repace_wrong_tokenize(sen)
     prced_sen = self.remove_stw(sen)
     return prced_sen
Example #30
 def isNextItem(self, text):
     tokens = vnltk.word_sent(text)
     return 'khác' in tokens or text in [
         'tôi không thích cái này', 'không mua', 'không muốn cái này',
         'không thích cái này', 'bỏ qua cái này', 'bỏ qua'
     ]