Example #1
    def test_simple_cases(self):
        sentence = u""
        actual = word_sent(sentence)
        expected = []
        self.assertEqual(actual, expected)

        actual = word_sent(sentence, format="text")
        expected = u""
        self.assertEqual(actual, expected)
Example #2
 def test_1(self):
     n_tokens = 0
     for text in self.texts:
         n_tokens += len(tokenize(text).split(" "))
     start = time.time()
     for text in self.texts:
         word_sent(text)
     end = time.time()
     duration = end - start  # in seconds
     speed = n_tokens / duration
     print("Speed: ", speed)
     self.assertGreater(speed, EXPECTED_SPEED)
Example #3
def question_answers(sents):
    """ Divide the dataset into two sets: questions and answers. """
    ques = []
    ans = []
    for i in range(len(sents) - 1):
        if sents[i][-1].endswith("?") and not sents[i+1][0].endswith("?"):
            try:
                ques.append(word_sent(sents[i][-1], format="text").encode("utf-8"))
                ans.append(word_sent(sents[i+1][0], format="text").encode("utf-8"))
            except Exception:
                print(sents[i][-1])
                print(sents[i+1][0])
    return ques, ans
Example #4
 def segmentation(self):
     """
     tách từ từ cơ sở dữ liệu đã được đưa lên
     :return:
     """
     for i in range(len(self.question)):
         result_question = uts.word_sent(self.question[i], format='text')
         self.question[i] = result_question
     for j in range(len(self.question)):
         result_answer = uts.word_sent(self.answer[j], format='text')
         self.answer[j] = result_answer
     for k in range(len(self.question)):
         result_date = uts.word_sent(self.date[k], format='text')
         self.date[k] = result_date
     return self
Example #5
 def tokenize(self, text):
     doc_bow = word_sent(text)
     result = []
     for token in doc_bow:
         result.append(token.lower())
     return result
Example #6
# imports assumed: basename/join from os.path, io, and the underthesea word_sent/pos_tag API used elsewhere in these examples
from os.path import basename, join
import io

from underthesea import pos_tag, word_sent


def auto_annotation(input_file, output_folder="."):
    file_id = basename(input_file).split(".")[0]
    texts = open(input_file).read().strip().decode("utf-8").split("\n")
    content = u"\n".join([u" ".join(word_sent(text)) for text in texts])
    output_text_file = join(output_folder, "%s.txt" % file_id)
    io.open(output_text_file, "w", encoding="utf-8",
            newline="\n").write(content)

    start = 0
    end = 0
    output_annotation_file = join(output_folder, "%s.ann" % file_id)
    ann_file = io.open(output_annotation_file,
                       "w",
                       encoding="utf-8",
                       newline="\n")
    token_id = 1
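    # write one standoff annotation line per token: T<id>\t<tag> <start> <end>\t<word> (brat-style .ann)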
    for text in texts:
        tokens = pos_tag(text)
        for token in tokens:
            word, tag = token
            end = start + len(word)
            ann_file.write(u"T%d\t%s %d %d\t%s\n" %
                           (token_id, tag, start, end, word))
            token_id += 1
            start = end + 1
Example #7
def pos_tag(sentence, format=None):
    """
    Vietnamese POS tagging

    Parameters
    ==========

    sentence: {unicode, str}
        Raw sentence

    Returns
    =======
    tokens: list of tuple with word, pos tag
        tagged sentence
    Examples
    --------
    >>> # -*- coding: utf-8 -*-
    >>> from underthesea import pos_tag
    >>> sentence = "Chợ thịt chó nổi tiếng ở TPHCM bị truy quét"
    >>> pos_tag(sentence)
    [('Chợ', 'N'),
    ('thịt', 'N'),
    ('chó', 'N'),
    ('nổi tiếng', 'A'),
    ('ở', 'E'),
    ('TPHCM', 'Np'),
    ('bị', 'V'),
    ('truy quét', 'V')]
    """
    sentence = word_sent(sentence)
    crf_model = CRFPOSTagPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
Example #8
def word_sent(request):
    result = {}
    try:
        text = json.loads(request.body.decode("utf-8"))["text"]
        tags = uts.word_sent(text)
        result["output"] = tags
    except Exception:
        result = {"error": "Bad request!"}
    return JsonResponse(result)
Example #9
def word_separation(s):
    """
    Word-segment a sentence.
    :return: List
    """
    text = uts.word_sent(s, format='text')  # word segmentation
    # keep the word tokens and return them as a list
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    return tokens  # ['đăng_kí', 'nguyện_vọng', '1', 'như', 'thế_nào', 'Em', 'cảm_ơn', 'ạ']
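
Note on the two word_sent output formats used throughout these examples: the default call returns a list in which a multi-word token keeps its internal space (e.g. 'nổi tiếng' in Example #7), while format="text" returns a single string in which multi-word tokens are joined by underscores (e.g. 'đăng_kí' in the comment above). A minimal sketch; the sample sentence and the outputs in the comments are illustrative only, since the exact segmentation depends on the model:

from underthesea import word_sent

sentence = u"đăng kí nguyện vọng 1 như thế nào"
print(word_sent(sentence))                 # list, e.g. [u'đăng kí', u'nguyện vọng', u'1', u'như', u'thế nào']
print(word_sent(sentence, format="text"))  # string, e.g. u'đăng_kí nguyện_vọng 1 như thế_nào'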
Example #10
  def process_data_json(self, input_file, out_file):
    with open(input_file) as data_file:
      data = json.load(data_file)

    # word-segment and lower-case every text field in place
    for element in data:
      element["text"] = word_sent(element["text"], format="text").lower()

    with open(out_file, 'w', encoding='utf-8') as outfile:
      json.dump(data, outfile, ensure_ascii=False)
Example #11
 def create_freq_words(self, string):
     freq = {}
     list_word = word_sent(string)
     for word in list_word:
         word_id = self.news_dict.load_word_id(word)
         if word_id in freq:
             freq[word_id] += 1
         else:
             freq[word_id] = 1
     return (freq, len(list_word))
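
The counting loop above can be written more compactly with collections.Counter; a sketch only, reusing the same word_sent call and the news_dict.load_word_id lookup this example already assumes:

 def create_freq_words(self, string):
     from collections import Counter
     list_word = word_sent(string)
     # map each word id to the number of times it occurs
     freq = Counter(self.news_dict.load_word_id(word) for word in list_word)
     return (dict(freq), len(list_word))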
Example #12
 def question_parse(self, text):
     tokens = vnltk.word_sent(text)
     result = []
     if '?' in tokens:
         if 'gì' in tokens or 'bao nhiêu' in tokens or 'thế nào' in tokens:
             if 'giá' in tokens or 'giá cả' in tokens:
                 result.append('price')
             if 'tên' in tokens:
                 result.append('name')
             result.append('null')
     return result
Example #13
def pos_tag(sentence, format=None):
    """
    part of speech tagging
    
    :param unicode|str sentence: raw sentence
    :return: tagged sentence 
    :rtype: list 
    """
    sentence = word_sent(sentence)
    crf_model = CRFModel.Instance()
    result = crf_model.predict(sentence, format)
    return result
Example #14
 def parse_string_to_arr_id(self, string_input, cnt):
     arr_id_ret = []
     items = word_sent(string_input)
     with open('./data/stop') as f:
         stop_word = f.readlines()
         stop_word = [x.strip() for x in stop_word]
     items = [word for word in items if word.lower() not in stop_word]
     # load or create word
     for item in items:
         arr_id_ret.append(self.load_or_create_word_id(item))
     print "File", cnt, "done"
     return arr_id_ret
Example #15
 def test_word_sent(self):
     test_dir = join(dirname(__file__), "samples", "accuracy")
     files = listdir(test_dir)
     ids = [f.split(".")[0] for f in files if ".in" in f]
     for id in ids:
         input_file = join(test_dir, "%s.in" % id)
         output_file = join(test_dir, "%s.out" % id)
         sentence = load_input(input_file)
         actual = word_sent(sentence)
         expected = load_output(output_file)
         if actual != expected:
             print("Fail {}".format(id))
             save_temp(id, actual)
         self.assertEqual(actual, expected)
Example #16
    def tokenize(self, text):
        from pyvi.pyvi import ViTokenizer
        from underthesea import word_sent
        # type: (Text) -> List[Token]
        # Vietnamese pyvi
        #tokenizer = ViTokenizer()
        #words = tokenizer.tokenize(text)

        # Vietnamese underthesea
        words = word_sent(text)
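        # note: text.find(word) only returns the first occurrence, so the offsets below can be wrong when a word repeats in the sentence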
        tokenized = [(word, text.find(word), text.find(word) + len(word))
                     for word in words]
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens
Example #17
def handle_mess_2(sender_id, message):
    if message is not None:
        print('mess is not none')
        word_list = word_sent(message)

        find_node(word_list)
        chosen_branch = find_branch(word_list)

        if check_a_perfect_branch(chosen_branch):
            answer = find_answer_in_a_perfect_branch(chosen_branch)
            cbtest.send(sender_id, answer)
        else:
            print('abc')
    else:
        print('message is none')
Example #18
def handle_faq_message(sender_id, message):
    if message is not None:
        print('message la: ', message)
        # check the user; if not already in the database, add them
        check_user = USER.find_one({'id_user': sender_id})
        if bool(check_user):
            # pass
            # page.send(sender_id, "user da co trong database")
            print('user da co trong database')
        else:
            user_profile = page.get_user_profile(sender_id)  # return dict
            if user_profile['first_name'] is not None:

                first = user_profile["first_name"]
                last = user_profile["last_name"]
                id_user = user_profile["id"]
                insert_new_user(first, last, id_user)

        # word segmentation
        word_dict = word_sent(message)
        print('Word Segmentation: ', word_dict)

        chosen_cat = find_cat(sender_id, word_dict, message)
        if chosen_cat != {}:
            print('da tim thay chosen_cat')
            chosen_subcat = find_subcat(sender_id, word_dict, chosen_cat)
            if chosen_subcat != {}:
                print('da tim thay chosen_subcat')
                chosen_qa = find_qa(sender_id, word_dict, chosen_subcat)

                if chosen_qa != {}:
                    print('da tim thay chosen_qa')
                else:
                    print(
                        'tim thay chosen_cat,tim thay chosen_subcat, khong tim thay chosen_qa')
            else:
                print('tim thay chosen_cat, khong tim thay chosen_subcat')
        else:
            print('khong tim thay chosen_cat')
    else:
        print('Message is None')
Example #19
def segment_words(corpus_dir, target_dir):
    try:
        mkdir(target_dir)
    except OSError:
        pass
    corpus = PlainTextCorpus()
    corpus.load(corpus_dir)
    existed_documents = listdir(target_dir)
    n_ignore = 0
    for document in corpus.documents:
        document_id = document.id
        if document_id not in existed_documents:
            print("Process %s" % document_id)
            sentences = document.sentences
            sentences = _.flatten(sentences)
            sentences = [tokenize(s).split(" . ") for s in sentences]
            sentences = _.flatten(sentences)
            segmented_sentences = [word_sent(s) for s in sentences if s not in [u""]]
            content = convert_to_text(segmented_sentences)
            filepath = join(target_dir, document.id)
            io.open(filepath, "w", encoding="utf8").write(content)
        else:
            n_ignore += 1
    print("Ignore %s documents" % n_ignore)
Example #20
 def split_word_sent(self):
     self.word_sent = word_sent(self.str)
     return self.word_sent
Example #21
 def test_wordsent(self):
     text = u"""Tổng thống Nga coi việc Mỹ không kích căn cứ quân sự của Syria là "sự gây hấn nhằm vào một quốc gia có chủ quyền", gây tổn hại đến quan hệ Moscow-Washington."""
     word_sent(text)
Example #22
def pos_tag(sentence, format=None):
    sentence = word_sent(sentence)
    crf_model = CRFPOSTagPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
Example #23
# -*- coding: utf-8 -*-

import underthesea

from Split_world import is_exist

print 'KIỂM LỖI CHÍNH TẢ\n\n'

sentence = u'Grab và xe ôm truyền thống\
 Mọi người cùng bình luận về giá cả, chất lượng 2 dịch vụ này'

_list = underthesea.word_sent(sentence)

for word in _list:
    print word.lower(), is_exist(word.lower().encode('utf-8'))


def get_word(dec):
    r_value = dec.split('\t')
    return r_value[0].strip()


def get_dictionary():

    input_file = open("VDic_uni.txt", "r")
    dic = []
    for line in input_file:
        dic.append(get_word(line))
    return dic

Example #24
 def __sentences_segmentation(self, sent):
     return word_sent(sent)
Example #25
from src.eval import whole_word_position
from underthesea import word_sent, pos_tag, chunk
test = "Đối với các chuyên khoa khác như : Phẩu thuật tổng quát ( nội trú 5 năm ) , sản ( nội trú 5 năm ) , chấn thương chỉnh hình ( nội trú 5 năm ) . Và chuyên sâu mỗi chuyên khoa tầm ( 1 - 3 năm tùy chuyên khoa ) . Nói chung cũng tầm 15 - 16 năm ( cho lầm sàn , chưa kể Ph.D )"
print(word_sent(test))
print(chunk(test))

from underthesea.word_sent.model_crf import CRFModel
Example #26
 def test_decomposed_from(self):
     text = u"yếu"
     actual = word_sent(text)
     expected = [u'yếu']
     self.assertEqual(actual, expected)
Example #27
 def test_special_cases_3(self):
     sentence = u"=))"
     actual = word_sent(sentence)
     expected = ["=))"]
     self.assertEqual(actual, expected)
Example #28
 def test_simple_cases_2(self):
     sentence = u"="
     actual = word_sent(sentence)
     expected = ["="]
     self.assertEqual(actual, expected)
Example #29
 def pre_process(self, sen):
     sen = word_sent(sen, format="text")
     sen = sen.lower()
     sen = self.repace_wrong_tokenize(sen)
     prced_sen = self.remove_stw(sen)
     return prced_sen
Example #30
 def isNextItem(self, text):
     tokens = vnltk.word_sent(text)
     return 'khác' in tokens or text in [
         'tôi không thích cái này', 'không mua', 'không muốn cái này',
         'không thích cái này', 'bỏ qua cái này', 'bỏ qua'
     ]