예제 #1
0
파일: q1.py 프로젝트: nazaninsbr/NLP-UT
def perform_sentence_segmentation(data_dict):
    """Split every document in *data_dict* into sentences.

    Parameters
    ----------
    data_dict : dict
        Nested mapping ``folder_name -> file_name -> raw document text``.

    Returns
    -------
    dict
        The same nested ``folder -> file`` structure, with each raw text
        replaced by whatever parsivar's ``Tokenizer.tokenize_sentences``
        returns for it (a sequence of sentence strings).
    """
    from parsivar import Tokenizer  # third-party Persian NLP toolkit

    my_tokenizer = Tokenizer()
    # Iterate the mappings directly via .items() instead of .keys() plus
    # repeated subscripting, and build the result with nested dict
    # comprehensions rather than incremental assignment.
    return {
        folder_name: {
            file_name: my_tokenizer.tokenize_sentences(text)
            for file_name, text in files.items()
        }
        for folder_name, files in data_dict.items()
    }
예제 #2
0
from parsivar import Tokenizer  # third-party Persian NLP toolkit
my_tokenizer = Tokenizer()

# Read the input corpus, one raw line per list element.
# utf-8-sig transparently strips a leading BOM if the file has one.
# NOTE(review): the handle is never closed in the visible code — consider a
# `with` block (unless `f` is reused later in the script; verify).
f = open("Sentence.txt", "r", encoding='utf-8-sig')
_sentences = f.readlines()

# Accumulator — presumably filled from `sents` further down; the rest of the
# script is outside this excerpt, so confirm against the full file.
sentences = []

# Split each raw line into sentences with parsivar; the loop body appears to
# continue beyond this excerpt (`sents` is not yet consumed here).
for _sentence in _sentences:
    sents = my_tokenizer.tokenize_sentences(_sentence)