def perform_sentence_segmentation(data_dict):
    from parsivar import Tokenizer
    my_tokenizer = Tokenizer()
    return_value = {}
    # Walk the nested folder -> file -> raw-text structure and replace each
    # text with the list of sentences produced by Parsivar's tokenizer.
    for folder_name in data_dict.keys():
        return_value[folder_name] = {}
        for file_name in data_dict[folder_name].keys():
            tmp_text = data_dict[folder_name][file_name]
            token_text = my_tokenizer.tokenize_sentences(tmp_text)
            return_value[folder_name][file_name] = token_text
    return return_value
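# A minimal usage sketch (hypothetical folder and file names, assuming
# data_dict maps folder names to {file name: raw Persian text}):
sample = {"news": {"doc1.txt": "این جمله اول است. این جمله دوم است."}}
segmented = perform_sentence_segmentation(sample)
print(segmented["news"]["doc1.txt"])  # expected: a list of two sentences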
from parsivar import Tokenizer

my_tokenizer = Tokenizer()
# Read the raw input line by line (utf-8-sig strips a leading BOM if
# present) and split each line into sentences, accumulating a flat list.
with open("Sentence.txt", "r", encoding='utf-8-sig') as f:
    _sentences = f.readlines()
sentences = []
for _sentence in _sentences:
    sents = my_tokenizer.tokenize_sentences(_sentence)
    sentences.extend(sents)
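# Quick sanity check on the collected sentences (illustrative only):
print(len(sentences), "sentences found")
print(sentences[:3])  # first few segmented sentences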