import html
import re

# Tokenizer import is an assumption: the calls below (engine=..., keep_whitespace=...)
# match the signature of PyThaiNLP's word_tokenize.
from pythainlp.tokenize import word_tokenize as wt


def Modify_Data(tokenize_engine):
    """Load start words from StartWord.txt and bucket them by token length (1-5)."""
    with open('StartWord.txt', 'r', encoding='utf-8') as y:
        Data = y.read()
    Data_Split = Data.split("|")
    Start_Word_Dicts = {1: [], 2: [], 3: [], 4: [], 5: []}
    for n in Data_Split:
        # Tokenize each start phrase once and file it under its token count.
        tokens = wt(n, engine=tokenize_engine)
        Start_Word_Dicts[len(tokens)].append(tokens)
    return Start_Word_Dicts
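# StartWord.txt layout (illustrative, the phrases below are made up): start
# phrases separated by '|', each expected to tokenize to 1-5 words with the
# chosen engine, e.g.
#   คำแรก|สองคำเริ่ม|สามคำขึ้นต้น
# A phrase outside the 1-5 token range would raise a KeyError on the fixed buckets.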
def Modify_Text(text, Lower_Bound, Upper_Bound, tokenize_engine):
    """Slice the token list of `text` between the Lower_Bound-th and Upper_Bound-th words.

    Word positions are counted while skipping whitespace, newlines, parentheses
    and curly quotes, but the returned indices refer to the full token list
    (skipped tokens included), so they can be used to slice it directly.
    """
    tokens = wt(text, engine=tokenize_engine)
    Real_len_Array = []
    Real_len = 0   # index into the full token list
    Word_len = 0   # running count of "real" word tokens only
    for n in tokens:
        if n not in ['\n', ' ', '(', ')', '“', '”']:
            Word_len += 1
            if Word_len == Lower_Bound:
                Real_len_Array.append(Real_len)
            elif Word_len == Upper_Bound:
                Real_len_Array.append(Real_len)
        Real_len += 1
    Array_Of_Text = tokens[Real_len_Array[0]:Real_len_Array[1] + 1]
    return Array_Of_Text, Real_len_Array
def clean(text: str) -> list:
    """Trim text and tokenize."""
    text = html.unescape(text)                       # decode HTML entities
    text = re.sub(r'(\n|\t|\xa0)', ' ', text)        # newlines/tabs/nbsp -> space
    text = re.sub(r'(\r|\u200b)', '', text)          # drop carriage returns and zero-width spaces
    text = re.sub(r'\bhttps?://\S*\b', '', text)     # strip URLs
    text = re.sub(r' +', ' ', text)                  # collapse repeated spaces
    text = re.sub(r'[\'\"‘’“”`\)\(]', '', text)      # remove quotes and parentheses
    return wt(text.strip(' '), keep_whitespace=False)
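# Example (illustrative, assuming `wt` is PyThaiNLP's word_tokenize, so the exact
# token boundaries depend on the default engine):
#   clean('A&amp;B  https://example.com\n"ทดสอบ"')
#   -> tokens of 'A&B ทดสอบ' with the URL, quotes and extra whitespace stripped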
def Check_Size_Of_Sentence(text, length, tokenize_engine):
    """Return True if the text tokenizes to at most 50 tokens.

    Note: the `length` parameter is currently unused; the 50-token limit
    is hard-coded.
    """
    return len(wt(text, engine=tokenize_engine)) <= 50
def Cut_Sentence(text, Fin_Case, Real_len_Array, tokenize_engine):
    """Cut `text` at token index Fin_Case[0] + Real_len_Array[0] and rejoin the prefix."""
    List_Text = wt(text, engine=tokenize_engine)
    Size_Cut = Fin_Case[0] + Real_len_Array[0]   # absolute cut position in the token list
    Output_Sentence = ''.join(List_Text[0:Size_Cut])
    return Output_Sentence, Size_Cut, List_Text
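# Minimal usage sketch (not part of the original module). The engine name
# 'newmm', the sample Thai sentence and the Fin_Case value [2] are illustrative
# assumptions; Modify_Data is left out because it needs a local StartWord.txt.
if __name__ == "__main__":
    engine = 'newmm'
    raw = 'ตัวอย่าง "ข้อความ" สำหรับ ทดสอบ\nการ ตัด ประโยค'
    text = ' '.join(clean(raw))          # clean, then re-join tokens with spaces for the demo
    if Check_Size_Of_Sentence(text, 50, engine):
        span_tokens, real_len = Modify_Text(text, 1, 3, engine)
        sentence, size_cut, _ = Cut_Sentence(text, [2], real_len, engine)
        print(span_tokens)
        print(sentence, size_cut)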