# These helpers tokenize text via `wt`; assuming `wt` is pythainlp's
# word_tokenize (this import is an assumption, not part of the original).
from pythainlp.tokenize import word_tokenize as wt

def Modify_Data(tokenize_engine):
    # Read pipe-separated start phrases and group them by token count (1-5).
    with open('StartWord.txt', 'r') as f:
        data = f.read()
    Start_Word_Dicts = {1: [], 2: [], 3: [], 4: [], 5: []}
    for entry in data.split("|"):
        tokens = wt(entry, engine=tokenize_engine)
        Start_Word_Dicts[len(tokens)].append(tokens)
    return Start_Word_Dicts
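A minimal usage sketch, assuming StartWord.txt holds pipe-separated start phrases and that pythainlp's 'newmm' engine is the intended tokenizer (both the file contents and the engine name are assumptions):

# Hypothetical call; the sample phrases and engine name are made up for illustration.
with open('StartWord.txt', 'w') as f:
    f.write('hello|good morning')
print(Modify_Data('newmm'))   # e.g. {1: [['hello']], 3: [['good', ' ', 'morning']], ...}

Note that an entry tokenizing into more than five tokens would raise a KeyError, since the dict only defines the keys 1 through 5.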
def Modify_Text(text, Lower_Bound, Upper_Bound, tokenize_engine):
    # Find the token indices of the Lower_Bound-th and Upper_Bound-th "real"
    # words (ignoring whitespace, parentheses and curly quotes), then return
    # the token slice between those two positions.
    tokens = wt(text, engine=tokenize_engine)
    Real_len_Array = []
    Real_len = 0
    Word_len = 0
    for n in tokens:
        if n not in ['\n', ' ', '(', ')', '“', '”']:
            Word_len += 1
            if Word_len in (Lower_Bound, Upper_Bound):
                Real_len_Array.append(Real_len)
        Real_len += 1
    Array_Of_Text = tokens[Real_len_Array[0]:Real_len_Array[1] + 1]
    return Array_Of_Text, Real_len_Array
Example #3
import html
import re
# Assuming `wt` is pythainlp's word_tokenize here as well.
from pythainlp.tokenize import word_tokenize as wt

def clean(text: str) -> list:
    """
    Trim text and tokenize.
    """
    text = html.unescape(text)                    # decode HTML entities
    text = re.sub(r'(\n|\t|\xa0)', ' ', text)     # newlines, tabs, nbsp -> space
    text = re.sub(r'(\r|\u200b)', '', text)       # drop CR and zero-width spaces
    text = re.sub(r'\bhttps?://\S*\b', '', text)  # strip URLs
    text = re.sub(r' +', ' ', text)               # collapse runs of spaces
    text = re.sub(r'[\'\"‘’“”`\)\(]', '', text)   # remove quotes and parentheses
    return wt(text.strip(' '), keep_whitespace=False)
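A hedged usage sketch for clean(); the input string is illustrative and the exact token split depends on the tokenizer's default engine:

raw = 'Hello &amp; welcome\n to https://example.com "sample"  '
print(clean(raw))   # roughly the tokens of 'Hello & welcome to sample'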
def Check_Size_Of_Sentence(text, length, tokenize_engine):
    # True when the tokenized sentence has at most 50 tokens
    # (the `length` parameter is accepted but unused).
    return len(wt(text, engine=tokenize_engine)) <= 50
def Cut_Sentence(text, Fin_Case, Real_len_Array, tokenize_engine):
    # Join the first (Fin_Case[0] + Real_len_Array[0]) tokens back into a string.
    List_Text = wt(text, engine=tokenize_engine)
    Size_Cut = Fin_Case[0] + Real_len_Array[0]
    Output_Sentence = ''.join(List_Text[0:Size_Cut])
    return Output_Sentence, Size_Cut, List_Text
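A sketch of how the helpers above might chain together. Fin_Case is produced elsewhere in the original project, so a placeholder tuple, a made-up sample text, and the 'newmm' engine are used purely for illustration:

# Hypothetical end-to-end usage; every literal value below is an assumption.
text = 'one two three four five six seven eight'
if Check_Size_Of_Sentence(text, 50, 'newmm'):
    sliced, bounds = Modify_Text(text, 3, 6, 'newmm')
    sentence, size_cut, tokens = Cut_Sentence(text, (2,), bounds, 'newmm')
    # sentence is the first Fin_Case[0] + bounds[0] tokens joined back together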