def cleanup_text(sentences):
    """Normalize raw sentences by masking noisy tokens and stripping symbols.

    Every sentence goes through these steps, in this order:

    1. a hashtag (``#word`` at the start of the string or after whitespace)
       is replaced, together with that leading whitespace, by ``' hastag'``;
    2. e-mail addresses are replaced by ``'email'``;
    3. URLs (the ``http://`` / ``https://`` scheme is optional) are replaced
       by ``' url'``;
    4. every token that contains at least one digit is replaced by a space;
    5. every remaining character that is neither a word character nor a
       plain space is replaced by a space;
    6. the result is passed through ``visen.clean_tone``.

    Args:
        sentences: Iterable of strings to clean.

    Returns:
        A list with one cleaned string per input sentence, in input order.
    """
    # (pattern, replacement) pairs, applied strictly in this order: e-mails
    # must be masked before URLs, and digit tokens before the symbol strip.
    # Built once per call rather than once per sentence.
    substitutions = (
        # 'hastag' (sic) is the placeholder the pipeline has always emitted;
        # it is kept verbatim so existing consumers keep matching it.
        (re.compile(r'(?:^|\s)(\#\w+)'), ' hastag'),
        (re.compile(r'[\w\.-]+@[\w\.-]+'), 'email'),
        (
            re.compile(
                r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
            ),
            ' url',
        ),
        (re.compile(r"\w*\d\w*"), ' '),
        # Raw string on purpose: '\w' inside a plain literal is an invalid
        # escape sequence and triggers a SyntaxWarning on recent Pythons.
        (re.compile(r'[^\w ]'), ' '),
    )

    cleaned_text = []
    for sent in sentences:
        for pattern, replacement in substitutions:
            sent = pattern.sub(replacement, sent)
        cleaned_text.append(visen.clean_tone(sent))
    return cleaned_text
'[ỏ]':'ỏ','[ó]':'ó','[õ]':'õ','[ở]':'ở','[ổ]':'ổ','[ọ]':'ọ','[ỗ]':'ỗ','[ố]':'ố','[ồ]':'ồ','[ớ]':'ớ','[ờ]':'ờ',
'[ự]':'ự','[ụ]':'ụ','[ú]':'ú','[ủ]':'ủ','[ứ]':'ứ','[ử]':'ử',

}


# must use encoding="utf-8" -- 2020-05-22 Japlin Chen
#
# Convert every line of ./WordList.txt into one tab-separated row of
# ./vn_pinyin.txt:
#     <tone-normalized text> TAB <tone-stripped key> TAB 30000
SourceText = ''  # only referenced by the disabled UTF8Char pass noted below

# Both files are opened in a single `with` so that they are closed even when
# a line fails to convert (the input handle used to be left open).
with open('./WordList.txt', 'r', encoding="utf-8") as f, \
        open('./vn_pinyin.txt', 'w', encoding="utf-8") as a:
    for i in f:
        # Disabled steps kept out of the pipeline: the UTF8Char regex
        # normalization pass and the per-word Telex code lookup
        # (visen.get_enter_code).

        # Fix tone position
        Clean_Tone_Text = visen.clean_tone(i)
        # Get pure English characters => PinYin key
        NoToneText = visen.remove_tone(Clean_Tone_Text).strip()
        # Save to file: text, key and the constant third column '30000'
        print(Clean_Tone_Text.strip() + '\t' + NoToneText + '\t' + '30000', file=a)
Пример #3
0
def replace_all(text, dic):
    """Apply every regex substitution in *dic* to *text*, one after another.

    Args:
        text: The string to transform.
        dic: Mapping of regular-expression pattern -> replacement. Entries
            are applied in the mapping's iteration order, and each one
            operates on the output of the previous one.

    Returns:
        The string obtained after all substitutions have been applied.
    """
    result = text
    for pattern, replacement in dic.items():
        # Later patterns see the output of earlier ones, so order matters.
        result = re.compile(pattern).sub(replacement, result)
    return result


# Apply the `abbrevs` substitutions to `s`, then run the normalization steps
# on the result. Every intermediate value keeps its own module-level name.
t = replace_all(s, abbrevs)

print(t)
# Replace a hashtag (and the whitespace in front of it) with a placeholder
new_string = re.sub(r'(?:^|\s)(\#\w+)', ' hastag', t)
# Replace e-mail addresses
email = re.sub(r'[\w\.-]+@[\w\.-]+', 'email', new_string)
# Replace URLs (the http/https scheme is optional)
x = re.sub(
    r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
    ' url', email)
# Collapse any run of the same word character into a single occurrence
z = re.sub(r'(\w)\1+', r'\1', x)
# Replace every token that contains a digit with a space
g = re.sub(r"\w*\d\w*", ' ', z)

# Replace everything that is neither a word character nor a space.
# Raw string on purpose: '\w' inside a plain literal is an invalid escape
# sequence and triggers a SyntaxWarning on recent Pythons.
h = re.sub(r'[^\w ]', ' ', g)
final = visen.clean_tone(h)
# NOTE(review): bare name, evaluated and discarded. Presumably a call to a
# stop-word remover was intended here -- confirm before changing it.
remove_stop_word
print(final)
# Load the word-list workbook and work on whichever sheet is active.
wb = openpyxl.load_workbook('VietnameseWordList.xlsx')
sheet = wb.active    # -- open active sheet name Ex: 'Sheet1'

# State initialized ahead of the column-A loop. last_text, now_chinese and
# rcount are not used in the part of the loop visible here -- presumably
# they are consumed further down; confirm before removing.
last_text = ''
#last_chinese = ''
now_chinese = ''
rcount = 0
# Worksheet row number (passed as row=count to sheet.cell); the loop skips
# count == 1 because row 1 holds the sheet title.
count = 1   
# -- for "A" column 
for column in list(sheet.columns)[0]:

    # -- if A[xxx] have data
    if column.value is not None:
        if count > 1:   # Skip Excel Sheet1 Title
            # -- Fix Tone position
            Clean_Tone_Text = visen.clean_tone(column.value)
            # -- Get pure English Character => PinYin key
            NoToneText = visen.remove_tone(Clean_Tone_Text).strip()
            # -- Get the Telex key for each vietnamese word
            TelexText = ''
            TempTelexText = ''
            for word in Clean_Tone_Text.split(' '):
                # -- 'ươ' telex = 'uwow', now change to 'uow' 
                TempTelexText = visen.get_enter_code(word).replace('uwow', 'uow')
                TelexText += TempTelexText + ' '

            TelexText = TelexText.replace('\n', '').replace('\r', '').strip()

            # -- set Excel C/D/E column value
            sheet.cell(row=count,column=3).value = Clean_Tone_Text
            sheet.cell(row=count,column=4).value = NoToneText