def review_to_wordlist(review_text):
    """Normalize a raw text string and return it tokenized.

    The text is passed, in order, through ``emoji_to_text``,
    ``abbreviation_to_text``, a user-mention anonymizer, a space squeezer,
    ``ekphrasis_config``, a character whitelist filter and finally
    ``stanford_tokenizer`` (all helpers are defined elsewhere in the file).

    NOTE(review): ``review_to_wordlist`` is defined again further down in
    this file; at import time the later definition replaces this one.

    Args:
        review_text: the raw text to normalize.

    Returns:
        Whatever ``stanford_tokenizer`` returns for the cleaned text
        (presumably a list of tokens -- confirm against that helper).
    """
    text = emoji_to_text(review_text)
    text = abbreviation_to_text(text)

    # Collapse every run of "@name " mentions into one placeholder.  The
    # pattern consumes the space that follows the mention, so the
    # placeholder is padded with spaces; a bare "@USER" would be glued to
    # the next word ("@bob hello" -> "@USERhello").  The padding is
    # squeezed/stripped again on the next line.
    # NOTE(review): a mention at the very end of the text has no trailing
    # space and is therefore not matched by this pattern.
    text = re.sub(r"(@[\w]*\ )+", " @USER ", text)
    text = re.sub(r"\ +", " ", text).strip()

    text = ekphrasis_config(text)

    # Keep only ASCII letters, digits and the characters @ & : -- anything
    # else becomes a space.  str() is kept from the original because the
    # return type of ekphrasis_config is not visible from here.
    text = re.sub(r"[^a-zA-Z0-9\@\&\:]", " ", str(text))

    return stanford_tokenizer(text)
def review_to_wordlist(review_text):
    """Normalize a raw text string and return it tokenized.

    Processing order:

    1. Runs of each of the marks ``. ? ! , "`` are collapsed to a single
       mark surrounded by spaces.
    2. Runs of each emoji listed in ``TWEMOJI_LIST`` (optionally separated
       by single spaces) are collapsed to ``TWEMOJI[emoji][0]`` surrounded
       by spaces.
    3. ``emoji_to_text`` and ``abbreviation_to_text`` are applied.
    4. Runs of ``@name `` mentions become a single ``@USER`` placeholder
       and repeated spaces are squeezed.
    5. ``ekphrasis_config`` is applied, every character outside
       ``[a-zA-Z0-9@&:]`` is replaced by a space, and the result is handed
       to ``stanford_tokenizer``.

    NOTE(review): ``review_to_wordlist`` is defined more than once in this
    file; at import time the last definition wins.

    Args:
        review_text: the raw text to normalize.

    Returns:
        Whatever ``stanford_tokenizer`` returns for the cleaned text
        (presumably a list of tokens -- confirm against that helper).
    """
    text = review_text

    # Each step works on the result of the previous one.  The earlier
    # version re-split the untouched input on every iteration, so only the
    # last mark in the list had any effect.
    for mark in ('.', '?', '!', ',', '"'):
        if mark in text:
            text = re.sub('(?:' + re.escape(mark) + ')+',
                          ' ' + mark + ' ', text)

    # Same chaining fix for emoji: every listed emoji is handled, not just
    # the last entry of TWEMOJI_LIST.  A regex is used instead of
    # split/filter/join so that an emoji at the start or end of the text is
    # replaced rather than silently dropped.
    for emoji_meta in TWEMOJI_LIST:
        if emoji_meta not in text:
            continue  # also avoids compiling a pattern per unused emoji
        spaced_name = ' ' + TWEMOJI[emoji_meta][0] + ' '
        emoji_run = '(?:' + re.escape(emoji_meta) + r'\ ?)+'
        # A callable replacement keeps any backslash in the name literal.
        text = re.sub(emoji_run, lambda _match: spaced_name, text)

    text = emoji_to_text(text)
    text = abbreviation_to_text(text)

    # Collapse runs of "@name " mentions; padding is squeezed right after.
    text = re.sub(r"(@[\w]*\ )+", " @USER ", text)
    text = re.sub(r"\ +", " ", text).strip()

    text = ekphrasis_config(text)

    # str() is kept from the original because the return type of
    # ekphrasis_config is not visible from here.
    text = re.sub(r"[^a-zA-Z0-9\@\&\:]", " ", str(text))

    return stanford_tokenizer(text)
# Ordered (regex, replacement) pairs that undo letter elongation and a few
# chat spellings.  They are applied sequentially, so order matters.
# NOTE(review): most patterns are not anchored at a word start, so e.g.
# "no+ " also fires on the tail of longer words, and r"(\wl\ss\w)+" fires
# on any "?l s?" sequence, not only on a split "al so".  Left unchanged
# because downstream consumers may depend on the current output -- revisit.
_ELONGATION_SUBS = (
    (r"tha+nks ", " thanks "),
    (r"Tha+nks ", " Thanks "),
    (r"yes+ ", " yes "),
    (r"Yes+ ", " Yes "),
    (r"very+ ", " very "),
    (r"go+d ", " good "),
    (r"Very+ ", " Very "),
    (r"why+ ", " why "),
    (r"wha+t ", " what "),
    (r"sil+y ", " silly "),
    (r"hm+ ", " hmm "),
    (r"no+ ", " no "),
    (r"sor+y ", " sorry "),
    (r"so+ ", " so "),
    (r"lie+ ", " lie "),
    (r"okay+ ", " okay "),
    # Replacement is padded: the pattern consumes both neighbouring spaces,
    # so an unpadded replacement glued the phrase to the adjacent words.
    (r" lol[a-z]+ ", " laugh out loud "),
    (r" wow+ ", " wow "),
    (r"wha+ ", " what "),
    (r" ok[a-z]+ ", " ok "),
    (r" u+ ", " you "),
    (r" wellso+n ", " well soon "),
    (r" byy+ ", " bye "),
    (r"(\wl\ss\w)+", " also "),
    (r"(\sbro$)+", " brother "),
    (r"\stv", " Television "),
)

# Ordered literal (old, new) replacements applied with str.replace after
# abbreviation_to_text.  Where the search text starts or ends with a space,
# the replacement now restores that space so neighbouring words are not
# fused together.
_PHRASE_REPLACEMENTS = (
    ('whats ', 'what is '),
    (" i'm ", ' i am '),
    ("it's ", 'it is '),
    ('Iam ', 'I am '),
    (' iam ', ' i am '),
    (' dnt ', ' do not '),
    ('I ve ', 'I have '),
    (' I m ', " I'am "),
    (' i m ', " i'm "),
    (' Iam ', ' I am '),
    (' iam ', ' i am '),
    ('dont ', 'do not '),
    ('google.co.in ', ' google '),
    (' hve ', ' have '),
    (' F ', ' F**k '),
    ("Ain't ", ' are not '),
    (' lv ', ' love '),
    (' ok~~ay~~ ', ' okay '),
    (' Its ', ' It is '),
    (' its ', ' it is '),
    (' Nd ', ' and '),
    (' nd ', ' and '),
    ('i ll ', 'i will '),
)


def review_to_wordlist(review_text):
    """Normalize a raw text string and return it tokenized.

    Processing order:

    1. Runs of each of the marks ``. ? ! , "`` are collapsed to a single
       mark surrounded by spaces.
    2. Runs of each emoji listed in ``TWEMOJI_LIST`` (optionally separated
       by single spaces) are collapsed to ``TWEMOJI[emoji][0]`` surrounded
       by spaces.
    3. ``emoji_to_text`` is applied, runs of ``@name `` mentions become a
       single ``@USER`` placeholder and repeated spaces are squeezed.
    4. The ``_ELONGATION_SUBS`` regex table is applied in order.
    5. ``abbreviation_to_text`` is applied, followed by the literal
       ``_PHRASE_REPLACEMENTS`` table and a space-delimited ``LOGOGRAM``
       expansion.
    6. ``ekphrasis_config`` splits the text into tokens; tokens found in
       ``EMOTICONS_TOKEN`` are replaced by the mapped value without its
       first and last character, and tokens found in ``LOGOGRAM`` are
       expanded (two passes).
    7. Every character outside ``[a-zA-Z0-9@&:]`` is replaced by a space
       and the result is handed to ``stanford_tokenizer``.

    NOTE(review): ``review_to_wordlist`` is defined more than once in this
    file; at import time the last definition wins.

    Args:
        review_text: the raw text to normalize.

    Returns:
        Whatever ``stanford_tokenizer`` returns for the cleaned text
        (presumably a list of tokens -- confirm against that helper).
    """
    text = review_text

    # Each step works on the result of the previous one.  The earlier
    # version re-split the untouched input on every iteration, so only the
    # last mark in the list had any effect.
    for mark in ('.', '?', '!', ',', '"'):
        if mark in text:
            text = re.sub('(?:' + re.escape(mark) + ')+',
                          ' ' + mark + ' ', text)

    # Same chaining fix for emoji: every listed emoji is handled, not just
    # the last entry of TWEMOJI_LIST.  A regex is used instead of
    # split/filter/join so that an emoji at the start or end of the text is
    # replaced rather than silently dropped.
    for emoji_meta in TWEMOJI_LIST:
        if emoji_meta not in text:
            continue  # also avoids compiling a pattern per unused emoji
        spaced_name = ' ' + TWEMOJI[emoji_meta][0] + ' '
        emoji_run = '(?:' + re.escape(emoji_meta) + r'\ ?)+'
        # A callable replacement keeps any backslash in the name literal.
        text = re.sub(emoji_run, lambda _match: spaced_name, text)

    text = emoji_to_text(text)

    # Collapse runs of "@name " mentions; padding is squeezed right after.
    text = re.sub(r"(@[\w]*\ )+", " @USER ", text)
    text = re.sub(r"\ +", " ", text).strip()

    for pattern, replacement in _ELONGATION_SUBS:
        text = re.sub(pattern, replacement, text)

    text = abbreviation_to_text(text)

    for old, new in _PHRASE_REPLACEMENTS:
        text = text.replace(old, new)

    # Leading space so a logogram at the very start of the text can match
    # the space-delimited search below.
    text = ' ' + text
    for short_form, expansion in LOGOGRAM.items():
        text = text.replace(' ' + short_form + ' ', ' ' + expansion + ' ')

    tokens = ekphrasis_config(text)

    # [1:-1] drops the first and last character of the mapped value
    # (presumably the delimiters of a "<tag>"-style token -- verify against
    # EMOTICONS_TOKEN).
    tokens = [
        EMOTICONS_TOKEN[token][1:-1] if token in EMOTICONS_TOKEN else token
        for token in tokens
    ]

    # Two passes, as before: an expansion that is itself a LOGOGRAM key is
    # expanded once more.
    for _ in range(2):
        tokens = [
            LOGOGRAM[token] if token in LOGOGRAM else token
            for token in tokens
        ]

    text = ' '.join(tokens)
    text = re.sub(r"[^a-zA-Z0-9\@\&\:]", " ", text)

    return stanford_tokenizer(text)