예제 #1
0
def review_to_wordlist(review_text):
    """Clean a raw review/tweet string and tokenize it.

    The text goes through these steps, in order:

    1. ``emoji_to_text`` and ``abbreviation_to_text`` (project helpers).
    2. Every run of ``@mention`` tokens is collapsed into one ``@USER``
       placeholder.
    3. Runs of spaces are squeezed to a single space and the ends stripped.
    4. ``ekphrasis_config`` (project helper) is applied.
    5. Every character other than ASCII letters, digits, ``@``, ``&`` and
       ``:`` is replaced by a space.
    6. ``stanford_tokenizer`` (project helper) splits the result.

    Args:
        review_text: The raw text to normalize.

    Returns:
        Whatever ``stanford_tokenizer`` returns for the cleaned text
        (presumably a list of word tokens -- confirm against the helper).
    """
    review_text = emoji_to_text(review_text)
    review_text = abbreviation_to_text(review_text)

    # The pattern swallows the space that follows each mention, so the
    # placeholder must be padded; a bare "@USER" would be glued onto the next
    # word ("@bob hi" -> "@USERhi").  The surplus spaces are squeezed below.
    # Note that a mention is only matched when a space follows it.
    review_text = re.sub(r"(@[\w]*\ )+", " @USER ", review_text)

    # Collapse repeated spaces and trim both ends.
    review_text = re.sub(r"\ +", " ", review_text).strip()

    review_text = ekphrasis_config(review_text)

    # str() is kept from the original: ekphrasis_config may hand back a
    # non-string (TODO confirm); the substitution below then also blanks out
    # the brackets/quotes/commas of its string representation.
    review_text = re.sub(r"[^a-zA-Z0-9\@\&\:]", " ", str(review_text))

    return stanford_tokenizer(review_text)
def review_to_wordlist(review_text):
    """Clean a raw review/tweet string and tokenize it.

    The text goes through these steps, in order:

    1. Each of ``. ? ! , "`` is padded with spaces, and a run of the same
       mark is collapsed to a single occurrence.
    2. Each emoji in ``TWEMOJI_LIST`` is replaced by ``TWEMOJI[emoji][0]``
       padded with spaces; a run of the same emoji (optionally separated by
       spaces) is collapsed to a single replacement.
    3. ``emoji_to_text`` and ``abbreviation_to_text`` (project helpers).
    4. Every run of ``@mention`` tokens becomes one ``@USER`` placeholder.
    5. Runs of spaces are squeezed and the ends stripped.
    6. ``ekphrasis_config`` (project helper) is applied.
    7. Every character other than ASCII letters, digits, ``@``, ``&`` and
       ``:`` is replaced by a space.
    8. ``stanford_tokenizer`` (project helper) splits the result.

    Args:
        review_text: The raw text to normalize.

    Returns:
        Whatever ``stanford_tokenizer`` returns for the cleaned text
        (presumably a list of word tokens -- confirm against the helper).
    """
    # Each pass must work on the output of the previous one.  The earlier
    # code re-split the untouched input on every iteration, so only the last
    # punctuation mark (and, below, only the last emoji) had any effect.
    #
    # Empty pieces are dropped before re-joining, which collapses "!!!" into
    # one "!" -- and, as before, drops a run that sits at the very start or
    # end of the text ("hi!!!" -> "hi").
    for mark in ('.', '?', '!', ',', '"'):
        pieces = [piece for piece in review_text.split(mark) if piece != '']
        review_text = (' ' + mark + ' ').join(pieces)

    for emoji_meta in TWEMOJI_LIST:
        # Pieces that are empty or contain only spaces are what lies between
        # repeated emoji; dropping them merges the repeats into one.
        pieces = [
            piece for piece in review_text.split(emoji_meta)
            if piece.strip(' ') != ''
        ]
        review_text = (' ' + TWEMOJI[emoji_meta][0] + ' ').join(pieces)

    review_text = emoji_to_text(review_text)
    review_text = abbreviation_to_text(review_text)

    # The pattern consumes the space after each mention, hence the padded
    # placeholder; a mention is only matched when a space follows it.
    review_text = re.sub(r"(@[\w]*\ )+", " @USER ", review_text)

    # Collapse repeated spaces and trim both ends.
    review_text = re.sub(r'\ +', ' ', review_text).strip()

    review_text = ekphrasis_config(review_text)

    # str() is kept from the original: ekphrasis_config may hand back a
    # non-string (TODO confirm); the substitution below then also blanks out
    # the brackets/quotes/commas of its string representation.
    review_text = re.sub(r"[^a-zA-Z0-9\@\&\:]", " ", str(review_text))

    return stanford_tokenizer(review_text)
def review_to_wordlist(review_text):
    """Clean a raw review/tweet string and tokenize it.

    The text goes through these steps, in order:

    1. Each of ``. ? ! , "`` is padded with spaces, and a run of the same
       mark is collapsed to a single occurrence.
    2. Each emoji in ``TWEMOJI_LIST`` is replaced by ``TWEMOJI[emoji][0]``
       padded with spaces; a run of the same emoji is collapsed.
    3. ``emoji_to_text`` (project helper).
    4. Every run of ``@mention`` tokens becomes one ``@USER`` placeholder,
       then runs of spaces are squeezed and the ends stripped.
    5. A fixed list of regex rules normalizes elongated / slang words
       ("thaaanks" -> "thanks", "u" -> "you", ...).
    6. ``abbreviation_to_text`` (project helper).
    7. A fixed list of literal replacements expands contractions and typos
       ("dont" -> "do not", ...).
    8. Whole-word ``LOGOGRAM`` keys are replaced by their values.
    9. ``ekphrasis_config`` (project helper) produces a token list; tokens
       found in ``EMOTICONS_TOKEN`` or ``LOGOGRAM`` are mapped.
    10. The tokens are re-joined, every character other than ASCII letters,
        digits, ``@``, ``&`` and ``:`` is replaced by a space, and
        ``stanford_tokenizer`` (project helper) splits the result.

    Args:
        review_text: The raw text to normalize.

    Returns:
        Whatever ``stanford_tokenizer`` returns for the cleaned text
        (presumably a list of word tokens -- confirm against the helper).
    """
    # Each pass must work on the output of the previous one.  The earlier
    # code re-split the untouched input on every iteration, so only the last
    # punctuation mark (and, below, only the last emoji) had any effect.
    #
    # Empty pieces are dropped before re-joining, which collapses "!!!" into
    # one "!" -- and, as before, drops a run that sits at the very start or
    # end of the text ("hi!!!" -> "hi").
    for mark in ('.', '?', '!', ',', '"'):
        pieces = [piece for piece in review_text.split(mark) if piece != '']
        review_text = (' ' + mark + ' ').join(pieces)

    for emoji_meta in TWEMOJI_LIST:
        # Pieces that are empty or contain only spaces are what lies between
        # repeated emoji; dropping them merges the repeats into one.
        pieces = [
            piece for piece in review_text.split(emoji_meta)
            if piece.strip(' ') != ''
        ]
        review_text = (' ' + TWEMOJI[emoji_meta][0] + ' ').join(pieces)

    review_text = emoji_to_text(review_text)

    # The pattern consumes the space after each mention, hence the padded
    # placeholder; a mention is only matched when a space follows it.
    review_text = re.sub(r"(@[\w]*\ )+", " @USER ", review_text)

    # Collapse repeated spaces and trim both ends.
    review_text = re.sub(r'\ +', ' ', review_text).strip()

    # Regex rules, applied strictly in this order.
    # NOTE(review): most patterns are not anchored on the left, so they also
    # fire inside longer words (e.g. "no+ " matches the tail of "piano ").
    # Left as-is because later rules may rely on it -- confirm before
    # tightening.
    regex_rules = (
        (r"tha+nks ", ' thanks '),
        (r"Tha+nks ", ' Thanks '),
        (r"yes+ ", ' yes '),
        (r"Yes+ ", ' Yes '),
        (r"very+ ", ' very '),
        (r"go+d ", ' good '),
        (r"Very+ ", ' Very '),
        (r"why+ ", ' why '),
        (r"wha+t ", ' what '),
        (r"sil+y ", ' silly '),
        (r"hm+ ", ' hmm '),
        (r"no+ ", ' no '),
        (r"sor+y ", ' sorry '),
        (r"so+ ", ' so '),
        (r"lie+ ", ' lie '),
        (r"okay+ ", ' okay '),
        # The pattern eats the spaces on both sides, so the replacement has
        # to put them back (it used to glue the neighbouring words together).
        (r' lol[a-z]+ ', ' laugh out loud '),
        (r' wow+ ', ' wow '),
        (r'wha+ ', ' what '),
        (r' ok[a-z]+ ', ' ok '),
        (r' u+ ', ' you '),
        (r' wellso+n ', ' well soon '),
        (r' byy+ ', ' bye '),
        # NOTE(review): matches any "<char>l<space>s<char>" sequence (also
        # inside e.g. "will stop").  It looks like a repair for "also" being
        # split by the "so+ " rule above -- confirm.
        (r"(\wl\ss\w)+", ' also '),
        (r"(\sbro$)+", " brother "),
        (r"\stv", " Television "),
    )
    for pattern, replacement in regex_rules:
        review_text = re.sub(pattern, replacement, review_text)

    review_text = abbreviation_to_text(review_text)

    # Literal replacements, applied strictly in this order.  Where a search
    # string includes a surrounding space, the replacement now restores it;
    # several entries used to drop it and merge adjacent words
    # ("and i'm ok" -> "andi am ok", "x Its y" -> "x It isy").
    literal_rules = (
        ('whats ', 'what is '),
        (" i'm ", ' i am '),
        ("it's ", 'it is '),
        ('Iam ', 'I am '),
        (' iam ', ' i am '),
        (' dnt ', ' do not '),
        ('I ve ', 'I have '),
        # NOTE(review): "I'am" is kept verbatim from the original; the
        # apostrophe is blanked out by the final substitution anyway.
        (' I m ', ' I\'am '),
        (' i m ', ' i\'m '),
        (' Iam ', ' I am '),
        # Second pass for " iam ": catches occurrences that shared a space
        # with an earlier match and were therefore skipped the first time.
        (' iam ', ' i am '),
        ('dont ', 'do not '),
        ('google.co.in ', ' google '),
        (' hve ', ' have '),
        (' F ', ' F**k '),
        ('Ain\'t ', ' are not '),
        (' lv ', ' love '),
        (' ok~~ay~~ ', ' okay '),
        (' Its ', ' It is '),
        (' its ', ' it is '),
        ('  Nd  ', ' and '),
        (' nd ', ' and '),
        ('i ll ', 'i will '),
    )
    string = review_text
    for old, new in literal_rules:
        string = string.replace(old, new)

    # Leading space so that a logogram at the very start of the text is
    # still surrounded by spaces and can match the whole-word pattern.
    string = ' ' + string
    for item in LOGOGRAM:
        string = string.replace(' ' + item + ' ', ' ' + LOGOGRAM[item] + ' ')

    # ekphrasis_config is used here as returning a mutable token sequence.
    tokens = ekphrasis_config(string)

    # Emoticon tokens: keep the mapped value minus its first and last
    # character (presumably the surrounding "<" and ">" -- TODO confirm).
    for index, token in enumerate(tokens):
        if token in EMOTICONS_TOKEN:
            tokens[index] = EMOTICONS_TOKEN[token][1:-1]

    # Two identical passes, as in the original; the second one only matters
    # when a LOGOGRAM value is itself a LOGOGRAM key.
    for _ in range(2):
        for index, token in enumerate(tokens):
            if token in LOGOGRAM:
                tokens[index] = LOGOGRAM[token]

    string = ' '.join(tokens)

    # Keep only ASCII letters, digits, '@', '&' and ':'.
    review_text = re.sub(r"[^a-zA-Z0-9\@\&\:]", " ", string)

    return stanford_tokenizer(review_text)