def clean_str(string):
    """Tokenization/string cleaning for the dataset.

    Maps emoticons/emoji glyphs to sentiment words, expands common
    misspellings and contractions, strips non-ASCII characters, expands
    slang via the LOGOGRAM table, and lower-cases the result.

    :param string: raw input text
    :return: cleaned, lower-cased, single-spaced text
    """
    # --- emoticon / emoji glyphs -> sentiment words -------------------
    string = string.replace(':)', ' smile ').replace(':-)', ' smile ') \
        .replace(':D', ' smile ').replace('=)', ' smile ') \
        .replace('😄', ' smile ').replace('☺', ' smile ')
    string = string.replace('❤', ' like ').replace('<3', ' like ') \
        .replace('💕', ' like ').replace('😍', ' like ')
    # NOTE: the original also mapped ':-)' to ' happy ' here, but ':-)'
    # is already consumed by the ' smile ' mapping above — dead code, removed.
    string = string.replace('🤗', ' happy ')
    # FIX: '😕' previously expanded to 'unhappy ' (no leading space),
    # gluing the token onto the preceding word.
    string = string.replace(':(', ' unhappy ').replace(':-(', ' unhappy ') \
        .replace('💔', ' unhappy ').replace('😕', ' unhappy ') \
        .replace('😤', ' unhappy ')
    string = string.replace('😡', ' anger ').replace('🙃', ' anger ')
    string = string.replace('😞', ' sadness ').replace('😓', ' sadness ') \
        .replace('😔', ' sadness ')
    string = string.replace(';-;', ' unhappy ')
    # Normalize curly apostrophes; drop double quotes.
    string = string.replace('’', '\'').replace('"', ' ')
    # --- misspellings / contractions ----------------------------------
    # FIX: several replacements dropped a boundary space ('what is',
    # 'i am', 'It is') and merged with the neighbouring word.  Extra
    # spaces introduced here are collapsed by the split/join below.
    string = string.replace('whats ', 'what is ')
    string = string.replace('Iam ', 'I am ').replace(' iam ', ' i am ') \
        .replace(' dnt ', ' do not ')
    # FIX: 'I m ' used to expand to the non-word "I'am ".
    string = string.replace('I ve ', 'I have ').replace('I m ', 'I am ') \
        .replace('i m ', 'i\'m ')
    # 'Iam ' was already handled above (the original repeated it — dead
    # code, removed); 'iam ' without a leading space still needs a pass
    # for occurrences at the start of the string.
    string = string.replace('iam ', 'i am ')
    string = string.replace('dont ', 'do not ') \
        .replace('google.co.in ', 'google').replace('hve ', 'have ')
    string = string.replace(' F ', ' F**k ').replace('Ain\'t ', ' are not ') \
        .replace(' lv ', ' love ')
    string = string.replace(' ok~~ay~~ ', ' okay ') \
        .replace(' Its ', ' It is ').replace(' its ', ' it is ')
    string = string.replace(' Nd ', ' and ').replace(' nd ', ' and ')
    # NOTE(review): 'Thnx' -> 'Thanx' maps one slang spelling to another;
    # presumably LOGOGRAM expands 'Thanx' downstream — confirm.
    string = string.replace('Thnx ', ' Thanx ').replace('[#TRIGGERWORD#]', '')
    # delete non-ASCII characters
    string = re.sub('[^\x00-\x7f]', ' ', string)
    # Slang expansion, case-sensitive pass.
    string = ' '.join(LOGOGRAM.get(word, word) for word in string.split())
    string = string.lower()
    # Second pass after lower-casing so lower-case LOGOGRAM keys match too.
    string = ' '.join(LOGOGRAM.get(word, word) for word in string.split())
    return string
def processemoji(text):
    """Normalize 'user'/'hashtag' markers and emoji in *text*.

    Splits on each marker, drops empty fragments, and re-joins with the
    marker padded by spaces; then replaces each emoji in TWEMOJI_LIST by
    its first TWEMOJI description, expands LOGOGRAM slang, converts
    emoticon tokens via EMOTICONS_TOKEN, demojizes, and strips any
    remaining ':name:' codes.  Returns the lower-cased result.
    """
    repeatedChars = ['user', 'hashtag']
    for c in repeatedChars:
        lineSplit = text.split(c)
        # Remove every '' fragment; remove() raises ValueError when none
        # are left, which exits the loop.
        while True:
            try:
                lineSplit.remove('')
            except:
                break
        cSpace = ' ' + c + ' '
        text = cSpace.join(lineSplit)
    emoji_repeatedChars = TWEMOJI_LIST
    for emoji_meta in emoji_repeatedChars:
        emoji_lineSplit = text.split(emoji_meta)
        # NOTE(review): this loop is order-dependent — it breaks on the
        # FIRST remove() that fails, so later whitespace fragments may
        # survive.  Behavior kept as-is.
        while True:
            try:
                emoji_lineSplit.remove('')
                emoji_lineSplit.remove(' ')
                emoji_lineSplit.remove(' ')
                emoji_lineSplit = [x for x in emoji_lineSplit if x != '']
            except:
                break
        # Replace the emoji with its first textual description, padded.
        emoji_cSpace = ' ' + TWEMOJI[emoji_meta][0] + ' '
        text = emoji_cSpace.join(emoji_lineSplit)
    # Expand slang tokens that appear space-delimited in the raw text.
    for item in LOGOGRAM.keys():
        text = text.replace(' ' + item + ' ', ' ' + LOGOGRAM[item].lower() + ' ')
        # print(item)
    # Tokenize, then map emoticons (stripping the token's surrounding
    # delimiter characters) and slang once more on the token list.
    list_str = ekphrasis_config(text)
    for index in range(len(list_str)):
        if list_str[index] in EMOTICONS_TOKEN.keys():
            list_str[index] = EMOTICONS_TOKEN[
                list_str[index]][1:len(EMOTICONS_TOKEN[list_str[index]]) - 1].lower()
    for index in range(len(list_str)):
        if list_str[index] in LOGOGRAM.keys():
            # print("kkk",list_str[index])
            list_str[index] = LOGOGRAM[list_str[index]].lower()
    string = ' '.join(list_str)
    # Convert any remaining emoji to ':name:' codes, then drop the codes.
    string = emoji.demojize(string.lower())
    string = re.sub(':\S+?:', '', string)
    return string
def logogram_processing(review):
    """Expand contractions and slang (via LOGOGRAM) in a review string.

    :param review: raw review text
    :return: text with common misspellings expanded and every
        space-delimited token replaced by its LOGOGRAM expansion when one
        exists
    """
    string = review.replace('’', '\'')
    string = string.replace('Iam ', ' I am').replace(' iam ', ' i am') \
        .replace(' dnt ', ' do not ')
    string = string.replace('I ve ', ' I have ').replace('I m ', ' I\'am ') \
        .replace('i m ', ' i\'m ')
    string = string.replace('Iam ', ' I am ').replace('iam ', ' i am ')
    string = string.replace('dont ', ' do not ') \
        .replace('google.co.in ', 'google').replace('hve ', ' have ')
    # NOTE(review): mapping "Ain't" -> "can't" changes the meaning
    # ("ain't" ~ "is not"); kept as-is — confirm intent.
    string = string.replace(' F ', ' F**k ').replace('Ain\'t ', ' can\'t ') \
        .replace(' lv ', ' love ')
    string = string.replace(' ok~~ay~~ ', ' okay ') \
        .replace(' Its ', ' It is ').replace(' its ', ' it is ')
    string = string.replace(' Nd ', ' and ').replace(' nd ', ' and ')
    # NOTE(review): 'Thnx ' -> 'Than' looks truncated (missing 'ks ');
    # kept as-is — confirm against the other preprocessors in this file.
    string = string.replace('Thnx ', 'Than')
    # split(' ') (not split()) kept so empty fragments from doubled
    # spaces survive exactly as before.
    review_list = string.split(' ')
    # FIX: removed the stray debug print of the joined result.
    review_list = [LOGOGRAM.get(word, word) for word in review_list]
    return ' '.join(review_list)
def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and labels in separate lists
    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : List of 3 turn conversations, processed and each turn separated by the <eos> tag
        labels : [Only available in "train" mode] List of labels
    """
    indices = []
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        # Skip the header row.
        finput.readline()
        for line in finput:
            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            repeatedChars = ['.', '?', '!', ',']
            for c in repeatedChars:
                lineSplit = line.split(c)
                # Drop empty fragments; remove() raises when none remain.
                while True:
                    try:
                        lineSplit.remove('')
                    except:
                        break
                cSpace = ' ' + c + ' '
                line = cSpace.join(lineSplit)
            # Replace each known emoji with its first TWEMOJI description.
            emoji_repeatedChars = TWEMOJI_LIST
            for emoji_meta in emoji_repeatedChars:
                emoji_lineSplit = line.split(emoji_meta)
                # NOTE(review): breaks on the first failing remove(), so
                # later whitespace fragments may survive — kept as-is.
                while True:
                    try:
                        emoji_lineSplit.remove('')
                        emoji_lineSplit.remove(' ')
                        emoji_lineSplit.remove(' ')
                        emoji_lineSplit = [
                            x for x in emoji_lineSplit if x != ''
                        ]
                    except:
                        break
                emoji_cSpace = ' ' + TWEMOJI[emoji_meta][0] + ' '
                line = emoji_cSpace.join(emoji_lineSplit)
            # Tab-separated columns: id, turn1, turn2, turn3[, label].
            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                label = emotion2label[line[4]]
                labels.append(label)
            # Join the three turns with the <eos> separator.
            conv = ' <eos> '.join(line[1:4]) + ' '
            # Remove any duplicate spaces
            duplicateSpacePattern = re.compile(r'\ +')
            conv = re.sub(duplicateSpacePattern, ' ', conv)
            # Collapse stretched spellings ("thaaanks" -> "thanks", ...).
            string = re.sub("tha+nks ", ' thanks ', conv)
            string = re.sub("Tha+nks ", ' Thanks ', string)
            string = re.sub("yes+ ", ' yes ', string)
            string = re.sub("Yes+ ", ' Yes ', string)
            string = re.sub("very+ ", ' very ', string)
            string = re.sub("go+d ", ' good ', string)
            string = re.sub("Very+ ", ' Very ', string)
            string = re.sub("why+ ", ' why ', string)
            string = re.sub("wha+t ", ' what ', string)
            string = re.sub("sil+y ", ' silly ', string)
            string = re.sub("hm+ ", ' hmm ', string)
            string = re.sub("no+ ", ' no ', string)
            string = re.sub("sor+y ", ' sorry ', string)
            string = re.sub("so+ ", ' so ', string)
            string = re.sub("lie+ ", ' lie ', string)
            string = re.sub("okay+ ", ' okay ', string)
            string = re.sub(' lol[a-z]+ ', 'laugh out loud', string)
            string = re.sub(' wow+ ', ' wow ', string)
            string = re.sub('wha+ ', ' what ', string)
            string = re.sub(' ok[a-z]+ ', ' ok ', string)
            string = re.sub(' u+ ', ' you ', string)
            string = re.sub(' wellso+n ', ' well soon ', string)
            string = re.sub(' byy+ ', ' bye ', string)
            # Normalize quotes and expand common contractions/misspellings.
            string = string.replace('’', '\'').replace('"', ' ').replace("`", "'")
            string = string.replace('whats ', 'what is ').replace(
                "what's ", 'what is ').replace("i'm ", 'i am ')
            string = string.replace("it's ", 'it is ')
            string = string.replace('Iam ', 'I am ').replace(
                ' iam ', ' i am ').replace(' dnt ', ' do not ')
            string = string.replace('I ve ', 'I have ').replace(
                'I m ', ' I\'am ').replace('i m ', 'i\'m ')
            string = string.replace('Iam ', 'I am ').replace('iam ', 'i am ')
            string = string.replace('dont ', 'do not ').replace(
                'google.co.in ', ' google ').replace(' hve ', ' have ')
            string = string.replace(' F ', ' F**k ').replace(
                'Ain\'t ', ' are not ').replace(' lv ', ' love ')
            string = string.replace(' ok~~ay~~ ', ' okay ').replace(
                ' Its ', ' It is').replace(' its ', ' it is ')
            string = string.replace(' Nd ', ' and ').replace(
                ' nd ', ' and ').replace('i ll ', 'i will ')
            # Leading space so the space-delimited LOGOGRAM pass below can
            # match a token at the start of the string.
            string = ' ' + string.lower()
            for item in LOGOGRAM.keys():
                string = string.replace(' ' + item + ' ',
                                        ' ' + LOGOGRAM[item].lower() + ' ')
            # Tokenize, then expand emoticons and slang per token.
            list_str = ekphrasis_config(string)
            for index in range(len(list_str)):
                if list_str[index] in EMOTICONS_TOKEN.keys():
                    list_str[index] = EMOTICONS_TOKEN[list_str[index]][
                        1:len(EMOTICONS_TOKEN[list_str[index]]) - 1].lower()
            # NOTE(review): this LOGOGRAM token pass appears twice —
            # presumably redundant; kept as-is.
            for index in range(len(list_str)):
                if list_str[index] in LOGOGRAM.keys():
                    list_str[index] = LOGOGRAM[list_str[index]].lower()
            for index in range(len(list_str)):
                if list_str[index] in LOGOGRAM.keys():
                    list_str[index] = LOGOGRAM[list_str[index]].lower()
            string = ' '.join(list_str)
            indices.append(int(line[0]))
            conversations.append(string.lower())
    if mode == "train":
        return indices, conversations, labels
    else:
        return indices, conversations
def review_to_wordlist(review_text):
    """Clean a review string and return its tokens.

    Pads punctuation, replaces emoji with textual descriptions, expands
    abbreviations/slang (LOGOGRAM, EMOTICONS_TOKEN), strips characters
    outside [a-zA-Z0-9@&:], and tokenizes with stanford_tokenizer.
    """
    repeatedChars = ['.', '?', '!', ',', '"']
    # NOTE(review): this loop splits the UNCHANGED review_text each
    # iteration and stores the result in `line`, so only the last
    # delimiter ('"') actually gets padded — looks like a bug (compare
    # preprocessData, which rebinds `line` each pass).  Kept as-is.
    for c in repeatedChars:
        lineSplit = review_text.split(c)
        # print(lineSplit)
        while True:
            try:
                lineSplit.remove('')
            except:
                break
        cSpace = ' ' + c + ' '
        line = cSpace.join(lineSplit)
    # Replace each known emoji with its first TWEMOJI description.
    emoji_repeatedChars = TWEMOJI_LIST
    for emoji_meta in emoji_repeatedChars:
        emoji_lineSplit = line.split(emoji_meta)
        # NOTE(review): breaks on the first failing remove(); later
        # whitespace fragments may survive — kept as-is.
        while True:
            try:
                emoji_lineSplit.remove('')
                emoji_lineSplit.remove(' ')
                emoji_lineSplit.remove(' ')
                emoji_lineSplit = [x for x in emoji_lineSplit if x != '']
            except:
                break
        emoji_cSpace = ' ' + TWEMOJI[emoji_meta][0] + ' '
        review_text = emoji_cSpace.join(emoji_lineSplit)
    review_text = emoji_to_text(review_text)
    # Collapse @-mention runs to a single @USER token.
    review_text = re.sub("(@[\w]*\ )+", " @USER ", review_text)
    duplicateSpacePattern = re.compile(r'\ +')
    review_text = re.sub(duplicateSpacePattern, ' ', review_text).strip()
    # print(review_text)
    # Collapse stretched spellings ("thaaanks" -> "thanks", ...).
    string = re.sub("tha+nks ", ' thanks ', review_text)
    string = re.sub("Tha+nks ", ' Thanks ', string)
    string = re.sub("yes+ ", ' yes ', string)
    string = re.sub("Yes+ ", ' Yes ', string)
    string = re.sub("very+ ", ' very ', string)
    string = re.sub("go+d ", ' good ', string)
    string = re.sub("Very+ ", ' Very ', string)
    string = re.sub("why+ ", ' why ', string)
    string = re.sub("wha+t ", ' what ', string)
    string = re.sub("sil+y ", ' silly ', string)
    string = re.sub("hm+ ", ' hmm ', string)
    string = re.sub("no+ ", ' no ', string)
    string = re.sub("sor+y ", ' sorry ', string)
    string = re.sub("so+ ", ' so ', string)
    string = re.sub("lie+ ", ' lie ', string)
    string = re.sub("okay+ ", ' okay ', string)
    string = re.sub(' lol[a-z]+ ', 'laugh out loud', string)
    string = re.sub(' wow+ ', ' wow ', string)
    string = re.sub('wha+ ', ' what ', string)
    string = re.sub(' ok[a-z]+ ', ' ok ', string)
    string = re.sub(' u+ ', ' you ', string)
    string = re.sub(' wellso+n ', ' well soon ', string)
    review_text = re.sub(' byy+ ', ' bye ', string)
    # review_text = re.sub("(im\s)+", " i am ", review_text)
    review_text = re.sub("(\wl\ss\w)+", ' also ', review_text)
    # review_text = re.sub("(IM\s)+", " i am ", review_text)
    review_text = re.sub("(\sbro$)+", " brother ", review_text)
    review_text = re.sub("\stv", " Television ", review_text)
    # review_text = review_text.replace('’', '\'').replace('"', ' ').replace("`", "'")
    review_text = abbreviation_to_text(review_text)
    # Expand common contractions / misspellings.
    string = review_text.replace('whats ', 'what is ').replace(" i'm ", 'i am ')
    string = string.replace("it's ", 'it is ')
    string = string.replace('Iam ', 'I am ').replace(' iam ', ' i am ').replace(
        ' dnt ', ' do not ')
    string = string.replace('I ve ', 'I have ').replace(' I m ', ' I\'am ').replace(
        ' i m ', 'i\'m ')
    string = string.replace(' Iam ', 'I am ').replace(' iam ', 'i am ')
    string = string.replace('dont ', 'do not ').replace(
        'google.co.in ', ' google ').replace(' hve ', ' have ')
    string = string.replace(' F ', ' F**k ').replace('Ain\'t ', ' are not ').replace(
        ' lv ', ' love ')
    string = string.replace(' ok~~ay~~ ', ' okay ').replace(' Its ', ' It is').replace(
        ' its ', ' it is ')
    string = string.replace(' Nd ', ' and ').replace(' nd ', ' and ').replace(
        'i ll ', 'i will ')
    # string = ' ' + string
    # string = abbreviation_to_text(string)
    # Leading space so the space-delimited LOGOGRAM pass can match a
    # token at the start of the string.
    string = ' ' + string
    for item in LOGOGRAM.keys():
        string = string.replace(' ' + item + ' ', ' ' + LOGOGRAM[item] + ' ')
    # Tokenize, then expand emoticons and slang per token.
    list_str = ekphrasis_config(string)
    for index in range(len(list_str)):
        if list_str[index] in EMOTICONS_TOKEN.keys():
            list_str[index] = EMOTICONS_TOKEN[
                list_str[index]][1:len(EMOTICONS_TOKEN[list_str[index]]) - 1]
    # NOTE(review): this LOGOGRAM token pass appears twice —
    # presumably redundant; kept as-is.
    for index in range(len(list_str)):
        if list_str[index] in LOGOGRAM.keys():
            list_str[index] = LOGOGRAM[list_str[index]]
    for index in range(len(list_str)):
        if list_str[index] in LOGOGRAM.keys():
            list_str[index] = LOGOGRAM[list_str[index]]
    string = ' '.join(list_str)
    # review_text = re.sub("(@[\w]*\ )+", " @USER ", string)
    # duplicateSpacePattern = re.compile(r'\ +')
    # review_text = re.sub(duplicateSpacePattern, ' ', review_text).strip()
    # print(review_text)
    # review_text = ekphrasis_config(review_text)
    # print(review_text)
    # Keep only alphanumerics and @ & : (preserves @USER and :codes:).
    review_text = re.sub("[^a-zA-Z0-9\@\&\:]", " ", string)
    # review_text = review_text.lower()
    words = stanford_tokenizer(review_text)
    return (words)
string = string.replace('I ve ', 'I have ').replace('I m ', ' I am ').replace( 'i m ', 'i am ') string = string.replace('Iam ', 'I am ').replace('iam ', 'i am ') string = string.replace('dont ', 'do not ').replace( 'google.co.in ', ' google ').replace(' hve ', ' have ') string = string.replace('Ain\'t ', ' are not ').replace(' lv ', ' love ') string = string.replace(' ok~~ay~~ ', ' okay ').replace( ' Its ', ' It is').replace(' its ', ' it is ') string = string.replace(' Nd ', ' and ').replace(' nd ', ' and ').replace( 'i ll ', 'i will ') string = string.replace(" I'd ", ' i would ').replace(''', "'") string = ' ' + string.lower() for item in LOGOGRAM.keys(): string = string.replace(' ' + item + ' ', ' ' + LOGOGRAM[item].lower() + ' ') list_str = ekphrasis_config(string) for index in range(len(list_str)): if list_str[index] in slang_map.keys(): list_str[index] = slang_map[list_str[index]] string = ' '.join(list_str) list_str = string.split() for index in range(len(list_str)): if list_str[index] in EMOTICONS_TOKEN.keys(): # print('kkkkkkkkk') # print(EMOTICONS_TOKEN[list_str[index]][1:len(EMOTICONS_TOKEN[list_str[index]]) - 1].lower()) list_str[index] = EMOTICONS_TOKEN[