import csv

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

# First version: read the two-column training CSV (author handle, tweet text),
# lowercase, strip stopwords, and normalize URLs/mentions/hashtags/numbers to
# placeholder tokens. The imports above are shared by all three variants below.
def preprocess(readfilename, writefilename):
    print("Preprocessing...")
    reader = csv.reader(open(readfilename, encoding="utf8"))
    writer = open(writefilename, 'w', encoding="utf8")
    next(reader)  # skip the header row
    line_num = 0
    labels = []
    messages = []
    # build these once instead of once per row
    stop_words = set(stopwords.words('english'))
    tokenizer = TweetTokenizer()
    for row in reader:
        line_num += 1
        if line_num % 500 == 0:
            print(line_num)
        temp_label = row[0]
        temp_text = row[1]
        # get the train label list
        if temp_label == 'realDonaldTrump':
            labels.append(0)
        elif temp_label == 'HillaryClinton':
            labels.append(1)
        # lowercase the words and remove the stopwords
        words = tokenizer.tokenize(temp_text)
        words = [w.lower() for w in words if w.lower() not in stop_words]
        # use enumerate so a repeated token is replaced at the right position
        # (list.index always returns the first match)
        for i, word in enumerate(words):
            # looking too closely at this information would presumably lead to overfitting
            if word.startswith('http'):
                words[i] = '<url>'
            elif word.startswith('@'):
                words[i] = '<@mention>'
            elif word.startswith('#'):
                words[i] = '<hashtag>'
            elif word[0].isdigit():
                words[i] = '<num>'
            # elif word.endswith('...'):  # for some reason some tokens are truncated
            #     words.pop(i)            # to an ellipsis, e.g. 'clicks' -> 'cl...'
        temp_sentence = " ".join(words) + "\n"
        messages.append(temp_sentence)
        writer.write(temp_sentence)
    writer.close()
    print("Preprocessing is done!")
    return labels, messages
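# A minimal usage sketch, assuming a hypothetical 'tweets.csv' with the author
# handle in column 0 and the tweet text in column 1 (neither file name comes
# from the original code):
#
#   labels, messages = preprocess('tweets.csv', 'tweets_clean.txt')
#   print(labels[0], messages[0])  # e.g. 0 and '<@mention> thank ... <url>\n'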
# Second version: handle the per-candidate dumps (the tweet text sits in a
# different column for each), infer the label from the file name, and cache
# the cleaned output as a ';'-delimited file so a later call can skip
# preprocessing entirely.
def preprocess(readfilename, writefilename, write=True):
    print("Preprocessing...")
    try:
        if not write:
            raise FileNotFoundError  # skip the cache and regenerate it below
        reader = csv.reader(open(writefilename, encoding="utf8"), delimiter=';')
        labels = []
        messages = []
        for row in reader:
            if row[0] == 'realDonaldTrump':
                labels.append(0)
            elif row[0] == 'HillaryClinton':
                labels.append(1)
            messages.append(row[1].split())
    except FileNotFoundError:
        if 'clinton' in readfilename:
            temp_label = 'HillaryClinton'
            tweet_loc = 4
        elif 'trump' in readfilename:
            temp_label = 'realDonaldTrump'
            tweet_loc = 1
        else:
            raise ValueError("readfilename must contain 'clinton' or 'trump'")
        reader = csv.reader(open(readfilename, encoding='utf8'), delimiter=';')
        writer = open(writefilename, 'w', encoding="utf8")
        next(reader)  # skip the header row
        line_num = 0
        labels = []
        messages = []
        tokenizer = TweetTokenizer()
        for row in reader:
            line_num += 1
            if line_num % 500 == 0:
                print(line_num)
            temp_text = row[tweet_loc]
            # get the train label list
            if temp_label == 'realDonaldTrump':
                labels.append(0)
            elif temp_label == 'HillaryClinton':
                labels.append(1)
            words = tokenizer.tokenize(temp_text)
            # use enumerate so a repeated token is replaced at the right position
            # (list.index always returns the first match)
            for i, word in enumerate(words):
                if 'pic.twitter.com' in word:
                    words[i] = '<pic>'
                elif word.startswith('http'):
                    words[i] = '<url>'
                # elif word.startswith('@'):
                #     words[i] = '<@mention>'
                elif word[0].isdigit():
                    words[i] = '<num>'
            # Clinton likes to include a space after the hashtag, so the
            # tokenizer splits '# tag' (and '@ user') into two tokens; merge
            # each standalone marker with the token that follows it.
            for marker in ('#', '@'):
                while marker in words:
                    index = words.index(marker)
                    if index + 1 < len(words):
                        words[index] += words.pop(index + 1)
                    else:
                        words.pop(index)  # trailing marker, nothing to merge
            words_lower = [w.lower() for w in words]
            temp_sentence = " ".join(words_lower) + "\n"
            messages.append(temp_sentence.split())
            writer.write(temp_label + ';')
            writer.write(temp_sentence)
        writer.close()
    print("Preprocessing is done!")
    return labels, messages
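# Usage sketch for this variant (file names are hypothetical): the first call
# preprocesses the raw dump and writes the ';'-delimited cache; the second
# call finds the cache and returns immediately. Pass write=False to force the
# cache to be rebuilt.
#
#   labels, messages = preprocess('trump_archive.csv', 'trump_clean.csv')
#   labels, messages = preprocess('trump_archive.csv', 'trump_clean.csv')  # cache hit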
# Third version: like the second, but the input is a plain comma-delimited
# dump that contains only Trump tweets, so the label is hard-coded and the
# '@'/'#' placeholder replacements are switched back on.
def preprocess(readfilename, writefilename):
    print("Preprocessing...")
    try:
        reader = csv.reader(open(writefilename, encoding="utf8"), delimiter=';')
        labels = []
        messages = []
        for row in reader:
            if row[0] == 'realDonaldTrump':
                labels.append(0)
            elif row[0] == 'HillaryClinton':
                labels.append(1)
            messages.append(row[1].split())
    except FileNotFoundError:
        reader = csv.reader(open(readfilename, encoding='utf8'))
        writer = open(writefilename, 'w', encoding="utf8")
        next(reader)  # skip the header row
        line_num = 0
        labels = []
        messages = []
        tokenizer = TweetTokenizer()
        for row in reader:
            line_num += 1
            if line_num % 500 == 0:
                print(line_num)
            # temp_label = row[0]
            temp_label = 'realDonaldTrump'  # this dump contains only Trump tweets
            temp_text = row[1]
            # get the train label list
            if temp_label == 'realDonaldTrump':
                labels.append(0)
            elif temp_label == 'HillaryClinton':
                labels.append(1)
            words = tokenizer.tokenize(temp_text)
            # use enumerate so a repeated token is replaced at the right position
            # (list.index always returns the first match)
            for i, word in enumerate(words):
                if 'pic.twitter.com' in word:
                    words[i] = '<pic>'
                elif word.startswith('http'):
                    words[i] = '<url>'
                elif word.startswith('@'):
                    words[i] = '<@mention>'
                elif word.startswith('#'):
                    words[i] = '<hashtag>'
                elif word[0].isdigit():
                    words[i] = '<num>'
            words_lower = [w.lower() for w in words]
            temp_sentence = " ".join(words_lower) + "\n"
            messages.append(temp_sentence.split())
            writer.write(temp_label + ';')
            writer.write(temp_sentence)
        writer.close()
    print("Preprocessing is done!")
    return labels, messages
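# Usage sketch (hypothetical file names): because the label is hard-coded,
# this variant only makes sense on a Trump-only dump, so every returned label
# is 0.
#
#   labels, messages = preprocess('trump_only.csv', 'trump_only_clean.csv')
#   assert set(labels) == {0}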