def preprocess(readfilename, writefilename):
    """Read labeled tweets from *readfilename* (CSV: label, text), clean
    each tweet, and write one cleaned sentence per line to *writefilename*.

    Cleaning: tokenize with TweetTokenizer, lowercase, drop English
    stopwords, then collapse URLs / @mentions / #hashtags / numbers into
    the placeholder tokens <url>, <@mention>, <hashtag>, <num>.

    Returns:
        (labels, messages): ``labels`` holds 0 for 'realDonaldTrump' and
        1 for 'HillaryClinton' (rows with any other label get no entry);
        ``messages`` is the list of cleaned, newline-terminated sentences
        in file order.
    """
    print("Preprocessing...")
    # Build the stopword set and tokenizer once, not once per row.
    stop_words = set(stopwords.words('english'))
    tokenizer = TweetTokenizer()
    labels = []
    messages = []
    # Context managers guarantee both handles are closed even on error
    # (the original leaked the reader's handle and left the writer open
    # on exceptions).
    with open(readfilename, encoding="utf8") as infile, \
            open(writefilename, 'w', encoding="utf8") as writer:
        reader = csv.reader(infile)
        next(reader)  # skip the header row
        for line_num, row in enumerate(reader, start=1):
            if line_num % 500 == 0:
                print(line_num)  # progress indicator
            temp_label = row[0]
            temp_text = row[1]
            # Build the train label list.
            if temp_label == 'realDonaldTrump':
                labels.append(0)
            elif temp_label == 'HillaryClinton':
                labels.append(1)
            # Lowercase and remove stopwords (same order as placeholder
            # substitution in the original).
            words = [w.lower() for w in tokenizer.tokenize(temp_text)]
            words = [w for w in words if w not in stop_words]
            # Replace tokens BY POSITION: the original used
            # words.index(word), which always finds the first duplicate
            # and could mis-replace repeated tokens. The branch
            # conditions are mutually exclusive, so elif is equivalent.
            for i, word in enumerate(words):
                # Looking too deeply into these tokens would presumably
                # lead to overfitting, hence the coarse placeholders.
                if word.startswith('http'):
                    words[i] = '<url>'
                elif word.startswith('@'):
                    words[i] = '<@mention>'
                elif word.startswith('#'):
                    words[i] = '<hashtag>'
                elif word and word[0].isdigit():  # guard empty tokens
                    words[i] = '<num>'
            temp_sentence = " ".join(words) + "\n"
            messages.append(temp_sentence)
            writer.write(temp_sentence)
    print("Preprocessing is done!")
    return labels, messages
# ---- Example 2 ----
def preprocess(readfilename, writefilename, write=True):
    """Load preprocessed tweets from *writefilename* if it exists;
    otherwise build it from the raw export *readfilename* and cache it.

    The cache file is ';'-delimited: ``label;cleaned-tweet``. The raw
    export's tweet column depends on whose export it is ('clinton' vs
    'trump' in the filename). twitter pics, URLs and numbers are
    collapsed into <pic>, <url>, <num>; a lone '#' or '@' token is glued
    onto the following token (the tokenizer splits them apart, e.g. when
    there is a space after the hashtag).

    Args:
        readfilename: raw ';'-delimited CSV export.
        writefilename: cache file to read from / write to.
        write: when False, ignore any existing cache and rebuild.

    Returns:
        (labels, messages): 0 = realDonaldTrump, 1 = HillaryClinton;
        ``messages`` is a list of token lists.

    Raises:
        ValueError: if *readfilename* names neither account (the
            original code died later with an opaque NameError).
    """
    print("Preprocessing...")
    try:
        if not write:
            raise FileNotFoundError  # force the rebuild path below
        with open(writefilename, encoding="utf8") as cache:
            labels = []
            messages = []
            for row in csv.reader(cache, delimiter=';'):
                if row[0] == 'realDonaldTrump':
                    labels.append(0)
                elif row[0] == 'HillaryClinton':
                    labels.append(1)
                # NOTE(review): if a cleaned tweet ever contains ';'
                # this only recovers the text before the first ';'.
                messages.append(row[1].split())
    except FileNotFoundError:
        # One export = one account, so the label and the tweet column
        # are fixed for the whole file.
        if 'clinton' in readfilename:
            temp_label = 'HillaryClinton'
            tweet_loc = 4
        elif 'trump' in readfilename:
            temp_label = 'realDonaldTrump'
            tweet_loc = 1
        else:
            raise ValueError(
                "readfilename must contain 'clinton' or 'trump': %r"
                % readfilename)
        tokenizer = TweetTokenizer()
        labels = []
        messages = []
        # Context managers close both handles even on error (the
        # original leaked the reader's handle).
        with open(readfilename, encoding='utf8') as infile, \
                open(writefilename, 'w', encoding="utf8") as writer:
            reader = csv.reader(infile, delimiter=';')
            next(reader)  # skip header
            for line_num, row in enumerate(reader, start=1):
                if line_num % 500 == 0:
                    print(line_num)  # progress indicator
                temp_text = row[tweet_loc]
                # Every row in one export carries the same label.
                if temp_label == 'realDonaldTrump':
                    labels.append(0)
                elif temp_label == 'HillaryClinton':
                    labels.append(1)
                words = tokenizer.tokenize(temp_text)
                # Replace BY POSITION: the original words.index(word)
                # always found the first duplicate, mis-replacing
                # repeated tokens.
                for i, word in enumerate(words):
                    if 'pic.twitter.com' in word:
                        words[i] = '<pic>'
                    elif word.startswith('http'):
                        words[i] = '<url>'
                    elif word and word[0].isdigit():  # guard empty token
                        words[i] = '<num>'
                # Glue every split '#'/'@' back onto its successor. The
                # original merged only the FIRST occurrence and raised
                # IndexError when the marker was the last token.
                for marker in ('#', '@'):
                    while marker in words:
                        index = words.index(marker)
                        if index + 1 < len(words):
                            words[index] += words.pop(index + 1)
                        else:
                            break  # trailing marker: nothing to merge
                # Lowercase after merging, as in the original.
                temp_sentence = " ".join(w.lower() for w in words) + "\n"
                messages.append(temp_sentence.split())
                writer.write(temp_label + ';')
                writer.write(temp_sentence)
    print("Preprocessing is done!")
    return labels, messages
# ---- Example 3 ----
def preprocess(readfilename, writefilename):
    """Load preprocessed tweets from *writefilename* if present;
    otherwise clean the raw CSV *readfilename* and cache the result.

    Cache format is ';'-delimited: ``label;cleaned-tweet``. Cleaning
    tokenizes with TweetTokenizer, collapses pics / URLs / @mentions /
    #hashtags / numbers into <pic>, <url>, <@mention>, <hashtag>, <num>,
    and lowercases everything.

    Returns:
        (labels, messages): 0 = realDonaldTrump, 1 = HillaryClinton;
        ``messages`` is a list of token lists.
    """
    print("Preprocessing...")
    try:
        with open(writefilename, encoding="utf8") as cache:
            labels = []
            messages = []
            for row in csv.reader(cache, delimiter=';'):
                if row[0] == 'realDonaldTrump':
                    labels.append(0)
                elif row[0] == 'HillaryClinton':
                    labels.append(1)
                messages.append(row[1].split())
    except FileNotFoundError:
        tokenizer = TweetTokenizer()
        labels = []
        messages = []
        # Context managers close both handles even on error (the
        # original leaked the reader's handle).
        with open(readfilename, encoding='utf8') as infile, \
                open(writefilename, 'w', encoding="utf8") as writer:
            reader = csv.reader(infile)
            next(reader)  # skip header
            for line_num, row in enumerate(reader, start=1):
                if line_num % 500 == 0:
                    print(line_num)  # progress indicator
                # The label column is deliberately ignored (row[0] read
                # was commented out upstream): this file is assumed to
                # be all Trump tweets.
                temp_label = 'realDonaldTrump'
                temp_text = row[1]
                # Kept for symmetry with the other variants; the
                # Clinton branch is unreachable with the fixed label.
                if temp_label == 'realDonaldTrump':
                    labels.append(0)
                elif temp_label == 'HillaryClinton':
                    labels.append(1)
                words = tokenizer.tokenize(temp_text)
                # Replace BY POSITION: the original words.index(word)
                # always found the first duplicate, mis-replacing
                # repeated tokens.
                for i, word in enumerate(words):
                    if 'pic.twitter.com' in word:
                        words[i] = '<pic>'
                    elif word.startswith('http'):
                        words[i] = '<url>'
                    elif word.startswith('@'):
                        words[i] = '<@mention>'
                    elif word.startswith('#'):
                        words[i] = '<hashtag>'
                    elif word and word[0].isdigit():  # guard empty token
                        words[i] = '<num>'
                temp_sentence = " ".join(w.lower() for w in words) + "\n"
                messages.append(temp_sentence.split())
                writer.write(temp_label + ';')
                writer.write(temp_sentence)
    print("Preprocessing is done!")
    return labels, messages