Example #1
import preProcess
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# SemEval-2018 EI-reg naming: the train files use the "EI-reg-En" prefix,
# while the dev and test releases are prefixed "2018-EI-reg-En".
folders = ["EI-reg-En", "2018-EI-reg-En", "2018-EI-reg-En"]
datatypes = ["train", "dev", "test"]
emotions = ["anger", "fear", "joy", "sadness"]

data = []
vocabulary = []

# Read each (split, emotion) file; preProcess.getData returns [tweets, labels].
# data is filled in order: indices 0-3 train, 4-7 dev, 8-11 test
# (anger, fear, joy, sadness within each split).
for i, x in enumerate(folders):
    for y in emotions:
        path = x + "-" + datatypes[i] + "/" + x + "-" + y + "-" + datatypes[i] + ".txt"
        with open(path) as f:
            raw = f.read()
        data.append(preProcess.getData(raw))

# Tweet counts per emotion in the training split (data[0..3]).
train_anger_len = len(data[0][0])
train_fear_len = len(data[1][0])
train_joy_len = len(data[2][0])
train_sadness_len = len(data[3][0])

train_len = train_anger_len + train_fear_len + train_joy_len + train_sadness_len

# Tweet counts per emotion in the dev split (data[4..7]).
dev_anger_len = len(data[4][0])
dev_fear_len = len(data[5][0])
dev_joy_len = len(data[6][0])
dev_sadness_len = len(data[7][0])

dev_len = dev_anger_len + dev_fear_len + dev_joy_len + dev_sadness_len
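
The preProcess module is used by both examples but not shown. Below is a minimal sketch of what its getData helper might look like, assuming the SemEval-2018 EI-reg tab-separated file format (ID, tweet, emotion, intensity score per line, preceded by a header row); the function name and the [tweets, labels] return shape follow the calls above, while the parsing details are an assumption.

# Hypothetical sketch of preProcess.getData -- not the actual module.
# Assumes each line is: ID <tab> tweet <tab> emotion <tab> intensity.
def getData(raw):
    tweets, scores = [], []
    lines = raw.strip().split("\n")
    for line in lines[1:]:               # skip the header row (assumption)
        fields = line.split("\t")
        if len(fields) < 4:
            continue                     # skip malformed lines
        tweets.append(fields[1])         # the tweet text
        scores.append(float(fields[3]))  # the emotion-intensity label
    return [tweets, scores]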
Example #2
import preProcess
import re
from nltk.tokenize import TweetTokenizer
import gensim
from gensim import corpora, models, similarities
import tweetPreprocessor

with open("./EI-reg-En-train/EI-reg-En-anger-train.txt") as f_anger:
    angerTrain = f_anger.read()

with open("./EI-reg-En-train/EI-reg-En-fear-train.txt") as f_fear:
    fearTrain = f_fear.read()

with open("./EI-reg-En-train/EI-reg-En-joy-train.txt") as f_joy:
    joyTrain = f_joy.read()

with open("./EI-reg-En-train/EI-reg-En-sadness-train.txt") as f_sadness:
    sadnessTrain = f_sadness.read()
# S_<emotion> is the list of raw tweets in each emotion's training set;
# y_<emotion> is the list of matching intensity labels.
S_anger, y_anger = preProcess.getData(angerTrain)
S_fear, y_fear = preProcess.getData(fearTrain)
S_joy, y_joy = preProcess.getData(joyTrain)
S_sadness, y_sadness = preProcess.getData(sadnessTrain)

corpus = [S_anger, S_fear, S_joy, S_sadness]

# Feed every tweet from all four training sets to the embedding builder.
for tweets in corpus:
    for tweet in tweets:
        tweetPreprocessor.produceWordEmbd(tweet)
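
The tweetPreprocessor module is likewise not shown. Here is a minimal sketch of what produceWordEmbd might do, assuming it tokenizes each tweet with NLTK's TweetTokenizer and accumulates the token lists so a gensim Word2Vec model can be trained afterwards; only the function name comes from the call above, and everything else (the tokenizer settings, the trainEmbeddings helper, the Word2Vec parameters) is an assumption.

# Hypothetical sketch of tweetPreprocessor -- not the actual module.
from nltk.tokenize import TweetTokenizer
from gensim.models import Word2Vec

_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
_sentences = []  # token lists collected across calls

def produceWordEmbd(tweet):
    # Tokenize one tweet and stash its tokens for later embedding training.
    _sentences.append(_tokenizer.tokenize(tweet))

def trainEmbeddings():
    # Train a Word2Vec model over all collected tweets (gensim >= 4.0 API).
    return Word2Vec(_sentences, vector_size=100, window=5, min_count=2)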