from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Dataset layout: one folder per split, one file per emotion inside it.
# NOTE(review): "2018-EI-reg-En" appears twice (for both dev and test) — this
# looks like a copy-paste error; confirm the intended test-split folder name.
folders = ["EI-reg-En", "2018-EI-reg-En", "2018-EI-reg-En"]
datatypes = ["train", "dev", "test"]
emotions = ["anger", "fear", "joy", "sadness"]

data = []        # data[i*4 + j] = preProcess.getData() result for split i, emotion j
vocabulary = []

# Load every (split, emotion) file. `folders` and `datatypes` are parallel
# lists, so the split index `i` selects both the folder and the suffix.
for i, folder in enumerate(folders):
    for emotion in emotions:
        path = (folder + "-" + datatypes[i] + "/" +
                folder + "-" + emotion + "-" + datatypes[i] + ".txt")
        # fix: original leaked the file handle; `with` guarantees it is closed
        with open(path) as f:
            raw = f.read()
        data.append(preProcess.getData(raw))

# Per-emotion tweet counts for the training split (data[0..3] = anger, fear,
# joy, sadness, matching the `emotions` order above). getData() presumably
# returns [tweets, labels]; element [0] is the tweet list — TODO confirm.
train_anger_len = len(data[0][0])
train_fear_len = len(data[1][0])
train_joy_len = len(data[2][0])
train_sadness_len = len(data[3][0])
train_len = train_anger_len + train_fear_len + train_joy_len + train_sadness_len

# Per-emotion tweet counts for the dev split (data[4..7]).
dev_anger_len = len(data[4][0])
dev_fear_len = len(data[5][0])
dev_joy_len = len(data[6][0])
dev_sadness_len = len(data[7][0])
dev_len = dev_anger_len + dev_fear_len + dev_joy_len + dev_sadness_len
import preProcess
import re
from nltk.tokenize import TweetTokenizer
import gensim
from gensim import corpora, models, similarities
import tweetPreprocessor

# Read one training file per emotion. The original opened four handles with
# bare open() and never closed them (resource leak); a data-driven loop with
# `with` fixes the leak and removes the copy-paste repetition while still
# defining the same module-level names.
_EMOTIONS = ["anger", "fear", "joy", "sadness"]
_raw_by_emotion = {}
for _emotion in _EMOTIONS:
    _path = "./EI-reg-En-train/EI-reg-En-" + _emotion + "-train.txt"
    with open(_path) as _f:
        _raw_by_emotion[_emotion] = _f.read()

angerTrain = _raw_by_emotion["anger"]
fearTrain = _raw_by_emotion["fear"]
joyTrain = _raw_by_emotion["joy"]
sadnessTrain = _raw_by_emotion["sadness"]

# S_emotion is the set of all the tweets (actual) of that emotion's training
# set; y_emotion is presumably the matching intensity labels — TODO confirm
# against preProcess.getData.
[S_anger, y_anger] = preProcess.getData(angerTrain)
[S_fear, y_fear] = preProcess.getData(fearTrain)
[S_joy, y_joy] = preProcess.getData(joyTrain)
[S_sadness, y_sadness] = preProcess.getData(sadnessTrain)

corpus = [S_anger, S_fear, S_joy, S_sadness]

# Produce a word embedding for every tweet in every emotion's tweet list.
# NOTE(review): the return value of produceWordEmbd is discarded here, so this
# call is presumably run for its side effects — verify.
for tweet_list in corpus:
    for tweet in tweet_list:
        tweetPreprocessor.produceWordEmbd(tweet)