import pickle as pkl

import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords


def cbet_data(file_path='data/CBET.csv', remove_stop_words=True, get_text=True,
              preprocess=True, multi=False, vector=False):
    NUM_CLASS = 9
    emo_list = ["anger", "fear", "joy", "love", "sadness", "surprise",
                "thankfulness", "disgust", "guilt"]
    stop_words = set(stopwords.words('english'))
    label = []
    train_text = []
    df = pd.read_csv(file_path)
    from utils.tweet_processor import tweet_process
    for _, row in df.iterrows():
        # Resolve the label first so rows skipped in single-label mode never
        # leave an unmatched entry in train_text.
        emo_one_hot = np.asarray(row[emo_list])
        if not multi:
            # Single-label mode: keep only rows with exactly one active emotion.
            if emo_one_hot.sum() != 1:
                continue
            emo_idx = int(np.argmax(emo_one_hot))
        else:
            if not vector:
                # Multi-label mode: list of indices of the active emotions.
                emo_idx = np.where(emo_one_hot == 1)[0].tolist()
            else:
                # Multi-label mode: keep the full one-hot vector.
                emo_idx = emo_one_hot
        label.append(emo_idx)
        if get_text:
            text = row['text']
            if preprocess:
                text = tweet_process(text)
            if remove_stop_words:
                text = ' '.join(x for x in text.split() if x not in stop_words)
            train_text.append(text)
    return train_text, label, emo_list, NUM_CLASS
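# A minimal, hypothetical usage sketch (assumes data/CBET.csv exists with a
# 'text' column plus one binary column per emotion in emo_list):
def _demo_cbet():
    texts, labels, emo_list, n_class = cbet_data(multi=False)
    # In single-label mode each label is the index of the active emotion.
    print(len(texts), 'examples,', n_class, 'classes')
    print(emo_list[labels[0]], '->', texts[0][:80])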
def isear_data(file_path='data/ISEAR.csv', remove_stop_words=True,
               get_text=True, preprocess=True):
    # IsearLoader is the project's ISEAR loading utility and is assumed to be
    # importable from wherever this module lives.
    stop_words = set(stopwords.words('english'))
    NUM_CLASS = 7
    emo_list = ["joy", "fear", "anger", "sadness", "disgust", "shame", "guilt"]
    attributes = ['SIT']
    target = ['EMOT']
    loader = IsearLoader(attributes, target, True)
    data = loader.load_isear(file_path)
    train_text = []
    if get_text:
        from utils.tweet_processor import tweet_process
        # The SIT attribute: free-text descriptions of the situation that
        # triggered the emotion.
        text_all = data.get_freetext_content()
        for text in text_all:
            if preprocess:
                text = tweet_process(text)
            if remove_stop_words:
                text = ' '.join(x for x in text.split() if x not in stop_words)
            train_text.append(text)
    emo = data.get_target()  # the EMOT target codes
    return train_text, emo, emo_list, NUM_CLASS
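# A minimal, hypothetical usage sketch (assumes data/ISEAR.csv and the
# project's IsearLoader are available; the exact shape of the returned
# targets depends on that loader):
def _demo_isear():
    texts, emo, emo_list, n_class = isear_data()
    print(len(texts), 'texts,', len(emo), 'targets,', n_class, 'classes')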
def interactive_inference(model_token=''):
    # opt, PAD_LEN, EMBEDDING_DIM, HIDDEN_DIM, NUM_EMO, BATCH_SIZE and
    # AttentionLSTMClassifier are module-level config/classes defined
    # elsewhere in this script.
    from utils.tweet_processor import tweet_process
    with open(f'lstm_{model_token}{opt.dataset}_tokenizer.pkl', 'rb') as f:
        tokenizer = pkl.load(f)

    def encode_seq(src):
        # Pad with zeros (or truncate) to exactly PAD_LEN ids and report the
        # unpadded length alongside the id tensor.
        src = tokenizer.encode_ids(src)
        if len(src) < PAD_LEN:
            src_len = len(src)
            src = src + [0] * (PAD_LEN - len(src))
        else:
            src = src[:PAD_LEN]
            src_len = PAD_LEN
        return torch.LongTensor(src).unsqueeze(0), \
            torch.LongTensor([src_len]).unsqueeze(0)

    def softmax(x):
        """Compute softmax values for each set of scores in x."""
        # Subtract the max before exponentiating for numerical stability.
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    model = AttentionLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM,
                                    tokenizer.get_vocab_size(), NUM_EMO,
                                    BATCH_SIZE, att_mode=opt.attention,
                                    soft_last=False)
    # multi-GPU
    # model = nn.DataParallel(model)
    with open(f'lstm_{model_token}{opt.dataset}_model.pt', 'rb') as f:
        model.load_state_dict(torch.load(f))
    model.cuda()

    label_cols = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise',
                  'thankfulness', 'disgust', 'guilt']
    while True:
        print('type "end" to terminate >>> ')
        text = input()
        if text.strip().lower() == 'end':
            break
        text = tweet_process(text)
        seq, seq_len = encode_seq(text)
        y_pred = model(seq.cuda(), seq_len)
        y_pred = softmax(y_pred[0].detach().cpu().numpy())
        response = ''
        for emo, prob in zip(label_cols, y_pred):
            # Softmax probabilities are all positive, so every emotion is listed.
            if prob > 0:
                response += emo + ': ' + str(prob) + '\n'
        print(response)
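# Hypothetical invocation sketch (assumes the tokenizer pickle and model
# checkpoint written during training exist under the
# lstm_{model_token}{opt.dataset}_* names, and that a CUDA device is present):
#   interactive_inference()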
def cbet_data_other(mode='small', remove_stop_words=True, get_text=True,
                    preprocess=True, vector=False):
    """
    :param mode: 'small' or 'medium'
    :param remove_stop_words: drop English stop words from each tweet
    :param get_text: collect the (processed) tweet text
    :param preprocess: run tweet_process on the raw text
    :param vector: return labels as one-hot vectors instead of class indices
    :return: train_text, label, emo_list, NUM_CLASS
    """
    assert mode in ('small', 'medium')
    NUM_CLASS = 9
    emo_list = ["anger", "fear", "joy", "love", "sadness", "surprise",
                "thankfulness", "disgust", "guilt"]
    stop_words = set(stopwords.words('english'))
    label = []
    train_text = []
    if mode == 'small':
        file_path = 'data/CBET-single-small.txt'
    else:
        file_path = 'data/CBET-single-medium.txt'
    from utils.tweet_processor import tweet_process
    with open(file_path, 'r') as file_reader:
        for row in file_reader:
            # Each line is "text\t\tlabel_index".
            tokens = row.strip().split('\t\t')
            if get_text:
                text = tokens[0]
                if preprocess:
                    text = tweet_process(text)
                if remove_stop_words:
                    text = ' '.join(x for x in text.split() if x not in stop_words)
                train_text.append(text)
            emo = int(tokens[1])
            if not vector:
                label.append(emo)
            else:
                emo_one_hot = np.zeros(NUM_CLASS)
                emo_one_hot[emo] = 1
                label.append(emo_one_hot)
    return train_text, label, emo_list, NUM_CLASS
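# A minimal, hypothetical usage sketch (assumes data/CBET-single-small.txt
# exists with one "text\t\tlabel" pair per line):
def _demo_cbet_single():
    texts, labels, emo_list, n_class = cbet_data_other(mode='small', vector=True)
    # With vector=True each label is a one-hot numpy array of length NUM_CLASS.
    print(len(texts), 'examples; first label vector:', labels[0])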
def tec_data(file_path='data/TEC.txt', remove_stop_words=True, get_text=True):
    NUM_CLASS = 6
    emo_list = ['anger', 'fear', 'joy', 'sadness', 'surprise', 'disgust']
    stop_words = set(stopwords.words('english'))
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()
    label = []
    train_text = []
    from utils.tweet_processor import tweet_process
    for line in lines:
        # Each line is "text\tlabel_index".
        text, emo = line.split('\t')
        if get_text:
            text = tweet_process(text)
            if remove_stop_words:
                text = ' '.join(x for x in text.split() if x not in stop_words)
            train_text.append(text)
        label.append(int(emo))
    return train_text, label, emo_list, NUM_CLASS
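# A minimal, hypothetical usage sketch (assumes data/TEC.txt exists with one
# "text\tlabel" pair per line and integer labels 0..5):
def _demo_tec():
    texts, labels, emo_list, n_class = tec_data()
    print(len(texts), 'examples,', n_class, 'classes')
    print(emo_list[labels[0]], '->', texts[0][:80])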