import codecs
from collections import Counter

from settings import W2V_DIR, load_obj


def deal_data(path):
    word2id = load_obj(W2V_DIR, "word2id_dict.pkl")
    # w2v_matrix = np.load(join(W2V_DIR, "word2vex_matrix.npy"))
    with codecs.open(path, "r", "utf-8") as f:
        data = {"distribution": [], "label": [], "token": []}
        for i, line in enumerate(f):
            data["token"].append([])
            line_list = line.strip().split()
            _sum = int(line_list[1].split(':')[1])
            # normalize the eight emotion vote counts into a distribution
            _labels = [int(v.split(':')[1]) / _sum for v in line_list[2:10]]
            data["label"].append([_labels.index(max(_labels))])
            data["distribution"].append(_labels)
            result = Counter(line_list[10:])
            for w in line_list[10:]:
                # if it's not Chinese, ignore it
                if not is_Chinese(w):
                    continue
                '''
                if result[w] < 2:
                    continue
                '''
                try:
                    data["token"][i].append(word2id[w])
                # if a word is not in the pre-trained embedding, ignore it
                except KeyError:
                    continue
            # is_Chinese and seq_lens are assumed to be defined at module level elsewhere
            seq_lens.append(len(data["token"][i]))
    return data
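# --- Hedged usage sketch (not part of the original file) ---
# A minimal illustration of how deal_data's output could be persisted for the
# training scripts below. The raw-corpus filename "raw_train.txt" is a
# placeholder, and plain pickle is assumed to be what load_obj reads back.
import pickle
from os.path import join
from settings import DATA_DIR

train_data = deal_data(join(DATA_DIR, "raw_train.txt"))  # hypothetical input file
with open(join(DATA_DIR, "train_CNN.pkl"), "wb") as f:
    pickle.dump(train_data, f)  # later loaded via load_obj(DATA_DIR, "train_CNN.pkl")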
def __init__(self, input_dir, output_dir):
    # Assign the model weights; the values come from the parameter-tuning file.
    # Some of them have many decimal places because of floating-point precision.
    self.num = 8
    self.a = 0.0001
    self.b = 1
    self.c = 1.5
    self.d = 1.5
    self.e = 1.5
    # Load the probability matrices.
    # Probability of a single pinyin mapping to a single Chinese character.
    self.pinyin2hanzi_matrix = settings.load_obj(
        settings.PINYIN_HANZI_DIR, "new_pinyin2hanzi_freq.dat")
    # Character-to-character transition probability in continuous text.
    self.two_gram_continuous_matrix = settings.load_obj(
        settings.COUNT_DIR, "new_2_gram_continuous_freq_matrix.dat")
    # Character-to-character transition probability within 2-gram words.
    self.two_gram_matrix = settings.load_obj(settings.COUNT_DIR,
                                             "new_2_gram_freq_matrix.dat")
    # Probability of the first two characters of a 3-gram word transitioning to the third.
    self.three_gram_matrix = settings.load_obj(
        settings.COUNT_DIR, "new_3_gram_freq_matrix.dat")
    # Probability of the first three characters of a 4-gram word transitioning to the fourth.
    self.four_gram_matrix = settings.load_obj(
        settings.COUNT_DIR, "new_4_gram_freq_matrix.dat")
    # Input directory.
    self.__input_dir = input_dir
    # Output directory.
    self.__output_dir = output_dir
    # Initialize the whole-sentence probability for each candidate.
    self.prob = {}
    for i in range(self.num):
        self.prob[i] = 1
    # Initialize the selected-character sequence for each candidate.
    self.seq = {}
    for i in range(self.num):
        self.seq[i] = ''
import os
from os.path import join

import numpy as np
import torch
import torch.utils.data as Data
from scipy.stats import pearsonr
from sklearn.metrics import f1_score

from settings import W2V_DIR, DATA_DIR, load_obj, EMB_DIM, FILTER_NUM, FILTER_SIZES, \
    DROPOUT, TARGET_SIZE, BATCH_SIZE, CNN_HISTORY_PATH


def setup_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':
    setup_seed(1)
    device = torch.device("cpu")

    train_data = load_obj(DATA_DIR, "train_CNN.pkl")
    test_data = load_obj(DATA_DIR, "test_CNN.pkl")
    y_train = torch.tensor(np.array(train_data["label"]), dtype=torch.long)
    X_train = torch.tensor(np.array(train_data["token"]), dtype=torch.long)
    y_test = torch.tensor(np.array(test_data["label"]), dtype=torch.long)
    X_test = torch.tensor(np.array(test_data["token"]), dtype=torch.long)
    test_distr = test_data["distribution"]

    sub_train_dataset = Data.TensorDataset(X_train, y_train)
    sub_train_loader = Data.DataLoader(
        dataset=sub_train_dataset,   # torch TensorDataset format
        batch_size=BATCH_SIZE,       # mini-batch size
        shuffle=True,                # shuffle the data (recommended)
        drop_last=True,
    )
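# --- Hedged training-loop sketch (not part of the original script) ---
# The script above stops after building sub_train_loader; this shows one way the
# loader would typically be consumed. `model` is assumed to be the CNN classifier
# constructed elsewhere in this script; the optimizer, learning rate, and epoch
# count are placeholders, not the repository's tuned settings.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    for batch_x, batch_y in sub_train_loader:
        optimizer.zero_grad()
        logits = model(batch_x)                       # (BATCH_SIZE, TARGET_SIZE)
        loss = criterion(logits, batch_y.squeeze(1))  # labels are stored as [label] lists
        loss.backward()
        optimizer.step()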
import numpy as np
import torch
import torch.utils.data as Data
from os.path import join
from scipy.stats import pearsonr
from sklearn.metrics import f1_score

from settings import W2V_DIR, DATA_DIR, CHECKPOINT_DIR, load_obj


def setup_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':
    setup_seed(1)
    device = torch.device("cpu")

    test_data = load_obj(DATA_DIR, "test_MLP.pkl")
    test_input = load_obj(DATA_DIR, "average_test_MLP.pkl")
    y_test = torch.tensor(np.array(test_data["label"]), dtype=torch.long)
    X_test = torch.tensor(np.array(test_input), dtype=torch.float)
    test_distr = test_data["distribution"]

    sub_test_dataset = Data.TensorDataset(X_test, y_test)
    sub_test_loader = Data.DataLoader(
        dataset=sub_test_dataset,           # torch TensorDataset format
        batch_size=len(sub_test_dataset),   # evaluate on all test data in one batch
    )

    weights = np.load(join(W2V_DIR, "word2vex_matrix.npy"))
    model = torch.load(join(CHECKPOINT_DIR, "MLP_model.ckpt"), map_location='cpu')
    print(model)
import os
from os.path import join

import numpy as np
import torch

from settings import emotions, emo2id, DATA_DIR, TRAIN_DATA_NAME, BALANCED_TRAIN_DATA_NAME, \
    CHECKPOINT_DIR, store_obj, load_obj, EMB_DIM, FILTER_NUM, FILTER_SIZES, DROPOUT, \
    TARGET_SIZE, BATCH_SIZE, CNN_HISTORY_PATH

if __name__ == '__main__':
    train_data = load_obj(DATA_DIR, TRAIN_DATA_NAME)
    # bucket the training samples by their emotion label
    samples = {
        "感动": [],
        "同情": [],
        "无聊": [],
        "愤怒": [],
        "搞笑": [],
        "难过": [],
        "新奇": [],
        "温馨": []
    }
    for i, emo in enumerate(train_data["label"]):
        samples[emotions[int(emo[0])]].append(train_data["token"][i])

    # size of the largest class, used as the oversampling target
    max_num = 0
    for (k, v) in samples.items():
        if max_num < len(v):
            max_num = len(v)
    print(max_num)

    for (k, v) in samples.items():
        copy_num = max_num // len(v)
        v_copy = v.copy()
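# --- Hedged continuation sketch (the original loop above is cut off) ---
# One plausible way to finish the class balancing: repeat each class's token
# lists until every class is roughly the size of the largest one. Whether the
# repository flattens the result back into label/token lists and stores it
# under BALANCED_TRAIN_DATA_NAME is an assumption.
def oversample(samples, max_num):
    balanced = {}
    for k, v in samples.items():
        copy_num = max_num // len(v)   # whole copies needed to approach max_num
        balanced[k] = v * copy_num     # simple oversampling by repetition
    return balanced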
import numpy as np
import torch
import torch.utils.data as Data
from os.path import join
from scipy.stats import pearsonr
from sklearn.metrics import f1_score

from settings import W2V_DIR, DATA_DIR, CHECKPOINT_DIR, load_obj


def setup_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':
    setup_seed(1)
    device = torch.device("cpu")

    test_data = load_obj(DATA_DIR, "test_RNN.pkl")
    y_test = torch.tensor(np.array(test_data["label"]), dtype=torch.long)
    X_test = torch.tensor(np.array(test_data["token"]), dtype=torch.long)
    test_distr = test_data["distribution"]

    sub_test_dataset = Data.TensorDataset(X_test, y_test)
    sub_test_loader = Data.DataLoader(
        dataset=sub_test_dataset,           # torch TensorDataset format
        batch_size=len(sub_test_dataset),   # evaluate on all test data in one batch
    )

    weights = np.load(join(W2V_DIR, "word2vex_matrix.npy"))
    model = torch.load(join(CHECKPOINT_DIR, "LSTM_model.ckpt"), map_location='cpu')
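# --- Hedged evaluation sketch (not part of the original script) ---
# After the checkpoint is loaded, evaluation presumably compares predictions
# against both the hard labels and the annotated emotion distributions, which
# would explain the f1_score and pearsonr imports. The model's forward
# signature (a batch of token ids) is assumed.
model.eval()
with torch.no_grad():
    logits = model(X_test)
    probs = torch.softmax(logits, dim=1).numpy()
    preds = logits.argmax(dim=1).numpy()

macro_f1 = f1_score(y_test.squeeze(1).numpy(), preds, average="macro")
# mean Pearson correlation between predicted and annotated distributions
corrs = [pearsonr(p, d)[0] for p, d in zip(probs, test_distr)]
print(macro_f1, float(np.mean(corrs)))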
import os
from os.path import join

import numpy as np
import torch
import torch.utils.data as Data
from sklearn.metrics import f1_score

from settings import W2V_DIR, DATA_DIR, CHECKPOINT_DIR, load_obj, EMB_DIM, HIDDEN_DIM, \
    DROPOUT, TARGET_SIZE, BATCH_SIZE, RNN_HISTORY_PATH


def setup_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':
    # set the random seed
    setup_seed(1)
    device = torch.device("cpu")

    balanced_train_data = load_obj(DATA_DIR, "train_RNN.pkl")
    test_data = load_obj(DATA_DIR, "test_RNN.pkl")
    y_train = torch.tensor(np.array(balanced_train_data["label"]), dtype=torch.long)
    X_train = torch.tensor(np.array(balanced_train_data["token"]), dtype=torch.long)
    y_test = torch.tensor(np.array(test_data["label"]), dtype=torch.long)
    X_test = torch.tensor(np.array(test_data["token"]), dtype=torch.long)
    test_distr = test_data["distribution"]

    sub_train_dataset = Data.TensorDataset(X_train, y_train)
    sub_train_loader = Data.DataLoader(
        dataset=sub_train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
    )