import numpy as np
import tensorflow as tf
from sklearn import metrics


def test():
    config = CONFIG()
    print('Loading word2id ===========================')
    word2id = load_word2id(config.word2id_path)
    config.vocab_size = len(word2id)
    print('Loading test corpus =========================')
    x, y = load_corpus(config.test_path, word2id, max_sen_len=config.max_sen_len)
    # x, y = x[:10], y[:10]
    model = TextCNN(config)
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        # Restore the latest checkpoint before predicting
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(config.save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        yhat = model.predict(sess, x)
        cat, cat2id = cat_to_id()
        y_cls = np.argmax(y, 1)
        # Evaluation
        print("Precision, Recall and F1-Score...")
        print(metrics.classification_report(y_cls, yhat, target_names=cat))
        # Confusion matrix
        print("Confusion Matrix...")
        cm = metrics.confusion_matrix(y_cls, yhat)
        print(cm)
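# For reference, a minimal sketch of a load_word2id helper as used above.
# Assumption (not from the source): the vocab file stores one "word<TAB>id"
# pair per line; the repo's actual file format may differ.
def load_word2id_sketch(path):
    """Read a word-to-id vocabulary file into a dict."""
    word2id = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                word2id[parts[0]] = int(parts[1])
    return word2id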
import tensorflow as tf


def train():
    config = CONFIG()
    print('Loading word2id ===========================')
    word2id = load_word2id(config.word2id_file)
    config.vocab_size = len(word2id)
    print('Loading word2vec ==========================')
    word2vec = load_corpus_word2vec(config.corpus_w2v_file)
    print('Loading train corpus ========================')
    train = load_corpus(config.train_file, word2id, max_sen_len=config.max_sen_len)
    x_tr = train[:-1]
    y_tr = train[-1]
    print('Loading test corpus ==========================')
    test = load_corpus(config.test_file, word2id, max_sen_len=config.max_sen_len)
    x_te = test[:-1]
    y_te = test[-1]
    print('Training model ===============================')
    lstm = LSTM(config, embeddings=word2vec)  # pass the config instance, not the CONFIG class
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        lstm.fit(sess, x_tr, y_tr, x_te, y_te, config.save_dir, config.print_per_batch)
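# A hedged sketch of load_corpus_word2vec, assuming corpus_w2v_file stores one
# whitespace-separated float vector per line, ordered by word id. That layout
# is an assumption, not confirmed by the source.
import numpy as np

def load_corpus_word2vec_sketch(path):
    """Load pre-trained word vectors into a (vocab_size, dim) float32 array."""
    vecs = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            vecs.append([float(x) for x in line.strip().split()])
    return np.asarray(vecs, dtype=np.float32)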
from functools import partial

from torch.utils.data import DataLoader


def main():
    # Build unigram and bigram vocabularies on the training set
    word2id = load_word2id(length=VOCAB_SIZE)
    # Prepare data loaders for the deep learning models
    train_loader_dl = DataLoader(
        dataset=DianPingDataSet("train"),
        batch_size=64,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN)
    )
    test_loader_dl = DataLoader(
        dataset=DianPingDataSet("test"),
        batch_size=64,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN)
    )
    vocab_size = len(word2id)
    print("Vocab Size:", vocab_size)
    print("Loading word vectors...")
    try:
        embedding = load_embeddings(word2id)
    except FileNotFoundError:
        embedding = None
    # Train and evaluate the deep learning models (CNN, LSTM)
    print("Training the BiLSTM model...")
    lstm_model = DeepModel(vocab_size, embedding, method="lstm")
    lstm_model.train_and_eval(train_loader_dl, test_loader_dl)
    print("Training the CNN model...")
    cnn_model = DeepModel(vocab_size, embedding, method="cnn")
    cnn_model.train_and_eval(train_loader_dl, test_loader_dl)
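# A sketch of what collate_fn_dl could look like, matching the
# partial(collate_fn_dl, word2id, SENT_MAX_LEN) binding above. Assumptions
# (not from the source): each dataset sample is a (tokens, label) pair, and
# id 0 serves both as the padding index and the out-of-vocabulary id.
import torch

def collate_fn_dl_sketch(word2id, max_len, batch):
    """Pad/truncate a batch of token lists to max_len and return LongTensors."""
    sent_ids, labels = [], []
    for tokens, label in batch:
        ids = [word2id.get(t, 0) for t in tokens][:max_len]
        ids += [0] * (max_len - len(ids))  # right-pad with the pad id
        sent_ids.append(ids)
        labels.append(int(label))
    return torch.tensor(sent_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)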
import tensorflow as tf


def train():
    config = CONFIG()
    print('Loading word2id ===========================')
    word2id = load_word2id(config.word2id_path)
    print('Loading word2vec ==========================')
    word2vec = load_corpus_word2vec(config.corpus_word2vec_path)
    print('Loading train corpus ========================')
    x_tr, y_tr = load_corpus(config.train_path, word2id, max_sen_len=config.max_sen_len)
    print('Loading dev corpus ==========================')
    x_val, y_val = load_corpus(config.dev_path, word2id, max_sen_len=config.max_sen_len)
    print('Training model ===============================')
    tc = TextCNN(config, embeddings=word2vec)  # pass the config instance, not the CONFIG class
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        tc.fit(sess, x_tr, y_tr, x_val, y_val, config.save_dir, config.print_per_batch)
from functools import partial

from torch.utils.data import DataLoader


def main():
    """Build unigram and bigram vocabularies on the training set."""
    word2id = load_word2id(length=VOCAB_SIZE)
    # Prepare the datasets
    train_loader = DataLoader(
        dataset=DPDataSet('train'),
        batch_size=batch_size,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN),
        drop_last=True,
        pin_memory=True,
        # num_workers=4,
        shuffle=True)
    test_loader = DataLoader(
        dataset=DPDataSet("test"),
        batch_size=batch_size,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN),
        pin_memory=True,
        drop_last=True,
        shuffle=True)
    vocab_size = len(word2id)
    print("Vocab Size:", vocab_size)
    print("Loading word vectors...")
    try:
        embedding = load_embeddings(word2id)
    except FileNotFoundError:
        embedding = None
    print("Testing BiLSTM:")
    lstm_model = DeepModel(vocab_size, embedding, method="lstm")
    lstm_model.train_and_eval(train_loader, test_loader)
    print("Testing CNN:")
    cnn_model = DeepModel(vocab_size, embedding, method="cnn")
    cnn_model.train_and_eval(train_loader, test_loader)
    print("Testing self-attention:")
    att_model = DeepModel(vocab_size, embedding, method="self_att")
    att_model.train_and_eval(train_loader, test_loader)
    print("Testing LSTM with attention:")
    lstm_att_model = DeepModel(vocab_size, embedding, method="lstm_att")
    lstm_att_model.train_and_eval(train_loader, test_loader)
    print("Testing RCNN:")
    rcnn_model = DeepModel(vocab_size, embedding, method="rcnn")
    rcnn_model.train_and_eval(train_loader, test_loader)
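# A sketch of a load_embeddings helper consistent with the FileNotFoundError
# handling above. Assumptions (not from the source): the vectors live in a
# text file of "word v1 v2 ..." lines (the path and dim below are
# hypothetical), and words without a pretrained vector keep a small random
# initialization.
import numpy as np

def load_embeddings_sketch(word2id, path="data/wordvec.txt", dim=300):
    embedding = np.random.uniform(-0.1, 0.1, (len(word2id), dim)).astype(np.float32)
    with open(path, encoding="utf-8") as f:  # raises FileNotFoundError if missing
        for line in f:
            parts = line.rstrip().split()
            word, vec = parts[0], parts[1:]
            if word in word2id and len(vec) == dim:
                embedding[word2id[word]] = np.asarray(vec, dtype=np.float32)
    return embedding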
import jieba as jb
import numpy as np


def sent_to_id(inputs):
    """Tokenize each sentence, then map every token to its id in word_to_id."""
    sentences = []
    cut_sents = [jb.cut(w) for w in inputs]
    config = CONFIG()
    word2id = load_word2id(config.word2id_path)
    for cut_sent in cut_sents:
        sentence = [word2id.get(w, 0) for w in cut_sent]
        # Truncate to max_sen_len, then right-pad short sentences with _PAD_
        sentence = sentence[:config.max_sen_len]
        if len(sentence) < config.max_sen_len:
            sentence += [word2id['_PAD_']] * (config.max_sen_len - len(sentence))
        sentences.append(sentence)
    return np.asarray(sentences)
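# Example usage of sent_to_id; the input sentences are illustrative only.
# Each row of the result is a fixed-length id sequence of max_sen_len
# (jieba also passes through space-separated English tokens).
if __name__ == '__main__':
    demo = sent_to_id(['这部电影很好看', '服务太差，不会再来了'])
    print(demo.shape)  # (2, max_sen_len)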
import torch
import torch.nn as nn

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device', device)

glove_vocab, glove_embeddings = get_glove_embeddings(P.EMBEDDING_DIM)

# Build the vocabulary on the first run, reuse it afterwards
if not does_word2id_exist(P):
    word2id = Word2Id()
    train_data = get_single_dataset(
        "data/experiment_data/bidaf/{}_short/{}-v1.1.json".format(P.MERGE_TYPE, "train"),
        word2id, P.BATCH_SIZE, True, P.MIN_OCCURENCE, True, glove_vocab, False)
    save_word2id(P, word2id)
else:
    word2id = load_word2id(P)
    train_data = get_single_dataset(
        "data/experiment_data/bidaf/{}_short/{}-v1.1.json".format(P.MERGE_TYPE, "train"),
        word2id, P.BATCH_SIZE, False, P.MIN_OCCURENCE, True, glove_vocab, False)

vocab_size = len(word2id.id2w)
embeddings_matrix = get_embeddings_matrix(glove_embeddings, word2id, vocab_size,
                                          P.EMBEDDING_DIM)

# %% Model
model = Model(word2id, P.HIDDEN_DIM, P.EMBEDDING_DIM, embeddings_matrix,
              P.USE_BILINEAR).to(device)
saliency_loss_fn = nn.MSELoss()
decoder_loss_fn = nn.NLLLoss()
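# A sketch of get_embeddings_matrix under these assumptions (not from the
# source): glove_embeddings maps word -> vector, word2id.id2w is a list of
# words indexed by id, and words missing from GloVe keep a small random
# initialization.
import numpy as np

def get_embeddings_matrix_sketch(glove_embeddings, word2id, vocab_size, embedding_dim):
    matrix = np.random.normal(0, 0.1, (vocab_size, embedding_dim)).astype(np.float32)
    for idx, word in enumerate(word2id.id2w):
        if word in glove_embeddings:
            matrix[idx] = glove_embeddings[word]
    return matrix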
import tensorflow as tf

# Hyperparameters
update_w2v = True        # fine-tune the word vectors during training
n_class = 8              # number of output classes
max_sen_len = 50         # maximum sentence length in tokens
embedding_dim = 50       # word-vector dimension
batch_size = 160
output_channels = 20
n_hidden = 256
n_epoch = 5
learning_rate = 0.01
drop_keep_prob = 0.4     # dropout keep probability
num_filters = 256
kernel_size = 3

config = CONFIG()
word2id = load_word2id('./data/word_to_id.txt')
print('Loading word2vec ==========================')
word2vec = load_corpus_word2vec('./data/corpus_word2vec.txt')
print('Loading train corpus ========================')
train = load_corpus('./data/train/', word2id, max_sen_len=config.max_sen_len)
print('Loading dev corpus ==========================')
dev = load_corpus('./data/dev/', word2id, max_sen_len=config.max_sen_len)
print('Loading test corpus =========================')
test = load_corpus('./data/test/', word2id, max_sen_len=config.max_sen_len)

x_tr, y_tr = train
x_val, y_val = dev

tc = TextCNN(config=config, embeddings=word2vec)
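# A sketch of load_corpus under assumptions not confirmed by the source: each
# file in the directory is named "<label>_*.txt" (hypothetical), holds one
# pre-tokenized sentence per line, id 0 doubles as pad/OOV, and labels come
# back one-hot encoded to match the np.argmax(y, 1) in test() above.
import os
import numpy as np

def load_corpus_sketch(path, word2id, max_sen_len=50, n_class=8):
    xs, ys = [], []
    for fname in os.listdir(path):
        label = int(fname.split('_')[0])
        with open(os.path.join(path, fname), encoding='utf-8') as f:
            for line in f:
                ids = [word2id.get(w, 0) for w in line.split()][:max_sen_len]
                ids += [0] * (max_sen_len - len(ids))
                xs.append(ids)
                onehot = np.zeros(n_class, dtype=np.float32)
                onehot[label] = 1.0
                ys.append(onehot)
    return np.asarray(xs), np.asarray(ys)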