def eval_datasets(grt_df, sys_df) -> Tuple[Metrics, Metrics, Metrics]:
    """Aggregate per-predicate evaluation counts over two aligned datasets.

    For every paired predicate in *sys_df*/*grt_df*, accumulates the three
    count triples produced by ``evaluate`` (unlabelled argument, labelled
    argument, unlabelled role) and wraps each total in a ``Metrics``.

    Returns:
        A (unlabelled_arg, labelled_arg, unlabelled_role) triple of Metrics.
    """
    # One float32 accumulator per metric family, in evaluate()'s output order.
    totals = [np.zeros(3, dtype=np.float32) for _ in range(3)]
    for _key, sys_roles, grt_roles in yield_paired_predicates(sys_df, grt_df):
        for total, local in zip(totals, evaluate(sys_roles, grt_roles)):
            total += np.array(local.as_tuple())
    unlabelled_arg, labelled_arg, unlabelled_role = (Metrics(*t) for t in totals)
    return unlabelled_arg, labelled_arg, unlabelled_role
def dev_test(self, dev_x, dev_y, word2id, tag2id):
    """Evaluate the BiLSTM on a dev/test set and print a metrics report.

    Batches *dev_x*/*dev_y*, predicts labels per batch, converts predicted
    and gold label ids back to tag strings (truncated to each sequence's
    real length), then prints precision/recall/F1 via ``Metrics``.

    Args:
        dev_x: word sequences of the evaluation set.
        dev_y: gold tag sequences of the evaluation set.
        word2id: word -> id vocabulary mapping.
        tag2id: tag -> id label mapping.
    """
    batches_x, batches_y, batches_seq_len = get_batches(
        dev_x, dev_y, word2id, tag2id, self.bilstm.batch_size)
    # Invert tag2id once, up front (dict comprehension instead of a loop).
    id2tag = {id_: tag for tag, id_ in tag2id.items()}
    pred_lists = []
    labels = []
    # zip over the parallel batch lists instead of range(len(...)) indexing.
    for batch_x, batch_y, batch_seq_len in zip(batches_x, batches_y,
                                               batches_seq_len):
        pred_labels = self.pred_labels(batch_x, batch_y, batch_seq_len)
        for preds, golds, seq_len in zip(pred_labels, batch_y, batch_seq_len):
            # Only the first seq_len positions are real tokens (rest is padding).
            for k in range(seq_len):
                pred_lists.append(id2tag[preds[k]])
                labels.append(id2tag[golds[k]])
    metrics = Metrics(labels, pred_lists)
    metrics.report_scores()
def main():
    """Train and evaluate four sequence-labelling models on the same corpus.

    Runs HMM, CRF, BiLSTM and BiLSTM-CRF in turn: each is trained on the
    train split and scored on the test split via ``Metrics.report_scores``.
    The BiLSTM variants also receive the dev split for validation and have
    their TensorFlow sessions closed afterwards.
    """
    print('读取数据...')
    train_word_lists, train_tag_lists, word2id, tag2id = build_corpus('train')
    # NOTE(review): 'maek_vocab' looks like a typo for 'make_vocab', but it
    # must match build_corpus's parameter spelling — confirm before renaming.
    dev_word_lists, dev_tag_lists = build_corpus('dev', maek_vocab=False)
    test_word_lists, test_tag_lists = build_corpus('test', maek_vocab=False)

    print('训练HMM模型...')
    hmm_model = HMMModel(len(tag2id), len(word2id))
    hmm_model.train(train_word_lists, train_tag_lists, word2id, tag2id)
    hmm_preds = hmm_model.test(test_word_lists, word2id, tag2id)
    Metrics(test_tag_lists, hmm_preds).report_scores()

    print('训练CRF模型...')
    crf_model = CRFModel(max_iterations=90)
    crf_model.train(train_word_lists, train_tag_lists)
    crf_preds = crf_model.test(test_word_lists)
    Metrics(test_tag_lists, crf_preds).report_scores()

    print('训练BiLSTM模型...')
    # The neural models need <PAD>/<UNK>-style extensions of the vocabularies.
    word2id, tag2id = extend_maps(word2id, tag2id)
    bilstm = BiLSTM(len(word2id), len(tag2id))
    bilstm.train(train_word_lists, train_tag_lists,
                 dev_word_lists, dev_tag_lists, word2id, tag2id, 0.8)
    bilstm.dev_test(test_word_lists, test_tag_lists, word2id, tag2id)
    bilstm.close_sess()

    print('训练BiLSTM-CRF模型...')
    bilstm_crf = BiLSTM_CRF(len(word2id), len(tag2id))
    bilstm_crf.train(train_word_lists, train_tag_lists,
                     dev_word_lists, dev_tag_lists, word2id, tag2id, 0.8)
    bilstm_crf.dev_test(test_word_lists, test_tag_lists, word2id, tag2id)
    bilstm_crf.close_sess()
# --- Model hyper-parameters and data loading (script fragment; continues
# --- beyond this view: 'adam' is created here but used later) ---
DIM = 256      # embedding output dimension
filters = 256  # GRU units per direction
# num_words and maxlen are defined earlier in the file (not visible here).
print('num_words = {}, maxlen = {}'.format(num_words, maxlen))
# Dataset and labels
fact = np.load(r"../data/train_word_seg_data{}_numwords{}.npy".format(
    maxlen, num_words))
labels = np.load(r"../data/train_label_from_zero_onehot.npy")
# 10% held-out split; fixed random_state for reproducibility.
fact_train, fact_test, labels_train, labels_test = train_test_split(
    fact, labels, test_size=0.1, random_state=1)
# Release the full arrays now that the split copies exist, to cut peak memory.
del labels
del fact
gc.collect()
print("data have been loaded")
# NOTE(review): Metrics() is constructed with no arguments here, unlike the
# Metrics(labels, preds) usage elsewhere — presumably a Keras callback class;
# confirm which Metrics is imported in this module.
metrics = Metrics()
data_input = Input(shape=[maxlen])
# mask_zero=0 is falsy, so padding-index masking is disabled (required by
# CuDNNGRU, which does not support masking).
word_vec = Embedding(input_dim=num_words + 1, input_length=maxlen,
                     output_dim=DIM, mask_zero=0,
                     name='Embedding')(data_input)
# Two stacked bidirectional GRU layers, then a max-pool over the time axis.
x = Bidirectional(CuDNNGRU(filters, return_sequences=True))(word_vec)
x = Bidirectional(CuDNNGRU(filters, return_sequences=True))(x)
x = GlobalMaxPooling1D()(x)
x = BatchNormalization()(x)
# Sigmoid output per label -> multi-label classification head.
x = Dense(labels_train.shape[1], activation="sigmoid")(x)
model = Model(inputs=data_input, outputs=x)
model.summary()
model = multi_gpu_model(model, gpus=2)
# Lowercase 'adam' is the legacy Keras alias for the Adam optimizer class;
# works on old Keras versions — modern code would use keras.optimizers.Adam.
adam = keras.optimizers.adam(lr=0.0001)