keep_prob = 0.8 # dropout保留比例 num_layers = 1 # LSTM层数 batch_size = 50 # 每个batch的大小 learning_rate = 0.01 # 学习率 num_epochs = 2 # 训练数据迭代次数 add_feature_emb = True # 是否加特征名embedding add_keyword_attention = True # 是否加关键词attention add_second_attention = True # 是否加第二层的attention print('loading and processing data ...') process = DataProcess(add_feature_emb, add_keyword_attention) train_data, train_label = process.load_data('train_data_10.csv') test_data, test_label = process.load_data('test_data_10.csv') merge_data = process.merge_feature_text(train_data) word_vocab, feature_vocab = process.build_vocab(merge_data) vocab_size = len(word_vocab) print('the vocabulary size is {}'.format(vocab_size)) train_sentences, train_labels = process.creat_id_sentences( train_data, train_label, word_vocab, feature_vocab) test_sentences, test_labels = process.creat_id_sentences( test_data, test_label, word_vocab, feature_vocab) # features = process.feature_names num_features = len(features) num_classes = len(set(train_label)) test_classes = len(set(test_label)) print('num_classes = {}, num_features = {}'.format(num_classes, num_features)) print('test num_classes = {}'.format(test_classes))
    '患者', '正常', '明显', '每次', '入院', '出院', '年', '月', '日', '天',
    '术后', '考虑', '显示', '我院', '外院'
]
# NOTE(review): the entries above are the tail of a stop-word list whose
# opening ('stop_words = [ ...') lies before this chunk — kept verbatim.

# --- Absolute paths for input datasets and the results file ---
train_data_path = '/home/yanrui/ICD/data/train_data.csv'
test_data_path = '/home/yanrui/ICD/data/test_data.csv'
results_path = '/home/yanrui/ICD/results/our_model_results.txt'

# --- Load raw text/labels through the project-local DataProcess helper ---
# add_entry_emb, entry_names, label_name and word_freq are presumably
# defined earlier in the file (not visible here) — verify before reuse.
print('loading data ...')
process = DataProcess(add_entry_emb, entry_names, label_name, stop_words)
train_text, train_label = process.load_data(train_data_path)
test_text, test_label = process.load_data(test_data_path)

# --- Build word/entry vocabularies from the training split only ---
print('building word and entry vocabulary...')
merge_text = process.merge_entry_text(train_text)
word_vocab, entry_vocab = process.build_vocab(merge_text, word_freq)
vocab_size = len(word_vocab)
num_entries = len(entry_names)

# --- Convert text to index sequences and labels to one-hot vectors ---
print('transform text to index ...')
train_x = process.text_to_index(train_text, word_vocab, entry_vocab)
test_x = process.text_to_index(test_text, word_vocab, entry_vocab)
num_classes = len(set(train_label))
test_classes = len(set(test_label))
train_y, label_list = process.label_to_onehot(train_label)
test_y, _ = process.label_to_onehot(test_label)

# Sequence-length statistics over the indexed training sentences
# (used, presumably, to choose padding/truncation lengths downstream).
sentences_length = [len(x) for x in train_x]
mean_seq_length = np.mean(sentences_length)
max_seq_length = np.max(sentences_length)