# ---- Hyperparameters / configuration ----
keyword_num = 10            # number of keywords (feature words)
test_rate = 0.1             # fraction of data held out as the test set
hidden_dim = 200            # LSTM hidden units (output dimension)
word_emb_dim = 100          # word-embedding dimension
feature_emb_dim = 50        # feature-name embedding dimension
keep_prob = 0.8             # dropout keep probability
num_layers = 1              # number of stacked LSTM layers
batch_size = 200            # examples per mini-batch
learning_rate = 0.0001      # optimizer learning rate
num_epochs = 10             # passes over the training data
add_feature = False         # whether to add feature-name embeddings
add_keyword_attention = False  # whether to add keyword attention

# ---- Load data and build vocabularies ----
print('loading data ...')
# NOTE(review): filter_num is not assigned anywhere in this chunk — it is
# presumably defined elsewhere in the file (or keyword_num was intended);
# confirm before running.
text_data = LoadData('diag_code_data.csv', filter_num, add_feature)
word_vocab, feature_vocab = text_data.build_vocab()
vocab_size = len(word_vocab)
print('the vocabulary size is {}'.format(vocab_size))

label = text_data.label
features = text_data.feature_names
num_features = len(features)
num_classes = len(set(label))
print('num_classes = {}, num_features = {}'.format(num_classes, num_features))

# Convert sentences to id sequences and summarize their lengths.
sentences = text_data.creat_id_sentences(word_vocab, feature_vocab)
sentences_length = [len(sentence) for sentence in sentences]
mean_seq_length = np.mean(sentences_length)
max_seq_length = np.max(sentences_length)
print('mean_seq_length = {}, max_seq_length = {}'.format(
    mean_seq_length, max_seq_length))