def predict(input_str):
    """Predict on CPU, loading the fully pickled model checkpoint."""
    with open(config.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    model = torch.load(os.path.join(config.save_dir, "medical_ner_f1_0.976.ckpt"),
                       map_location="cpu")
    model.eval()
    if not input_str:
        input_str = input("Please enter text: ")
    _, char_ids, seg_ids, _ = prepare_dataset([input_str], char_to_id, tag_to_id,
                                              test=True)[0]
    char_tensor = torch.LongTensor(char_ids).view(1, -1)
    seg_tensor = torch.LongTensor(seg_ids).view(1, -1)
    with torch.no_grad():
        # Get the Viterbi-decoded path and convert it to tags
        paths = model(char_tensor, seg_tensor)
        tags = [id_to_tag[idx] for idx in paths[0]]
    pprint(result_to_json(input_str, tags))
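# result_to_json is defined elsewhere in this repo. A minimal sketch of what
# such a helper might look like, assuming character-level BIOES tags
# (e.g. B-DRUG/I-DRUG/E-DRUG/S-DRUG); names here are illustrative, not the
# repo's actual implementation.
def result_to_json_sketch(text, tags):
    entities = []
    start = None
    for i, tag in enumerate(tags):
        if tag.startswith("S-"):
            entities.append({"word": text[i], "start": i, "end": i + 1,
                             "type": tag[2:]})
        elif tag.startswith("B-"):
            start = i
        elif tag.startswith("E-") and start is not None:
            entities.append({"word": text[start:i + 1], "start": start,
                             "end": i + 1, "type": tag[2:]})
            start = None
    return {"string": text, "entities": entities}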
def cpu_predict(input_str):
    """Predict on CPU by rebuilding the model and loading its state dict."""
    with open(config.data_proc_file, "rb") as f:
        train_data, dev_data, test_data = pickle.load(f)
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        emb_matrix = pickle.load(f)
    device = torch.device("cpu")
    model = NERLSTM_CRF(config, char_to_id, tag_to_id, emb_matrix, device)
    state_dict = torch.load(os.path.join(config.save_dir, "medical_ner.ckpt"),
                            map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()
    if not input_str:
        input_str = input("Please enter text: ")
    _, char_ids, seg_ids, _ = prepare_dataset([input_str], char_to_id, tag_to_id,
                                              test=True)[0]
    char_tensor = torch.LongTensor(char_ids).view(1, -1)
    seg_tensor = torch.LongTensor(seg_ids).view(1, -1)
    with torch.no_grad():
        # Get the Viterbi-decoded path and convert it to tags
        paths = model(char_tensor, seg_tensor)
        tags = [id_to_tag[idx] for idx in paths[0]]
    pprint(result_to_json(input_str, tags))
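# The two prediction paths above differ in what was saved: predict() expects
# the whole pickled module (torch.save(model, path)), while cpu_predict()
# rebuilds NERLSTM_CRF and loads only the weights. A sketch of the matching
# save side (paths illustrative); the state_dict route is the more robust of
# the two because it survives refactors of the model class.
torch.save(model, os.path.join(config.save_dir, "medical_ner_full.ckpt"))           # whole module
torch.save(model.state_dict(), os.path.join(config.save_dir, "medical_ner.ckpt"))   # weights only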
def predict(input_str):
    """Predict on CPU; this variant strips the CRF's <start>/<end> boundary tags."""
    with open(config.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    model = torch.load(os.path.join(config.save_dir, "medical_ner_0.9723.ckpt"),
                       map_location="cpu")
    model.eval()
    if not input_str:
        input_str = input("Please enter text: ")
    _, char_ids, seg_ids, _ = prepare_dataset([input_str], char_to_id, tag_to_id,
                                              test=True)[0]
    char_tensor = torch.LongTensor(char_ids).view(1, -1)
    seg_tensor = torch.LongTensor(seg_ids).view(1, -1)
    with torch.no_grad():
        # Get the Viterbi-decoded path and convert it to tags
        paths = model(char_tensor, seg_tensor)
    tags = [id_to_tag[idx] for idx in paths[0]]
    # Remove the <start> and <end> tags at the beginning and end of the path
    tags.pop(0)
    tags.pop(-1)
    pprint(result_to_json(input_str, tags))
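# Why this variant pops the first and last entries: when the CRF decoder
# returns its boundary states in the path, a hypothetical 3-character input
# decodes to something like the following (tags illustrative):
tags = ["<start>", "B-DRUG", "E-DRUG", "O", "<end>"]
tags.pop(0)    # drop "<start>"
tags.pop(-1)   # drop "<end>"
assert tags == ["B-DRUG", "E-DRUG", "O"]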
def train():
    # 1. Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # 2. Convert the tag scheme: BIO -> BIOES
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # 3. Create the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            # pickle.dump(obj, file[, protocol]) serializes obj into file
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        # Deserialize the mappings back into Python objects;
        # file must expose read() and readline()
        with open(FLAGS.map_file, "rb") as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # 4. Preprocess the data
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id, tag_to_id)

    model_utils.make_path(FLAGS)
    config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
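# update_tag_scheme lives in data_loader. A minimal sketch of the BIO-to-BIOES
# conversion it performs on one tag sequence (function name illustrative):
def bio_to_bioes(tags):
    new_tags = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else "O"
        if tag.startswith("B-"):
            # B- stays B- only if the same entity continues, else it is a singleton S-
            new_tags.append(tag if nxt == "I-" + tag[2:] else "S-" + tag[2:])
        elif tag.startswith("I-"):
            # I- stays I- only if the same entity continues, else it ends the span as E-
            new_tags.append(tag if nxt == "I-" + tag[2:] else "E-" + tag[2:])
        else:
            new_tags.append(tag)
    return new_tags

# e.g. ["B-DRUG", "I-DRUG", "I-DRUG", "O"] -> ["B-DRUG", "I-DRUG", "E-DRUG", "O"]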
def predict_text(model, input_str):
    """Run one prediction and return it as JSON (mappings loaded at module level)."""
    if not input_str:
        input_str = input("Please enter text: ")
    _, char_ids, seg_ids, _ = prepare_dataset([input_str], char_to_id, tag_to_id,
                                              test=True)[0]
    char_tensor = torch.LongTensor(char_ids).view(1, -1)
    seg_tensor = torch.LongTensor(seg_ids).view(1, -1)
    with torch.no_grad():
        # Get the Viterbi-decoded path and convert it to tags
        paths = model(char_tensor, seg_tensor)
        tags = [id_to_tag[idx] for idx in paths[0]]
    return result_to_json(input_str, tags)
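# predict_text relies on char_to_id / tag_to_id / id_to_tag being loaded at
# module level. A usage sketch for an interactive loop (checkpoint name
# illustrative):
model = torch.load(os.path.join(config.save_dir, "medical_ner_f1_0.976.ckpt"),
                   map_location="cpu")
model.eval()
while True:
    text = input("Please enter text (empty to quit): ")
    if not text:
        break
    pprint(predict_text(model, text))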
def train():
    # Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # Convert the tag scheme
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # Create the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare the data
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id, tag_to_id)

    # Batch the data
    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)

    # Create any missing directories
    model_utils.make_path(FLAGS)

    # Load the config file if present, otherwise create and save it
    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    # Configure the logger
    log_path = os.path.join('log', FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    step_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                   config, id_to_word, logger)
        logger.info('Start training')
        loss = []
        start = time.time()
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.setps_chech == 0:  # flag name is spelled this way where FLAGS is defined
                    iteration = step // step_per_epoch + 1
                    logger.info(
                        "iteration{}: step{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % step_per_epoch, step_per_epoch,
                            np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, 'dev', dev_manager, id_to_tag, logger)
            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, 'test', test_manager, id_to_tag, logger)
        t = time.time() - start
        logger.info('cost time: %f' % t)
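# BatchManager comes from data_utils. A minimal sketch of its sort-and-pad
# batching, assuming each datum is [words, word_ids, seg_ids, tag_ids] as
# produced by prepare_dataset (class name illustrative):
import random

class BatchManagerSketch:
    def __init__(self, data, batch_size):
        # Sort by sentence length so each batch needs minimal padding
        data = sorted(data, key=lambda d: len(d[1]))
        self.batch_data = [self.pad(data[i:i + batch_size])
                           for i in range(0, len(data), batch_size)]
        self.len_data = len(self.batch_data)

    @staticmethod
    def pad(batch):
        # Pad every field to the longest sequence in the batch with id 0
        max_len = max(len(d[1]) for d in batch)
        words, word_ids, seg_ids, tag_ids = [], [], [], []
        for d in batch:
            padding = [0] * (max_len - len(d[1]))
            words.append(d[0])
            word_ids.append(d[1] + padding)
            seg_ids.append(d[2] + padding)
            tag_ids.append(d[3] + padding)
        return [words, word_ids, seg_ids, tag_ids]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch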
# Imports
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import data_loader, assignment_slp_v5

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=1)

# Train the model; the empty dictionary collects per-feature info
train_few_feature = {}
trained_model = assignment_slp_v5.train(X_train, y_train, 0, train_few_feature)

# Test the trained model, again passing an empty dictionary
test_new_feature = {}
assignment_slp_v5.test(trained_model, X_test, y_test, test_new_feature)

# Load the validation file
validation_file = data_loader.prepare_dataset("./input/Validation_call.txt")

# Remove the Sample and Subgroup columns from the feature list
# (ftl is the feature list built earlier)
ftl.remove('Sample')
ftl.remove('Subgroup')

# Keep only the selected features from the validation file
go_validation = validation_file[ftl]

# Make predictions
predictions = trained_model.predict(go_validation)

# Export the predictions to './output/predictions.txt'
out_put = pd.DataFrame({'Sample': validation_file.index, 'Subgroup': predictions})
out_put.to_csv('./output/predictions.txt', index=None, quotechar='"',
               sep='\t', quoting=csv.QUOTE_NONNUMERIC)
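# A quick sanity check that the exported file round-trips as written
# (plain pandas; same path as above):
check = pd.read_csv('./output/predictions.txt', sep='\t')
print(check.shape)   # one row per validation sample
print(check.head())  # columns: Sample, Subgroup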
with open(FLAGS.map_file, "wb") as f: pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, 'rb') as f: word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f) train_data = data_loader.prepare_dataset( train_sentences, word_to_id, tag_to_id ) dev_data = data_loader.prepare_dataset( dev_sentences, word_to_id, tag_to_id ) test_data = data_loader.prepare_dataset(
def train():
    # Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # Convert the tag scheme from BIO to BIOES
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # Create the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare the data
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id, tag_to_id)

    # Batch the data
    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)
    print('train_data_num %i, dev_data_num %i, test_data_num %i'
          % (len(train_data), len(dev_data), len(test_data)))

    model_utils.make_path(FLAGS)

    # Load the config file if present, otherwise create and save it
    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    # Configure the logger
    log_path = os.path.join("log", FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                   config, id_to_word, logger)
        logger.info("Start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.setps_chech == 0:  # flag name is spelled this way where FLAGS is defined
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch,
                        np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
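# evaluate() returns whether the dev F1 improved, which gates the
# checkpointing above. A heavily simplified sketch of that bookkeeping;
# model.evaluate, compute_f1, and model.best_dev_f1 are assumptions here,
# not the repo's actual API:
def evaluate_sketch(sess, model, name, manager, id_to_tag, logger):
    ner_results = model.evaluate(sess, manager, id_to_tag)  # assumed method
    f1 = compute_f1(ner_results)  # assumed helper, e.g. conlleval-style scoring
    logger.info("{} F1: {:.4f}".format(name, f1))
    if name == "dev":
        best_so_far = sess.run(model.best_dev_f1)  # assumed TF variable on the model
        if f1 > best_so_far:
            sess.run(model.best_dev_f1.assign(f1))
            return True
    return False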
    features = feature_importances[
        feature_importances['coef_abs'] > threshold]['feature'].values
    return features

# Exploratory snippets kept for reference:
# features_HER2['coef_abs'] = features_HER2['coef'].abs()
# feature_importances = features_HER2.sort_values(by='coef_abs', ascending=False)
# feature_importances[feature_importances['coef_abs'] > 0.1]['feature'].values

features_HER2 = feature_weights('HER2+', 0.2)
plt.plot(features_HER2['coef_mean'])

features_HR = feature_weights('HR+', 0.4)
plt.plot(features_HR['coef_mean'])

features_TN = feature_weights('Triple Neg', 0.4)
plt.plot(features_TN['coef_mean'])

len(features_HER2), len(features_HR), len(features_TN)

# Union of the per-subgroup feature sets
final_features = list(
    set(features_HER2.tolist() + features_HR.tolist() + features_TN.tolist()))
df_features = pd.DataFrame({'features': final_features})
df_features.to_csv('final_features.csv', index=False)
len(final_features)

validation_filename = 'data/Validation_call.txt'
validation = data_loader.prepare_dataset(validation_filename)
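# Only the tail of feature_weights survives above. Note the callers both index
# the result with ['coef_mean'] and call .tolist(), which cannot both hold for
# a single return type, so the function likely changed shape between versions.
# A sketch of a plausible full version that matches the final return statement;
# coef_list and feature_names are assumptions:
def feature_weights_sketch(subgroup, threshold):
    feature_importances = pd.DataFrame({
        'feature': feature_names,                           # assumed: X's column names
        'coef_mean': np.mean(coef_list[subgroup], axis=0),  # assumed: per-fold coefficients
    })
    feature_importances['coef_abs'] = feature_importances['coef_mean'].abs()
    feature_importances = feature_importances.sort_values(by='coef_abs',
                                                          ascending=False)
    return feature_importances[
        feature_importances['coef_abs'] > threshold]['feature'].values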