Example #1
File: main.py, Project: orangerfun/NLP
def predict(input_str):
    
    with open(config.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Predict on the CPU
    model = torch.load(os.path.join(config.save_dir,"medical_ner_f1_0.976.ckpt"), 
                       map_location="cpu"
    )
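    # Switch the model to evaluation mode (disables dropout) before inference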
    model.eval()
    
    if not input_str:
        input_str = input("请输入文本: ")    
    
    _, char_ids, seg_ids, _ = prepare_dataset([input_str], char_to_id, tag_to_id, test=True)[0]
    char_tensor = torch.LongTensor(char_ids).view(1, -1)
    seg_tensor = torch.LongTensor(seg_ids).view(1, -1)
    
    with torch.no_grad():
        # Get the Viterbi-decoded path and convert it to tags
        paths = model(char_tensor, seg_tensor)
        tags = [id_to_tag[idx] for idx in paths[0]]
    
    pprint(result_to_json(input_str, tags))
Example #2
def cpu_predict(input_str):
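    # Load the preprocessed datasets and the pretrained embedding matrix; only emb_matrix is reused below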
    with open(config.data_proc_file, "rb") as f:
        train_data, dev_data, test_data = pickle.load(f)
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        emb_matrix = pickle.load(f)

    with open(config.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    device = torch.device("cpu")  # this helper always runs inference on the CPU
    model = NERLSTM_CRF(config, char_to_id, tag_to_id, emb_matrix, device)
    state_dict = torch.load(os.path.join(config.save_dir, "medical_ner.ckpt"),
                            map_location="cpu")
    model.load_state_dict(state_dict)
    # Predict on the CPU

    model.eval()
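    # Fall back to prompting the user when no input string is given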
    if not input_str:
        input_str = input("请输入文本: ")

    _, char_ids, seg_ids, _ = prepare_dataset([input_str],
                                              char_to_id,
                                              tag_to_id,
                                              test=True)[0]
    char_tensor = torch.LongTensor(char_ids).view(1, -1)
    seg_tensor = torch.LongTensor(seg_ids).view(1, -1)

    with torch.no_grad():
        # Get the Viterbi-decoded path and convert it to tags
        paths = model(char_tensor, seg_tensor)
        tags = [id_to_tag[idx] for idx in paths[0]]

    pprint(result_to_json(input_str, tags))
Example #3
def predict(input_str):

    with open(config.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    model = torch.load(os.path.join(config.save_dir,
                                    "medical_ner_0.9723.ckpt"),
                       map_location="cpu")

    if not input_str:
        input_str = input("请输入文本: ")

    _, char_ids, seg_ids, _ = prepare_dataset([input_str],
                                               char_to_id,
                                               tag_to_id,
                                               test=True)[0]
    char_tensor = torch.LongTensor(char_ids).view(1, -1)
    seg_tensor = torch.LongTensor(seg_ids).view(1, -1)

    model.eval()
    # Get the Viterbi-decoded path and convert it to tags
    with torch.no_grad():
        paths = model(char_tensor, seg_tensor)
        tags = [id_to_tag[idx] for idx in paths[0]]

    # Strip the leading <start> and trailing <end> tags
    tags.pop(0)
    tags.pop(-1)

    pprint(result_to_json(input_str, tags))
Example #4
def train():
    # 1. Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # 2. Convert the tag scheme from BIO to BIOES
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # 3. Build the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)

        with open(FLAGS.map_file, "wb") as f:
            # Serialize: pickle.dump(obj, file[, protocol]) writes the object obj to the file
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        # Deserialize: parse the file contents back into a Python object; the file object must provide read() and readline()
        with open(FLAGS.map_file, "rb") as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # 4. Preprocess the data
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id,
                                             tag_to_id)

    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id,
                                           tag_to_id)

    test_data = data_loader.prepare_dataset(test_sentences, word_to_id,
                                            tag_to_id)

    model_utils.make_path(FLAGS)

    config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
Example #5
    def predict_text(model, input_str):
        if not input_str:
            input_str = input("请输入文本: ")

        _, char_ids, seg_ids, _ = prepare_dataset([input_str],
                                                  char_to_id,
                                                  tag_to_id,
                                                  test=True)[0]
        char_tensor = torch.LongTensor(char_ids).view(1, -1)
        seg_tensor = torch.LongTensor(seg_ids).view(1, -1)

        with torch.no_grad():
            # Get the Viterbi-decoded path and convert it to tags
            paths = model(char_tensor, seg_tensor)
            tags = [id_to_tag[idx] for idx in paths[0]]

        return result_to_json(input_str, tags)
Example #6
def train():
    # Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # Convert the tag scheme
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # Build the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
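        # With pre-trained embeddings enabled, extend the training vocabulary with embedding-file words that appear in the test sentences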
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(
                train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare the data
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id,
                                             tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id,
                                           tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id,
                                            tag_to_id)

    # Split the data into batches
    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)

    # Create any missing directories
    model_utils.make_path(FLAGS)

    # Load the config file if it exists, otherwise create and save one
    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    # Configure the logger
    log_path = os.path.join('log', FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    # allow_soft_placement lets ops without a GPU kernel fall back to the CPU;
    # allow_growth allocates GPU memory on demand instead of reserving it all up front
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True

    step_per_epoch = train_manager.len_data
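    # Create (or restore) the model in a session and train for up to 100 epochs, evaluating on dev and test after each epoch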
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                   config, id_to_word, logger)
        logger.info('开始训练')
        loss = []
        start = time.time()
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.setps_chech == 0:
                    iteration = step // step_per_epoch + 1
                    logger.info(
                        "iteration{}: step{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % step_per_epoch, step_per_epoch,
                            np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, 'dev', dev_manager, id_to_tag, logger)

            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, 'test', test_manager, id_to_tag, logger)
        t = time.time() - start
        logger.info('cost time: %f' % t)
Example #7
# import code and packages
import warnings, os
import pandas as pd
from sklearn.model_selection import train_test_split
import data_loader, assignment_slp_v5

# split the data into training and test sets (X and y are defined earlier in the full script)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# create an empty dictionary and train the model
train_few_feature = {}
trained_model = assignment_slp_v5.train(X_train,y_train,0,train_few_feature)

# create an empty dictionary and evaluate the trained model on the test set
test_new_feature = {}
assignment_slp_v5.test(trained_model, X_test, y_test, test_new_feature)

# load the validation file
validation_file = data_loader.prepare_dataset("./input/Validation_call.txt")

# remove the sample and the subgroup columns
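# NOTE: ftl is assumed to be the list of feature-column names built earlier in the full script (not shown in this snippet)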
ftl.remove('Sample')
ftl.remove('Subgroup')

# extract only certain features from the validation_file
go_validation = validation_file[ftl]

# make predictions
predictions = trained_model.predict(go_validation)

# export the predictions to a file: 'predictions.txt'
import csv
out_put = pd.DataFrame({'Sample': validation_file.index, 'Subgroup': predictions})
out_put.to_csv('./output/predictions.txt', index=None, quotechar='"', sep='\t', quoting=csv.QUOTE_NONNUMERIC)
Example #8
    with open(FLAGS.map_file, "wb") as f:
        pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
else:
    with open(FLAGS.map_file, 'rb') as f:
        word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

train_data = data_loader.prepare_dataset(
    train_sentences, word_to_id, tag_to_id
)

dev_data = data_loader.prepare_dataset(
    dev_sentences, word_to_id, tag_to_id
)

test_data = data_loader.prepare_dataset(
    test_sentences, word_to_id, tag_to_id
)
Example #9
def train():
    # Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # Convert the tag scheme from BIO to BIOES
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # Build the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable(
                        [[w[0] for w in s] for s in test_sentences]
                    )
                )
            )
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)

        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)

        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    train_data = data_loader.prepare_dataset(
        train_sentences, word_to_id, tag_to_id
    )

    dev_data = data_loader.prepare_dataset(
        dev_sentences, word_to_id, tag_to_id
    )

    test_data = data_loader.prepare_dataset(
        test_sentences, word_to_id, tag_to_id
    )

    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)

    print('train_data_num %i, dev_data_num %i, test_data_num %i' % (len(train_data), len(dev_data), len(test_data)))

    model_utils.make_path(FLAGS)

    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    # Allocate GPU memory on demand instead of reserving it all up front
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                   config, id_to_word, logger)
        logger.info("开始训练")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.setps_chech == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration {}: step {}/{}, NER loss: {:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)

            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #10
    features = feature_importances[
        feature_importances['coef_abs'] > threshold]['feature'].values

    return features


# features_HER2['coef_abs'] = features_HER2['coef'].abs()
# feature_importances = features_HER2.sort_values(by='coef_abs', ascending=False)
# feature_importances[feature_importances['coef_abs'] > 0.1]['feature'].values

features_HER2 = feature_weights('HER2+', 0.2)

plt.plot(features_HER2['coef_mean'])

features_HR = feature_weights('HR+', 0.4)
plt.plot(features_HR['coef_mean'])

features_TN = feature_weights('Triple Neg', 0.4)
plt.plot(features_TN['coef_mean'])

len(features_HER2), len(features_HR), len(features_TN)

final_features = list(
    set(features_HER2.tolist() + features_HR.tolist() + features_TN.tolist()))
df_features = pd.DataFrame({'features': final_features})
df_features.to_csv('final_features.csv', index=False)
len(final_features)

validation_filename = 'data/Validation_call.txt'
validation = data_loader.prepare_dataset(validation_filename)