Example #1
    def train(self):
        batch_manager = BatchManager()
        self.head_input_size = batch_manager.head_vocab_size
        self.tail_input_size = batch_manager.tail_vocab_size
        self.relation_input_size = batch_manager.relation_vocab_size
        data_map = {
            "head_size": self.head_input_size,
            "tail_size": self.tail_input_size,
            "relation_size": self.relation_input_size,
            "head_vocab": batch_manager.head_vocab,
            "tail_vocab": batch_manager.tail_vocab,
            "relation_vocab": batch_manager.relation_vocab
        }
        f = open("models/data_map.pkl", "wb")
        pickle.dump(data_map, f)
        f.close()

        self.init_model()
        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                print("[->] restore model")
                self.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print("[->] no model, initializing")
                sess.run(tf.global_variables_initializer())

            for i in range(200):
                print("epoch {}".format(i))
                for batch in batch_manager.get_batch():
                    loss = self.step(batch, sess)
                    print("\tloss: {}".format(loss))
                    self.saver.save(sess, self.checkpoint_path)
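
Note: every example on this page revolves around a project-specific BatchManager, and the constructors and iteration methods differ from repo to repo (get_batch(), iter_batch(shuffle=True), len_data). As rough orientation only, a minimal sketch of the common pattern (sort sentences by length, cut into fixed-size batches, pad within each batch) might look like the code below; the field layout and padding here are assumptions, not the code of any of these projects.

import math
import random


class BatchManager:
    """Minimal sketch of the batching pattern seen in these examples (an assumption, not any repo's code)."""

    def __init__(self, data, batch_size):
        # data: one [chars, char_ids, seg_ids, tag_ids] list per sentence
        self.batch_data = self._sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)  # number of batches

    def _sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))
        sorted_data = sorted(data, key=lambda s: len(s[0]))
        return [self._pad(sorted_data[i * batch_size:(i + 1) * batch_size])
                for i in range(num_batch)]

    @staticmethod
    def _pad(batch):
        # pad every field of every sentence to the longest sentence in the batch
        max_len = max(len(item[0]) for item in batch)
        padded = [[], [], [], []]
        for chars, char_ids, seg_ids, tag_ids in batch:
            pad = [0] * (max_len - len(chars))
            padded[0].append(chars + pad)
            padded[1].append(char_ids + pad)
            padded[2].append(seg_ids + pad)
            padded[3].append(tag_ids + pad)
        return padded

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch

Under that assumption, iter_batch(shuffle=True) yields per-batch [chars, char_ids, seg_ids, tag_ids] lists of equal length, which is roughly what calls such as model.run_step(sess, True, batch) in the examples below consume.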
Example #2
 def get_data(self):
     train_valid_test = os.path.join(FLAGS.pkl_dir, "train_valid_test.pkl")
     if os.path.exists(train_valid_test):  # if train_valid_test has already been processed and saved
         with open(train_valid_test, 'rb') as data_f:
             train_data, valid_data, test_data, true_label_pert = pickle.load(
                 data_f)
     else:  # otherwise read the raw dataset and build the train/valid/test splits
         with open(FLAGS.traning_data_path, "r",
                   encoding="utf-8") as data_f:
             all_data = csv.reader(data_f, delimiter='\t', quotechar='|')
             # get tf-idf values
             tfidf_path = "data/tfidf.txt"  # path of the file storing tf-idf values
             if not os.path.exists(tfidf_path):
                 get_tfidf_and_save(all_data, tfidf_path)
             tfidf_dict = load_tfidf_dict(tfidf_path)
             # load fastText word vectors
             fasttext_path = FLAGS.fasttext_model_path
             fasttext_dict = load_vector(fasttext_path)
             # load word2vec word vectors
             word2vec_path = "data/word2vec_word_model.txt"
             word2vec_dict = load_vector(word2vec_path)
         with open(FLAGS.traning_data_path, "r",
                   encoding="utf-8") as data_f:
             all_data = csv.reader(data_f, delimiter='\t', quotechar='|')
             # feature engineering based on sentence length, shared words, tf-idf values, fastText vectors and word2vec vectors; returns the corresponding feature vectors
             features_vector = features_engineer(all_data,
                                                 fasttext_dict,
                                                 word2vec_dict,
                                                 tfidf_dict,
                                                 FLAGS.tokenize_style,
                                                 n_gram=8)
         with open(FLAGS.traning_data_path, "r",
                   encoding="utf-8") as data_f:
             all_data = csv.reader(data_f, delimiter='\t', quotechar='|')
             # serialize the sentences: map each word to its index, used as input features
             sentences_1, sentences_2, labels = sentence_word_to_index(
                 all_data, self.word_to_index, self.label_to_index,
                 FLAGS.tokenize_style)
             """
             打乱数据、padding、添加features_vector到数据中并根据比例分割成train、valid、test数据,
             train、valid、test里面又依次包含sentences_1,sentences_2,features_vector,labels四种数据
             """
             train_data, valid_data, test_data, true_label_pert = shuffle_padding_split(
                 sentences_1, sentences_2, labels, features_vector,
                 train_valid_test, FLAGS.sentence_len)
     self.features_vector_size = len(train_data[2][0])
     # print("features_vector_size:", self.features_vector_size)
     print("训练集大小:", len(train_data[0]), "验证集大小:", len(valid_data[0]),
           "正样本比例:", true_label_pert)
     # batch managers for the train, valid and test data
     self.train_batch_manager = BatchManager(train_data,
                                             int(FLAGS.batch_size))
     print("训练集批次数量:", self.train_batch_manager.len_data)
     self.valid_batch_manager = BatchManager(valid_data,
                                             int(FLAGS.batch_size))
     self.test_batch_manager = BatchManager(test_data,
                                            int(FLAGS.batch_size))
Example #3
def train():
    # load the training data
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    # load the dev and test sets
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    # Use selected tagging scheme (IOB / IOBES): I = inside, O = other, B = begin, E = end, S = single
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    _c, char_to_id, id_to_char = char_mapping(
        train_sentences, FLAGS.lower)  # count character frequencies and assign an id to each character
    _t, tag_to_id, id_to_tag = tag_mapping(
        train_sentences, FLAGS.id_to_tag_path,
        FLAGS.tag_to_id_path)  # count entity tag frequencies and assign an id to each entity tag
    # write the mapping dictionaries into a pkl file
    with open(FLAGS.map_file, "wb") as f:
        pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    # prepare the data: build the index lists fed to the network for training
    train_data = prepare_dataset(  # train_data[0][0]: the sentence; [0][1]: character ids; [0][2]: segmentation features (0 for a single-character word, patterns like 1,2,...,2,3 for longer words); [0][3]: the tag of each character
        train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    train_manager = BatchManager(
        train_data, FLAGS.batch_size)  # split the data into batches (60 sentences per batch here), giving an iterable object
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    config = config_model(char_to_id, tag_to_id)  # fill in the model configuration
    # limit GPU usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, load_word2vec, config, id_to_char)
        saver = tf.train.Saver()  # used to save the model
        with tf.device("/cpu:0"):
            for i in range(100):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(
                        sess, True, batch)  # train the model batch by batch; training starts here, and the whole network can be traced backwards from this call
                # validate every 5 epochs and compute the model's F1
                if (i + 1) % 5 == 0:
                    f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag)
                    print("F1 on the dev set:", f1)
                # save the model every 20 epochs
                if (i + 1) % 20 == 0:
                    saver.save(sess, save_path=FLAGS.ckpt_path)
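
update_tag_scheme in this example converts the corpus tags to the selected scheme. For reference, converting IOB to IOBES amounts to rewriting a B- tag with no following I- as S- and the final I- of an entity as E-; the following is only a minimal sketch under that description, not this project's implementation.

def iob_to_iobes(tags):
    """Convert one sentence's IOB tags to IOBES (sketch only)."""
    new_tags = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else "O"
        if tag == "O":
            new_tags.append(tag)
        elif tag.startswith("B-"):
            # B- becomes S- when the entity spans a single token
            new_tags.append(tag if nxt.startswith("I-") else "S-" + tag[2:])
        elif tag.startswith("I-"):
            # I- becomes E- at the last token of the entity
            new_tags.append(tag if nxt.startswith("I-") else "E-" + tag[2:])
        else:
            raise ValueError("unexpected tag: " + tag)
    return new_tags

# iob_to_iobes(["B-LOC", "I-LOC", "O", "B-PER"]) -> ["B-LOC", "E-LOC", "O", "S-PER"]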
Example #4
def main(_):
    if not os.path.isdir(FLAGS.log_path):
        os.makedirs(FLAGS.log_path)
    if not os.path.isdir(FLAGS.model_path):
        os.makedirs(FLAGS.model_path)
    if not os.path.isdir(FLAGS.result_path):
        os.makedirs(FLAGS.result_path)

    tag_to_id = FLAGS.tag_to_id

    # specific_file = "data/mor-test/test_set.mor"
    specific_file = "../mor_v1_addr.test"
    # other inputs toggled here previously: "../addr_all.test",
    # "data/mor-test_code/mor_iter_v1/mor_person_label_v2.txt", FLAGS.test_file,
    # "data/rule_gen/rule_gen.test_code", "data/sighan/sighan.test_code"
    # load data
    id_to_word, id_to_tag, _, _, test_data = load_data(FLAGS, tag_to_id, only_use_test=True, specific_file=specific_file)
    test_manager = BatchManager(test_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.valid_batch_size)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model = create_model(sess, id_to_word, id_to_tag)
        # test model
        model.logger.info("testing ner")
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_path)
        model.logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        ner_results = model.predict(sess, test_manager)
        eval_lines = conll_eval(ner_results, FLAGS.result_path)
        for line in eval_lines:
            model.logger.info(line)
Example #5
 def predict():
     """
     Run entity recognition on a whole dataset.
     :return:
     """
     config = load_config(FLAGS.config_file)
     logger = get_logger(FLAGS.log_file)
     tf_config = tf.ConfigProto()
     tf_config.gpu_options.allow_growth = True  # limit GPU memory
     # restore the mapping dictionaries from the map_file produced during training
     with open(FLAGS.map_file, "rb") as f:
         char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
     test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower,
                                     FLAGS.zeros)
     test_data = prepare_dataset(test_sentences,
                                 char_to_id,
                                 tag_to_id,
                                 FLAGS.lower,
                                 train=False)
     test_manager = BatchManager(test_data, 1)
     with tf.Session(config=tf_config) as sess:
         model = create_model(sess, Model, FLAGS.ckpt_path, config,
                              id_to_char, logger)
         logger.info("predict data......")
         ner_results = model.predict(sess, test_manager, id_to_tag)
         result_write_evaluate(ner_results, FLAGS.result_path, "test")
Example #6
File: main.py Project: wshzd/NER
def main(_):

    if FLAGS.train:
        if FLAGS.clean:
            clean(FLAGS)
        train()
    else:
        # evaluate the model on the test data below
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        log_path = os.path.join("log", FLAGS.log_file)
        config = load_config(FLAGS.config_file)
        logger = get_logger(log_path)
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   log_device_placement=True)
        test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower,
                                        FLAGS.zeros)
        test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                    FLAGS.lower)
        test_manager = BatchManager(test_data, 100)
        with tf.Session(config=tf_config) as sess:
            sess.run(tf.global_variables_initializer())
            model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                 config, id_to_char, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #7
File: main.py Project: aiedward/Cner_v1
def test():
    # load the config file
    config = load_config(FLAGS.config_file)
    # set up the logger
    log_path = os.path.join("log", FLAGS.test_log_file)
    logger = get_logger(log_path)
    # configure the GPU
    tf_config = tf.ConfigProto()
    # load the test set
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    # read the mapping dictionaries
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    # format the test data
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    # build the test batches
    test_manager = BatchManager(test_data, 20)
    with tf.Session(config=tf_config) as sess:
        logger.info("start testing...")
        start = time.time()
        # restore the model from the saved checkpoint
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # evaluate on the test batches
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
        logger.info("The best_f1 on test_dataset is {}".format(
            model.best_test_f1.eval()))
        logger.info('Time test for 10 batch is {} sec\n'.format(time.time() -
                                                                start))
Example #8
 def get_batch_data(self):
     """
     Build the batch managers for the training and dev sets: first process the sentence sequences with the
     mapping dictionaries to get each sentence's feature lists and gold label list, then create the batch
     managers used to generate batch data.
     :return:
     """
     if not os.path.isfile(FLAGS.train_dev_file):
         train_data = prepare_dataset(self.train_sentences, self.char_to_id, self.tag_to_id, FLAGS.lower)
         dev_data = prepare_dataset(self.dev_sentences, self.char_to_id, self.tag_to_id, FLAGS.lower)
         with open(FLAGS.train_dev_file, "wb") as f:
             pickle.dump([train_data, dev_data], f)
     else:
         with open(FLAGS.train_dev_file, "rb") as f:
             train_data, dev_data = pickle.load(f)
     print("%i / %i  sentences in train / dev ." % (len(train_data), len(dev_data)))
     self.train_batch_manager = BatchManager(train_data, int(FLAGS.batch_size))
     self.dev_batch_manager = BatchManager(dev_data, int(FLAGS.batch_size))
Example #9
def main():
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    train_sentences = load_sentences(args.train_file)
    dev_sentences = load_sentences(args.dev_file)
    test_sentences = load_sentences(args.test_file)

    update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    update_tag_scheme(dev_sentences, args.tag_schema)

    with open(args.map_file, 'rb') as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)

    train_manager = BatchManager(train_data, args.batch_size, args.num_steps)
    dev_manager = BatchManager(dev_data, 100, args.num_steps)
    test_manager = BatchManager(test_data, 100, args.num_steps)

    if args.cuda >= 0:
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        device = torch.device(args.cuda)
    else:
        device = torch.device('cpu')
    print("device: ", device)

    if args.train:
        train(id_to_char, id_to_tag, train_manager, dev_manager, device)
    f1, res_info = eval_model(id_to_char, id_to_tag, test_manager, device,
                              args.log_name)
    log_handler.info("\n resinfo {} \v F1: {} ".format(res_info, f1))
Example #10
def train():
    # ----------------------------------- data preparation -------------------------------------
    train_manager = BatchManager(batch_size=20, name='train')
    test_manager = BatchManager(batch_size=100, name='test')

    # ----------------------------------- load the dictionary -------------------------------------
    mapping_dict = get_dict(dict_file)

    # ----------------------------------- build the model -------------------------------------
    model = Model(mapping_dict)

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(5):
            j = 1
            for batch in train_manager.iter_batch(shuffle=True):
                start = time.time()
                loss = model.run_step(sess, batch)
                end = time.time()
                if j % 5 == 0:
                    print('epoch:{},step:{}/{},loss:{},elapse:{},estimate:{}'.
                          format(i + 1, j, train_manager.len_data, loss,
                                 end - start,
                                 (end - start) * (train_manager.len_data - j)))
                j += 1
            for batch in test_manager.iter_batch(shuffle=True):
                test_result = model.predict(sess,
                                            batch,
                                            istrain=False,
                                            istest=True)
                print('precision rate: {}%'.format(test_result[1]))
Example #11
def test():
    make_path(FLAGS)
    config = load_config(FLAGS.config_file)
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)
    test_manager = BatchManager(test_data, 100)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #12
def test(param):
    # sanity-check the parameters
    assert param.clip < 5.1, "gradient clip shouldn't be too large"
    assert 0 <= param.dropout < 1, "dropout rate must be between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"
    # get the batch manager
    test_manager = BatchManager(param.test_batch_size, name='test')
    number_dataset = test_manager.len_data
    print("total of number test data is {}".format(number_dataset))
    # set up logging
    logger = get_logger(param.test_log_file)
    # read the dictionary
    mapping_dict = get_dict(param.dict_file)
    # build the model
    model = Model(param, mapping_dict)
    # configure GPU options
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        logger.info("start testing...")
        start = time.time()
        # first check whether a checkpoint exists
        ckpt_path = param.ckpt_path
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        # see if a trained model is available
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Reading model parameters from {}".format(
                ckpt.model_checkpoint_path))
            # if so, restore it
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Cannot find the ckpt files!")
        # start evaluation
        evaluate(sess, param, model, "test", test_manager, logger)
        logger.info("The best_f1 on test_dataset is {:.2f}".format(
            model.best_test_f1.eval()))
        logger.info('Time test for {:.2f} batch is {:.2f} sec\n'.format(
            param.test_batch_size,
            time.time() - start))
Example #13
def train():
    # load data sets
    # sentence collection = [[sentence1], [sentence2], [sentence3]]; sentence1 = [我 O, 在 O, ...]
    #<class 'list'>: [['海', 'O'], ['钓', 'O'], ['比', 'O'], ['赛', 'O'], ['地', 'O'], ['点', 'O'], ['在', 'O'], ['厦', 'B-LOC'], ['门', 'I-LOC'], ['与', 'O'], ['金', 'B-LOC'], ['门', 'I-LOC'], ['之', 'O'], ['间', 'O'], ['的', 'O'], ['海', 'O'], ['域', 'O'], ['。', 'O']]
    # train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    # test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    from xlnet_base.xlnet_data_utils import XLNetDataUtils
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load('./chinese_xlnet_base_L-12_H-768_A-12/spiece.model')

    train_data = XLNetDataUtils(sp_model,
                                batch_size=FLAGS.batch_size,
                                entry="train")
    dev_data = XLNetDataUtils(sp_model,
                              batch_size=FLAGS.batch_size,
                              entry="dev")
    test_data = XLNetDataUtils(sp_model,
                               batch_size=FLAGS.batch_size,
                               entry="test")
    dev_batch = dev_data.iteration()

    def datapadding(data):
        alldatalist = []

        datalist = data.data
        max_length = 64
        for i in range(len(datalist)):
            tmpdatalist = []
            token = datalist[i][0]
            segmentid = datalist[i][1]
            inputid = datalist[i][2]
            inputmask = datalist[i][3]
            labellist = datalist[i][4]
            # pad or truncate the label list to max_length
            if len(labellist) < max_length:
                for i in range(max_length - len(labellist)):
                    labellist.append(0)
            elif len(labellist) > max_length:
                tmplabellist = []
                for i in range(max_length):
                    tmplabellist.append(labellist[i])
                labellist = tmplabellist
            # pad or truncate segment ids, input ids and the input mask
            if len(segmentid) < max_length:
                for i in range(max_length - len(segmentid)):
                    segmentid.append(0)
                    inputid.append(0)
                    inputmask.append(0)
            elif len(segmentid) > max_length:
                tmpsegmentid = []
                tmpinputid = []
                tmpinputmask = []
                for i in range(max_length):
                    tmpsegmentid.append(segmentid[i])
                    tmpinputid.append(inputid[i])
                    tmpinputmask.append(inputmask[i])
                segmentid = tmpsegmentid
                inputid = tmpinputid
                inputmask = tmpinputmask
            tmpdatalist.append(token)
            tmpdatalist.append(segmentid)
            tmpdatalist.append(inputid)
            tmpdatalist.append(inputmask)
            tmpdatalist.append(labellist)
            alldatalist.append(tmpdatalist)
        return alldatalist

    ftraindata = datapadding(train_data)

    fdevdata = datapadding(dev_data)
    ftestdata = datapadding(test_data)
    print(len(ftraindata))
    print(len(fdevdata))
    print(len(ftestdata))
    # traindata = {
    #     "batch_size": train_data.batch_size,
    #     "input_size": train_data.input_size,
    #     "vocab": train_data.vocab,
    #     "tag_map": train_data.tag_map,
    # }
    # devdata = {
    #     "batch_size": dev_data.batch_size,
    #     "input_size": dev_data.input_size,
    #     "vocab": dev_data.vocab,
    #     "tag_map": dev_data.tag_map,
    # }
    # testdata = {
    #     "batch_size": test_data.batch_size,
    #     "input_size": test_data.input_size,
    #     "vocab": test_data.vocab,
    #     "tag_map": test_data.tag_map,
    # }
    # if not os.path.exists("./model/train_data_map.pkl"):
    #     f = open("./model/train_data_map.pkl", "wb")
    #     pickle.dump(traindata, f)
    #     f.close()
    # if not os.path.exists("./model/dev_data_map.pkl"):
    #     f = open("./model/dev_data_map.pkl", "wb")
    #     pickle.dump(devdata, f)
    #     f.close()
    # if not os.path.exists("./model/test_data_map.pkl"):
    #     f = open("./model/test_data_map.pkl", "wb")
    #     pickle.dump(testdata, f)
    #     f.close()

    # Use selected tagging scheme (IOB / IOBES)
    #update_tag_scheme(train_sentences, FLAGS.tag_schema)
    #update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # Create a dictionary and a mapping for tags
        '''
         _t:{'O': 869087, 'B-LOC': 16571, 'I-LOC': 22531, 'B-PER': 8144, 'I-PER': 15881, 'B-ORG': 9277, 'I-ORG': 37689, '[SEP]': 8, '[CLS]': 10}
         id_to_tag:{0: 'O', 1: 'I-ORG', 2: 'I-LOC', 3: 'B-LOC', 4: 'I-PER', 5: 'B-ORG', 6: 'B-PER', 7: '[CLS]', 8: '[SEP]'}
         tag_to_id:{'O': 0, 'I-ORG': 1, 'I-LOC': 2, 'B-LOC': 3, 'I-PER': 4, 'B-ORG': 5, 'B-PER': 6, '[CLS]': 7, '[SEP]': 8}
        '''

        tag_to_id = train_data.tag_map
        id_to_tag = {v: k for k, v in tag_to_id.items()}
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    '''
    [['在', '这', '里', '恕', '弟', '不', '恭', '之', '罪', ',', '敢', '在', '尊', '前', '一', '诤', ':', '前', '人', '论',
    '书', ',', '每', '曰', '“', '字', '字', '有', '来', '历', ',', '笔', '笔', '有', '出', '处', '”', ',', '细', '读', '公', 
    '字', ',', '何', '尝', '跳', '出', '前', '人', '藩', '篱', ',', '自', '隶', '变', '而', '后', ',', '直', '至', '明', '季',
    ',', '兄', '有', '何', '新', '出', '?'], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1762, 6821, 7027, 2609, 2475, 679, 2621, 722,
    5389, 8024, 3140, 1762, 2203, 1184, 671, 6420, 8038, 1184, 782, 6389, 741, 8024, 3680, 3288, 100, 2099, 2099, 3300,
    3341, 1325, 8024, 5011, 5011, 3300, 1139, 1905, 100, 8024, 5301, 6438, 1062, 2099, 8024, 862, 2214, 6663, 1139, 
    1184, 782, 5974, 5075, 8024, 5632, 7405, 1359, 5445, 1400, 8024, 4684, 5635, 3209, 2108, 8024, 1040, 3300, 862, 
    3173, 1139, 8043, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    '''

    # train_data = prepare_dataset(
    #     train_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    # )
    # dev_data = prepare_dataset(
    #     dev_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    # )
    # test_data = prepare_dataset(
    #     test_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    # )

    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data.data), len(dev_data.data), len(test_data.data)))

    train_manager = BatchManager(ftraindata, FLAGS.batch_size)
    dev_manager = BatchManager(fdevdata, FLAGS.batch_size)
    test_manager = BatchManager(ftestdata, FLAGS.batch_size)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, config, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)

                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess,
                           model,
                           FLAGS.ckpt_path,
                           logger,
                           global_steps=step)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
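
The datapadding helper above pads or truncates each field element by element with explicit loops. The same effect can be expressed as a small slice-and-pad helper; this is an equivalent sketch with a hypothetical name, not the author's code.

def pad_or_truncate(seq, max_length=64, pad_value=0):
    # trim to max_length, or right-pad with pad_value up to max_length
    return seq[:max_length] + [pad_value] * max(0, max_length - len(seq))

# per record above: token, segment_id, input_id, input_mask, labels = datalist[i]
# padded record: [token, pad_or_truncate(segment_id), pad_or_truncate(input_id),
#                 pad_or_truncate(input_mask), pad_or_truncate(labels)]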
Example #14
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []

        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                #print batch
                step, batch_loss = model.run_step(sess, True, batch)
                #print step
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #15
def do_train(config):
    train, dev, test = load_data(config)  # load the data
    word_to_id, id_to_word, tag_to_id, id_to_tag = create_maps(train, config)  # create or load the maps

    # configuration info, saved to file
    config["num_chars"] = len(word_to_id)  # total number of words
    config["num_tags"] = len(tag_to_id)  # total number of tags
    with open(config["config_file"], "w") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)

    # data processing
    train_data = prepare_dataset(train, word_to_id, tag_to_id, config["lower"])
    dev_data = prepare_dataset(dev, word_to_id, tag_to_id, config["lower"])
    test_data = prepare_dataset(test, word_to_id, tag_to_id, config["lower"])

    print("train/dev/test 句子数:{} / {} / {}".format(len(train_data), len(dev_data), len(test_data)))

    # split into batches
    train_manager = BatchManager(train_data, config["batch_size"])
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    steps_per_epoch = train_manager.len_data  # steps per epoch

    # create the required paths
    make_path(config)

    # logger
    logger = get_logger(config["log_file"])

    # limit GPU usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        # create the model; an existing parameter configuration can be supplied
        model = Model(config)

        ckpt = tf.train.get_checkpoint_state(config["ckpt_path"])  # get the checkpoint state from the model path
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):  # an existing model was found
            logger.info("Restoring existing model...")
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("新建模型...")
            sess.run(tf.global_variables_initializer())  # 不使用预训练的embeddings

            # if pre-trained embeddings are used
            if config["pre_emb"]:
                emb_weights = sess.run(model.char_lookup.read_value())
                emb_weights = load_word2vec(config["emb_file"], id_to_word, config["char_dim"], emb_weights)
                sess.run(model.char_lookup.assign(emb_weights))
                logger.info("Load pre-trained embedding.")

        logger.info("开始训练...")
        loss = []
        for i in range(config["max_epoch"]):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)

                if step % config["steps_check"] == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))

                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger, config)
            if best:
                save_model(sess, model, config["ckpt_path"], logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger, config)
Example #16
def main():
    # load data sets
    global args
    args = parser.parse_args()
    pp.pprint(vars(args))
    # running_name = 'X'
    use_cuda = cuda_model.ifUseCuda(args.gpu_id, args.multiGpu)
    # use_cuda = False

    # train_file = 'data/example.train'
    # dev_file = 'data/example.dev'
    test_file = 'data/example.test'
    # embedding_file = 'data/vec.txt'
    map_file = 'map.pkl'
    # config_file = 'config_file_pytorch'
    tag_file = 'tag.pkl'
    # embedding_easy_file = 'data/easy_embedding.npy'
    # train_sentences = load_sentences(train_file)
    # dev_sentences = load_sentences(dev_file)
    test_sentences = load_sentences(test_file)
    # train_sentences = dev_sentences
    # update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    # update_tag_scheme(dev_sentences, args.tag_schema)

    if not os.path.isfile(tag_file):
        print("Tag file {:s} Not found".format(tag_file))
        sys.exit(-1)
    else:
        with open(tag_file, 'rb') as t:
            tag_to_id, id_to_tag = pickle.load(t)

    if not os.path.isfile(map_file):
        print("Map file {:s} Not found".format(map_file))
        # create dictionary for word
        # dico_chars_train = char_mapping(train_sentences)[0]
        # dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        #     dico_chars_train.copy(),
        #     embedding_file,
        #     list(itertools.chain.from_iterable(
        #         [[w[0] for w in s] for s in test_sentences])
        #     )
        # )
        # # _, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        #
        # with open(map_file, "wb") as f:
        #     pickle.dump([char_to_id, id_to_char], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char = pickle.load(f)

    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)

    print("{:d} sentences in  test.".format(len(test_data)))

    test_manager = BatchManager(test_data, 1)

    save_places = dir_utils.save_places(args.eval)

    # log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(
        os.path.join(save_places.log_save_dir,
                     'evaluation-{:d}.txt'.format(args.fileid)))
    config = config_model(char_to_id, tag_to_id, args)
    print_config(config, logger)

    logger.info("start training")

    #Update: create model and embedding!
    model = NERModel.CNERPointer(char_dim=args.char_dim,
                                 seg_dim=args.seg_dim,
                                 hidden_dim=args.hidden_dim,
                                 max_length=15,
                                 output_classes=4,
                                 dropout=args.dropout,
                                 embedding_path=None,
                                 id_to_word=id_to_char,
                                 easy_load=None)
    print("Number of Params\t{:d}".format(
        sum([p.data.nelement() for p in model.parameters()])))

    #Update: this won't work!
    # model = cuda_model.convertModel2Cuda(model, gpu_id=args.gpu_id, multiGpu=args.multiGpu)
    if use_cuda:
        model = model.cuda()

    model.eval()
    if args.eval is not None:
        # if os.path.isfile(args.resume):
        ckpt_filename = os.path.join(
            save_places.model_save_dir,
            'checkpoint_{:04d}.pth.tar'.format(args.fileid))
        assert os.path.isfile(
            ckpt_filename), 'Error: no checkpoint directory found!'

        checkpoint = torch.load(ckpt_filename,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        train_iou = checkpoint['IoU']
        print("=> loading checkpoint '{}', current iou: {:.04f}".format(
            ckpt_filename, train_iou))

    ner_results = evaluate(model, test_manager, id_to_tag, use_cuda, max_len=5)
    eval_lines = test_ner(ner_results, save_places.summary_save_dir)
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])
    return f1
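
The F1 extraction above, float(eval_lines[1].strip().split()[-1]), relies on a CoNLL-style evaluation report whose second line ends with the overall FB1 score. A small illustration of that parsing, with an assumed report line rather than real output from this project:

# assumed conlleval-style overall line (second line of the report):
overall = "accuracy:  97.47%; precision:  89.79%; recall:  89.34%; FB1:  89.56"
f1 = float(overall.strip().split()[-1])  # -> 89.56, the same slicing used for eval_lines[1] above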
Example #17
def annotation():
    #train()
    # read in the whole csv file
    try:
        whole_data = pd.read_csv(raw_data, sep=',', encoding='UTF-8')
    except UnicodeEncodeError:
        whole_data = pd.read_csv(raw_data, sep=',', encoding='GBK', errors='ignore')
    except Exception as e:
        print(e)

    row_num = whole_data.shape[0]
    print(row_num)

    # ----------------------------------- load the dictionary -------------------------------------
    mapping_dict = get_dict(dict_file)

    # ----------------------------------- build the model -------------------------------------
    model = Model(mapping_dict)

    feature_columns = ['Per', 'Com', 'Time', 'Job', 'Nat', 'Bir', 'Age', 'Gdr', 'Uni', 'Edu', 'Sch', 'Col', 'Maj', 'Zhi', 'Hon']
    feature_dataframe = pd.DataFrame(columns=feature_columns)

    # create the Test folder and iterate over all rows for prediction
    for i in range(row_num):
        # remove any existing Test folder with shutil, then recreate it (os.makedirs creates nested directories)
        if os.path.exists('data/Test'):
            shutil.rmtree('data/Test')
        if not os.path.exists('data/Test'):
            os.makedirs('data/Test')

        cur_data = whole_data['ManagerResume'][i]
        print(cur_data)

        filename = 'data/Test/need_annotation.txt'
        with open(filename, 'w') as f:  # created automatically if filename does not exist; 'w' truncates any existing content before writing
            f.write(cur_data)

        task_process(split_text)
        get_data('task')

        # ----------------------------------- data preparation -------------------------------------
        task_manager = BatchManager(batch_size=1, name='task')

        # ----------------------------------- build the model -------------------------------------

        item_T = {}
        item_T = pd.DataFrame(item_T)

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            for i in range(1):
                for batch in task_manager.iter_batch(shuffle=True):
                    task_result,item = model.predict(sess, batch, istrain=False, istest=False)
                    #item_Entity = pd.DataFrame(item['entities'])
                    #item_T = item_T.append(item_Entity)
                    item_T = pd.DataFrame(item['entities'])

                    #print('predict result:{} %', task_result)
                    print(item_T)
                    #num_samples = len(item)  # number of sentences, i.e. number of samples
                    #print(num_samples)

        # ------------------------------- store the annotated data ----------------------------------
        f_Key = {}

        for feature in feature_columns:
            l_type =[]
            for j in range(item_T.shape[0]):
                if(item_T['type'].iloc[j] == feature):
                    return_word = [item_T['word'].iloc[j]]
                    l_type = l_type+return_word
            f_Key.update({feature:l_type})

        feature_dataframe = feature_dataframe.append(f_Key,ignore_index=True)

    FinalResult = pd.concat([whole_data,feature_dataframe], axis=1)
    fpath = 'FinalResult.csv'
    pd.DataFrame(FinalResult).to_csv(fpath)
Example #18
def train():
    # load data sets
    train_sentences = load_sentences(
        FLAGS.train_file, FLAGS.lower,
        FLAGS.zeros)  # dimension:num_sentence*len_sentence*2
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(
        train_sentences,
        FLAGS.tag_schema)  # dimension:num_sentence*len_sentence*2
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # if using pre-trained embeddings
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[
                0]  # dico_chars_train dimension: (number of distinct characters in the training set) * 2
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(  # augment dico_chars_train with characters from the test set
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:  # 创建map_file文件
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id,
        FLAGS.lower)  # dimension: NumSentence*4*LenSentence
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(
        train_data, FLAGS.batch_size
    )  # batch_data dimension: BatchNum*4*BatchSize*MaxLenSentence
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):  # load the config_file if it already exists
        config = load_config(FLAGS.config_file)
    else:  # otherwise build a new config and save it to file
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)  # print the config to the log file

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # allocate GPU memory on demand
    steps_per_epoch = train_manager.len_data  # len_data: ceil(NumSentence/BatchSize)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):  # the argument of range() is the number of epochs
            for batch in train_manager.iter_batch(
                    shuffle=True
            ):  # take one batch from batch_data at a time; shuffle=True shuffles the order of the batches
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                evaluate(sess, model, "test", test_manager, id_to_tag, logger)

    # View the tensorboard graph by running the following code and then going to the terminal and typing:
    # tensorboard --logdir = tensorboard_logs
    merged = tf.summary.merge_all()
    if not os.path.exists('tensorboard_logs/'):
        os.makedirs('tensorboard_logs/')
    my_writer = tf.summary.FileWriter('tensorboard_logs/', sess.graph)
Example #19
class Main:
    def __init__(self):
        self.train_sentences = None  # characters and tags of the training-set sentences
        self.dev_sentences = None  # characters and tags of the dev-set sentences
        self.char_to_id = None  # mapping from character to index id
        self.id_to_char = None  # mapping from index id to character
        self.tag_to_id = None   # mapping from tag to index id
        self.id_to_tag = None   # mapping from index id to tag
        self.train_batch_manager = None   # batch manager for the training set
        self.dev_batch_manager = None  # batch manager for the dev set

    @staticmethod
    def config_model(char_to_id, tag_to_id):
        """
        Set the model parameters.
        :param char_to_id: mapping dictionary from character to index
        :param tag_to_id: mapping dictionary from tag to index
        :return: config: dict
        """
        config = OrderedDict()
        config["num_chars"] = len(char_to_id)
        config["char_dim"] = FLAGS.char_dim
        config["num_tags"] = len(tag_to_id)
        config["seg_dim"] = FLAGS.seg_dim
        config["lstm_dim"] = FLAGS.lstm_dim
        config["batch_size"] = FLAGS.batch_size
        config["emb_file"] = FLAGS.emb_file
        config["clip"] = FLAGS.clip
        config["dropout_keep"] = 1.0 - FLAGS.dropout
        config["optimizer"] = FLAGS.optimizer
        config["lr"] = FLAGS.lr
        config["tag_schema"] = FLAGS.tag_schema
        config["pre_emb"] = FLAGS.pre_emb
        config["zeros"] = FLAGS.zeros
        config["lower"] = FLAGS.lower
        return config

    @staticmethod
    def evaluate(sess, model, name, data, id_to_tag, logger):
        if name == "dev":
            logger.info("evaluate dev data......")
            ner_results = model.predict(sess, data, id_to_tag)  # predict on the dev set to get the entity predictions
            # write the predictions back to the original data, output them, then compute and evaluate recognition performance
            eval_lines = result_write_evaluate(ner_results, FLAGS.result_path, name, size_train_data)
            for line in eval_lines:
                logger.info(line)
            f1 = float(eval_lines[1].strip().split()[-1])
            best_dev_f1 = model.best_dev_f1.eval()
            if f1 > best_dev_f1:
                tf.assign(model.best_dev_f1, f1).eval()
                logger.info("new best dev f1 score:{:>.3f}".format(f1))
            return f1 > best_dev_f1

    def get_sentences_dict(self):
        """
        Load the sentences from the datasets, store each sentence's characters and tags as lists, and then build bidirectional mappings between characters/tags and index ids.
        :return:
        """
        # load the sentences of the datasets, storing each sentence's characters and tags as lists
        self.train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
        self.dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
        # print("dev_sentences:", self.dev_sentences)

        # when the raw annotation scheme differs from the required one, update_tag_scheme converts it to the specified IOB or IOBES scheme
        # update_tag_scheme(train_sentences, FLAGS.tag_schema)
        # update_tag_scheme(test_sentences, FLAGS.tag_schema)

        if not os.path.isfile(FLAGS.map_file):
            # if map_file does not exist, initialize the mapping dictionaries from the dataset and the pre-trained embedding file
            # if pre-trained embeddings are used
            if FLAGS.pre_emb:
                # dictionary of characters in train_sentences, as word -> frequency pairs
                dico_chars_train = char_mapping(self.train_sentences, FLAGS.lower)[0]
                # augment the dictionary with the pre-trained embedding file (so that as many characters as possible can be initialized from pre-trained vectors) and build the bidirectional word/id mappings
                dico_chars, self.char_to_id, self.id_to_char = augment_with_pretrained(
                    dico_chars_train.copy(),
                    FLAGS.emb_file,
                    list(itertools.chain.from_iterable(
                        [[w[0] for w in s] for s in self.dev_sentences])
                    )
                )
            else:   # if not using pre-trained embeddings
                _c, self.char_to_id, self.id_to_char = char_mapping(self.train_sentences, FLAGS.lower)
            _t, self.tag_to_id, self.id_to_tag = tag_mapping(self.train_sentences)  # bidirectional mappings between tags and indices
            print("tag_to_id", self.tag_to_id, len(self.tag_to_id))
            # save the resulting mappings to file so they need not be rebuilt
            with open(FLAGS.map_file, "wb") as f:
                pickle.dump([self.char_to_id, self.id_to_char, self.tag_to_id, self.id_to_tag], f)
        else:
            # if map_file exists, restore the mapping dictionaries directly from it
            with open(FLAGS.map_file, "rb") as f:
                self.char_to_id, self.id_to_char, self.tag_to_id, self.id_to_tag = pickle.load(f)

    def get_batch_data(self):
        """
        Build the batch managers for the training and dev sets: first process the sentence sequences with the
        mapping dictionaries to get each sentence's feature lists and gold label list, then create the batch
        managers used to generate batch data.
        :return:
        """
        if not os.path.isfile(FLAGS.train_dev_file):
            train_data = prepare_dataset(self.train_sentences, self.char_to_id, self.tag_to_id, FLAGS.lower)
            dev_data = prepare_dataset(self.dev_sentences, self.char_to_id, self.tag_to_id, FLAGS.lower)
            with open(FLAGS.train_dev_file, "wb") as f:
                pickle.dump([train_data, dev_data], f)
        else:
            with open(FLAGS.train_dev_file, "rb") as f:
                train_data, dev_data = pickle.load(f)
        print("%i / %i  sentences in train / dev ." % (len(train_data), len(dev_data)))
        self.train_batch_manager = BatchManager(train_data, int(FLAGS.batch_size))
        self.dev_batch_manager = BatchManager(dev_data, int(FLAGS.batch_size))

    def get_config(self):
        """
        Load the parameters from the model config file, or generate them with config_model and save them.
        :return: the logger and the config
        """
        make_path(FLAGS)
        if os.path.isfile(FLAGS.config_file):
            config = load_config(FLAGS.config_file)
        else:
            config = self.config_model(self.char_to_id, self.tag_to_id)
            save_config(config, FLAGS.config_file)
        log_path = os.path.join("log", FLAGS.log_file)
        logger = get_logger(log_path)
        print_config(config, logger)
        return logger, config

    def train(self):
        self.get_sentences_dict()
        self.get_batch_data()
        logger, config = self.get_config()
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True  # limit GPU memory
        steps_per_epoch = self.train_batch_manager.len_data  # number of batches per epoch
        with tf.Session(config=tf_config) as sess:
            model = create_model(sess, Model, FLAGS.ckpt_path, config, self.id_to_char, logger)
            logger.info("start training")
            loss = []
            for i in range(FLAGS.max_epoch):
                for batch in self.train_batch_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, ""NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                        loss = []
                # predict and evaluate on the dev set
                best = self.evaluate(sess, model, "dev", self.dev_batch_manager, self.id_to_tag, logger)
                if best:
                    save_model(sess, model, FLAGS.ckpt_path, logger)

    @staticmethod
    def predict():
        """
        Run entity recognition on a whole dataset.
        :return:
        """
        config = load_config(FLAGS.config_file)
        logger = get_logger(FLAGS.log_file)
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True   # limit GPU memory
        # restore the mapping dictionaries from the map_file produced during training
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
        test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower, train=False)
        test_manager = BatchManager(test_data, 1)
        with tf.Session(config=tf_config) as sess:
            model = create_model(sess, Model, FLAGS.ckpt_path, config, id_to_char, logger)
            logger.info("predict data......")
            ner_results = model.predict(sess, test_manager, id_to_tag)
            result_write_evaluate(ner_results, FLAGS.result_path, "test")

    @staticmethod
    def predict_line():
        """
        Interactively run entity recognition on a single input sentence.
        :return:
        """
        config = load_config(FLAGS.config_file)
        logger = get_logger(FLAGS.log_file)
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        with tf.Session(config=tf_config) as sess:
            model = create_model(sess, Model, FLAGS.ckpt_path, config, id_to_char, logger)
            # predict on a single sentence
            while True:
                line = input("请输入测试句子:")
                result = model.predict_line(sess, input_from_line(line, char_to_id), id_to_tag)
                print(result)
Example #20
File: main.py Project: DCdream/DA-CRF
def train():
    # load data sets
    # train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    all_train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                         FLAGS.zeros)
    train_sentences, dev_sentences = split_train_dev(all_train_sentences)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            # dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars_train = char_mapping(all_train_sentences,
                                            FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(all_train_sentences,
                                                      FLAGS.lower)
        # _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(all_train_sentences)
        # _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # nlp = StanfordCoreNLP(r'E:\DC\dataset\泰一指尚评测数据\stanford-corenlp-full-2017-06-09')
    # l_sorted_lexcion = load_lexcion(FLAGS.lexcion_file, nlp)
    l_sorted_lexcion = []
    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 l_sorted_lexcion, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               l_sorted_lexcion, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                l_sorted_lexcion, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    max_len = max(
        [len(sentence[0]) for sentence in train_data + test_data + dev_data])

    train_manager = BatchManager(train_data, FLAGS.batch_size, max_len)
    dev_manager = BatchManager(dev_data, 800, max_len)
    test_manager = BatchManager(test_data, 800, max_len)

    # random.shuffle(train_data)

    # pad_test_data = pad_data(test_data)
    # pad_dev_data = pad_data(dev_data)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id, max_len)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            random.shuffle(train_data)
            pad_train_data = pad_data(train_data, max_len)
            strings, chars, lexcion_teatures, pos_ids, dep_ids, head_ids, targets = pad_train_data
            for j in range(0, len(strings), FLAGS.batch_size):
                batch = [
                    strings[j:j + FLAGS.batch_size],
                    chars[j:j + FLAGS.batch_size],
                    lexcion_teatures[j:j + FLAGS.batch_size],
                    pos_ids[j:j + FLAGS.batch_size],
                    dep_ids[j:j + FLAGS.batch_size],
                    head_ids[j:j + FLAGS.batch_size],
                    targets[j:j + FLAGS.batch_size]
                ]
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "AS loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger, i)
                evaluate(sess, model, "test", test_manager, id_to_tag, logger)
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
示例#21
def train():
    # load data sets
    datasets = load_sentences(FLAGS.train_file, FLAGS.lower)
    random.shuffle(datasets)
    train_sentences = datasets[:14000]
    test_sentences = datasets[14000:]

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        char_to_id, _ = elmo_char_mapping(FLAGS.elmo_vocab)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i sentences in train / dev." %
          (len(train_data), len(test_data)))

    elmo_batcher = get_batcher()
    train_manager = BatchManager(train_data, FLAGS.batch_size, elmo_batcher)
    test_manager = BatchManager(test_data, FLAGS.batch_size, elmo_batcher)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        elmo_model = load_elmo()
        model = create_model(sess, Model, FLAGS.ckpt_path, elmo_model, config,
                             logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch,
                            np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "test", test_manager, id_to_tag,
                            logger)
            # evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
示例#22
def train(param):
    # sanity-check the hyper-parameters
    assert param.clip < 5.1, "gradient clip shouldn't be too large"
    assert 0 <= param.dropout < 1, "dropout rate must be in [0, 1)"
    assert param.lr > 0, "learning rate must be larger than zero"
    # prepare the data
    train_manager = BatchManager(param.batch_size, name='train')
    number_dataset = train_manager.len_data
    print("total amount of training data is {}".format(number_dataset))
    # create the required directories
    make_path(param)
    # configure logging
    logger = get_logger(param.train_log_file)
    # load the mapping dictionaries
    mapping_dict = get_dict(param.dict_file)
    # load senc_tag in preparation for loading the pretrained embeddings
    senc_tag = get_sent_tag(param.sent_tag_file)
    # load the pretrained embeddings
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        mapping_dict['word'][2].copy(), param.emb_file,
        list(
            itertools.chain.from_iterable([[w[0] for w in s]
                                           for s in senc_tag])))
    # number of training batches, i.e. steps per epoch
    steps_per_epoch = train_manager.len_data
    # configure GPU options
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        # initialize the model
        model = creat_model(sess,
                            Model,
                            param.ckpt_path,
                            load_word2vec,
                            param,
                            id_to_char,
                            logger,
                            map_all=mapping_dict)
        for i in range(param.max_epoch):
            loss = []
            total_loss = 0
            # record the epoch start time
            start = time.time()
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, batch)
                # accumulate batch losses for a running average
                loss.append(batch_loss)
                # accumulate the total loss to compute the epoch average later
                total_loss += batch_loss
                if step % 5 == 0:
                    logger.info(
                        "epoch:{}, step:{}/{}, avg_loss:{:>9.4f}".format(
                            i + 1, step % steps_per_epoch, steps_per_epoch,
                            np.mean(loss)))
            # save the model
            model.save_model(sess, logger, i)
            logger.info('Epoch {}, average loss {:.4f}'.format(
                i + 1, total_loss / train_manager.len_data))
            logger.info(
                'Time taken for one epoch {:.4f} min, about {:.2f} h left for the remaining epochs\n'
                .format((time.time() - start) / 60,
                        ((param.max_epoch - i - 1) *
                         (time.time() - start)) / 3600))
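
示例#22 estimates how long the remaining epochs will take from the duration of the epoch that just finished. The same bookkeeping as a small helper, assuming a 0-based epoch index (names are illustrative):

import time

def epoch_timing(start_time, epoch, max_epoch):
    # start_time: value of time.time() captured at the beginning of the epoch
    elapsed = time.time() - start_time
    remaining_epochs = max_epoch - epoch - 1
    minutes_this_epoch = elapsed / 60.0
    hours_left = remaining_epochs * elapsed / 3600.0
    return minutes_this_epoch, hours_left
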
示例#23
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist, load data if exists maps
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # set up the training log directory
    train_log = os.path.join(FLAGS.logdir, "train")
    if not os.path.exists(train_log):
        os.makedirs(train_log)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data  # the number of batches per epoch
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # write the computation graph so it can be inspected in TensorBoard
        train_writer = tf.summary.FileWriter(train_log, sess.graph)
        logger.info("start training")
        loss = []
        dev_f1 = []
        test_f1 = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss, merged = model.run_step(
                    sess, True, batch)  # step is the global step
                # write the summary for this training step
                train_writer.add_summary(merged, step)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            # use dev data to validate the model
            best, dev_f1_value = evaluate(sess, model, "dev", dev_manager,
                                          id_to_tag, logger)
            # store the dev f1
            dev_f1.append(dev_f1_value)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            # evaluate the current model on the test set
            _, test_f1_value = evaluate(sess, model, "test", test_manager,
                                        id_to_tag, logger)
            # store the test f1
            test_f1.append(test_f1_value)
        # write the dev_f1 and test_f1 to file
        f1_result = {}
        f1_result["dev_f1"] = dev_f1
        f1_result["test_f1"] = test_f1
        write_data_to_file(f1_result, "f1_result")
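
示例#23 collects the per-epoch dev and test F1 scores and hands them to the project's write_data_to_file helper. If that helper is not available, a JSON dump is an equivalent, hedged stand-in:

import json

def write_f1_history(dev_f1, test_f1, path="f1_result.json"):
    # keep both curves so runs can be compared or plotted later
    with open(path, "w", encoding="utf-8") as f:
        json.dump({"dev_f1": dev_f1, "test_f1": test_f1}, f, indent=2)
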
示例#24
File: main.py  Project: ypycsy/CDTL-PSE
def train(X_train,X_dev,X_test):
    # load data sets
    train_sentences = X_train
    dev_sentences = X_dev
    test_sentences = X_test

    train_sentences_loc = load_sentences(FLAGS.train_file_loc, FLAGS.lower, FLAGS.zeros)
    dev_sentences_loc = load_sentences(FLAGS.dev_file_loc, FLAGS.lower, FLAGS.zeros)
    test_sentences_loc = load_sentences(FLAGS.test_file_loc, FLAGS.lower, FLAGS.zeros)
    train_sentences_org = load_sentences(FLAGS.train_file_org, FLAGS.lower, FLAGS.zeros)
    dev_sentences_org = load_sentences(FLAGS.dev_file_org, FLAGS.lower, FLAGS.zeros)
    test_sentences_org = load_sentences(FLAGS.test_file_org, FLAGS.lower, FLAGS.zeros)
    train_sentences_per = load_sentences(FLAGS.train_file_per, FLAGS.lower, FLAGS.zeros)
    dev_sentences_per = load_sentences(FLAGS.dev_file_per, FLAGS.lower, FLAGS.zeros)
    test_sentences_per = load_sentences(FLAGS.test_file_per, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    update_tag_scheme(train_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_org, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_org, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
            dico_chars_train_loc = char_mapping(train_sentences_loc, FLAGS.lower)[0]
            dico_chars_loc, char_to_id_loc, id_to_char_loc = augment_with_pretrained(
                dico_chars_train_loc.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_loc])
                )
            )
            dico_chars_train_per = char_mapping(train_sentences_per, FLAGS.lower)[0]
            dico_chars_per, char_to_id_per, id_to_char_per = augment_with_pretrained(
                dico_chars_train_per.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_per])
                )
            )
            dico_chars_train_org = char_mapping(train_sentences_org, FLAGS.lower)[0]
            dico_chars_org, char_to_id_org, id_to_char_org = augment_with_pretrained(
                dico_chars_train_org.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_org])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
            _c_loc, char_to_id_loc, id_to_char_loc = char_mapping(train_sentences_loc, FLAGS.lower)
            _c_per, char_to_id_per, id_to_char_per = char_mapping(train_sentences_per, FLAGS.lower)
            _c_org, char_to_id_org, id_to_char_org = char_mapping(train_sentences_org, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        _t_loc, tag_to_id_loc, id_to_tag_loc = tag_mapping(train_sentences_loc)
        _t_per, tag_to_id_per, id_to_tag_per = tag_mapping(train_sentences_per)
        _t_org, tag_to_id_org, id_to_tag_org = tag_mapping(train_sentences_org)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag,
                         char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,
                         char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,
                         char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            (char_to_id, id_to_char, tag_to_id, id_to_tag,
             char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,
             char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,
             char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org) = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))
    train_data_loc = prepare_dataset_ner(
        train_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    dev_data_loc = prepare_dataset_ner(
        dev_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    test_data_loc = prepare_dataset_ner(
        test_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    print("%i / %i / %i sentences_loc in train / dev / test." % (
        len(train_data_loc), len(dev_data_loc), len(test_data_loc)))
    train_data_per = prepare_dataset_ner(
        train_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    dev_data_per = prepare_dataset_ner(
        dev_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    test_data_per = prepare_dataset_ner(
        test_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    print("%i / %i / %i sentences_per in train / dev / test." % (
        len(train_data_per), len(dev_data_per), len(test_data_per)))
    train_data_org = prepare_dataset_ner(
        train_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    dev_data_org = prepare_dataset_ner(
        dev_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    test_data_org = prepare_dataset_ner(
        test_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    print("%i / %i / %i sentences_org in train / dev / test." % (
        len(train_data_org), len(dev_data_org), len(test_data_org)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    train_manager_loc = BatchManager(train_data_loc, FLAGS.batch_size)
    train_manager_per = BatchManager(train_data_per, FLAGS.batch_size)
    train_manager_org = BatchManager(train_data_org, FLAGS.batch_size)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id, char_to_id_loc, tag_to_id_loc,
                              char_to_id_per, tag_to_id_per, char_to_id_org, tag_to_id_org)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    steps_per_epoch_loc = train_manager_loc.len_data
    steps_per_epoch_per = train_manager_per.len_data
    steps_per_epoch_org = train_manager_org.len_data
    model = create_model(Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, id_to_char_loc, id_to_char_per, id_to_char_org, logger)

    with tf.Session(config=tf_config, graph=model.graph) as sess:

        sess.run(tf.global_variables_initializer())
        if config["pre_emb"]:
            emb_weights = sess.run(model.char_lookup.read_value())
            emb_weights_ner = sess.run(model.char_lookup.read_value())
            emb_weights, emb_weights_ner = load_word2vec(config["emb_file"], id_to_char, id_to_char_loc,id_to_char_per,id_to_char_org, config["char_dim"],
                                                    emb_weights, emb_weights_ner)
            sess.run(model.char_lookup.assign(emb_weights))
            logger.info("Load pre-trained embedding.")
        logger.info("start training")
        loss = []
        loss_loc = []
        loss_per = []
        loss_org = []
        for i in range(100):
            for batch_loc in train_manager_loc.iter_batch(shuffle=True):
                step_loc, batch_loss_loc = model.run_step_ner(sess, True, batch_loc)
                loss_loc.append(batch_loss_loc)
                if step_loc % FLAGS.steps_check == 0:
                    iteration_loc = step_loc // steps_per_epoch_loc + 1
                    logger.info("iteration:{} step_loc:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration_loc, step_loc % steps_per_epoch_loc,
                                    steps_per_epoch_loc, np.mean(loss_loc)))
                    loss_loc = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_1 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "SKILL loss:{:>9.6f}".format(
                                    iteration_1, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_loc_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_loc_test = model.precision(sess, test_manager, id_to_tag)
            for batch_per in train_manager_per.iter_batch(shuffle=True):
                step_per, batch_loss_per = model.run_step_ner(sess, True, batch_per)
                loss_per.append(batch_loss_per)
                if step_per % FLAGS.steps_check == 0:
                    iteration_per = step_per // steps_per_epoch_per + 1
                    logger.info("iteration:{} step_per:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration_per, step_per % steps_per_epoch_per,
                                    steps_per_epoch_per, np.mean(loss_per)))
                    loss_per = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_2 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "SKILL loss:{:>9.6f}".format(
                                    iteration_2, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_per_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_per_test = model.precision(sess, test_manager, id_to_tag)
            for batch_org in train_manager_org.iter_batch(shuffle=True):
                step_org, batch_loss_org = model.run_step_ner(sess, True, batch_org)
                loss_org.append(batch_loss_org)
                if step_org % FLAGS.steps_check == 0:
                    iteration_org = step_org // steps_per_epoch_org + 1
                    logger.info("iteration:{} step_org:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration_org, step_org % steps_per_epoch_org,
                                    steps_per_epoch_org, np.mean(loss_org)))
                    loss_org = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_3 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "SKILL loss:{:>9.6f}".format(
                                    iteration_3, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_org_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_org_test = model.precision(sess, test_manager, id_to_tag)
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag,precision_loc_dev,precision_per_dev,precision_org_dev, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                best_test, results = evaluate(sess, model, "test", test_manager, id_to_tag,
                                              precision_loc_test, precision_per_test, precision_org_test, logger)
                with open("CDTL_PSE-result.csv", "a", encoding='utf-8') as st_re:
                    st_re.write(str(results).replace("[", "").replace("]", ""))
                    st_re.write("\n")
示例#25
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)   

    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))                 

    # pad sequences that are too short with 0
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)
示例#26
def main(_):
    if not os.path.isdir(FLAGS.log_path):
        os.makedirs(FLAGS.log_path)
    if not os.path.isdir(FLAGS.model_path):
        os.makedirs(FLAGS.model_path)
    if not os.path.isdir(FLAGS.result_path):
        os.makedirs(FLAGS.result_path)

    tag_to_id = FLAGS.tag_to_id

    # load data
    id_to_word, id_to_tag, train_data, dev_data, test_data = load_data(
        FLAGS, tag_to_id)
    train_manager = BatchManager(train_data, len(id_to_tag),
                                 FLAGS.word_max_len, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, len(id_to_tag), FLAGS.word_max_len,
                               FLAGS.valid_batch_size)
    test_manager = BatchManager(test_data, len(id_to_tag), FLAGS.word_max_len,
                                FLAGS.valid_batch_size)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model = create_model(sess, id_to_word, id_to_tag)
        loss = 0
        best_test_f1 = 0
        steps_per_epoch = len(train_data) // FLAGS.batch_size + 1
        for _ in range(FLAGS.max_epoch):
            iteration = (model.global_step.eval()) // steps_per_epoch + 1
            train_manager.shuffle()
            for batch in train_manager.iter_batch():
                global_step = model.global_step.eval()
                step = global_step % steps_per_epoch
                batch_loss = model.run_step(sess, True, batch)
                loss += batch_loss / FLAGS.steps_per_checkpoint
                if global_step % FLAGS.steps_per_checkpoint == 0:
                    model.logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step, steps_per_epoch, loss))
                    loss = 0

            model.logger.info("validating ner")
            ner_results = model.predict(sess, dev_manager)
            eval_lines = conll_eval(ner_results, FLAGS.result_path)
            for line in eval_lines:
                model.logger.info(line)
            test_f1 = float(eval_lines[1].strip().split()[-1])
            if test_f1 > best_test_f1:
                best_test_f1 = test_f1
                model.logger.info("new best f1 score:{:>.3f}".format(test_f1))
                model.logger.info("saving model ...")
                checkpoint_path = os.path.join(FLAGS.model_path,
                                               "translate.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)

        # test model
        model.logger.info("testing ner")
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_path)
        model.logger.info("Reading model parameters from %s" %
                          ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        ner_results = model.predict(sess, test_manager)
        eval_lines = conll_eval(ner_results, FLAGS.result_path)
        for line in eval_lines:
            model.logger.info(line)
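
示例#26 checkpoints only when the dev F1 improves, which is the pattern most of these examples follow via evaluate/save_model. Reduced to a sketch (save_fn stands in for whatever checkpointing call the project uses):

def save_if_better(current_f1, best_f1, save_fn):
    # save_fn: zero-argument callable that persists the model
    if current_f1 > best_f1:
        save_fn()
        return current_f1  # new best score
    return best_f1
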
示例#27
	SRL_Model = create_model(Model, logger, word2idx, pumsa2idx, char2idx, label2idx,
							 lemma2idx, word_embedding_matrix)

	if config.mode == "train":
		ELMo_dict, context_embeddings_op, ELMo_context, ELMo_ids = load_ELMo()
		sess.run(tf.global_variables_initializer())

		train_dataset = load_data(config.train_path)
		test_dataset = load_data(config.test_path)

		train_data = prepare_dataset(train_dataset, word2idx, pumsa2idx, char2idx, lemma2idx, label2idx, ELMo_dict)
		test_data = prepare_dataset(test_dataset, word2idx, pumsa2idx, char2idx, lemma2idx, label2idx, ELMo_dict)

		print("%i / %i  sentences in train / dev " % (len(train_data), len(test_data)))

		train_manager = BatchManager(train_data, label2idx)
		test_manager = BatchManager(test_data, label2idx)

		train(sess)

	elif config.mode == "tagging":
		saver = tf.train.Saver()
		saver.restore(sess, config.ckpt_path)

		ELMo_dict, context_embeddings_op, ELMo_context, ELMo_ids = load_ELMo()
		init_ELMo(sess)

		unlabeled_dataset = load_data(config.unlabeld_path)
		unlabeled_data = prepare_dataset(unlabeled_dataset, word2idx, pumsa2idx, char2idx, lemma2idx, label2idx, ELMo_dict)

		print("%i sentences in unlabeled " % len(unlabeled_data))
示例#28
def train():
    # load the data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # choose the tag scheme (IOB / IOBES): I = inside, O = outside, B = begin | E = end, S = single
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):  # map file storing char_to_id, id_to_char, tag_to_id and id_to_tag
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences, FLAGS.id_to_tag_path, FLAGS.tag_to_id_path)
        # with open('maps.txt','w',encoding='utf8') as f1:
        # f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n')
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indices
    # train_data[0][0]: the characters of one sentence;
    # train_data[0][1]: the id of each character;
    # train_data[0][2]: segmentation features: 0 for a single-character word, and 1, 2, ..., 2, 3 for words of two or more characters;
    # train_data[0][3]: the tag of each character
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
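    # An assumed illustration (not taken from the project) of one prepared item:
    # train_data[0] == [['我', '在', '北', '京'],   # characters of the sentence
    #                   [52, 14, 208, 317],        # char ids (values are made up)
    #                   [0, 0, 1, 3],              # seg features: single chars -> 0, the word 北京 -> 1, 3
    #                   [0, 0, 2, 4]]              # tag ids (values are made up)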
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)  # split the data into batches of size batch_size
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        # tf.device("/cpu:0") 指定运行的GPU(默认为GPU:0)
        with tf.device("/cpu:0"):
            for i in range(100):
                # 按批次训练模型。这个是训练的开始,可以从这里倒着找整个网络怎么训练
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    # logging:
                    # iteration: the number of epochs completed so far
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                        loss = []

                # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                if i % 7 == 0:
                    save_model(sess, model, FLAGS.ckpt_path, logger)
示例#29
def get_batch_manager(id_to_tag, tag_to_id, text, word_to_id):
    test_file = get_test_data2(text)
    test_data = prepare_data(test_file, word_to_id, tag_to_id, FLAGS.word_max_len)
    # load data as a batch iterator
    test_manager = BatchManager(test_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.valid_batch_size)
    return test_manager
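
All of these examples drive training through a BatchManager that exposes the same small surface: a len_data attribute (the number of batches, used as steps_per_epoch) and an iter_batch(shuffle=...) generator. A minimal sketch of that interface, leaving out the sort-by-length and padding details each project adds:

import random

class SimpleBatchManager:
    """Minimal stand-in for the BatchManager interface used in these examples."""

    def __init__(self, data, batch_size):
        self.batch_data = [data[i:i + batch_size]
                           for i in range(0, len(data), batch_size)]
        self.len_data = len(self.batch_data)  # number of batches per epoch

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch
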
示例#30
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            #best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            #if best:
            save_model(sess, model, FLAGS.ckpt_path, logger)
示例#31
def train_new():
    train_sent = load_sentences(FLAGS.filepath)

    update_tag_scheme(train_sent, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        _c, char_to_id, id_to_char = char_mapping(train_sent, FLAGS.lower)
        print("random embedding")

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sent)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # data preparation: split into training and validation sets
    np.random.seed(10)
    train_sent_ = np.array(train_sent)
    shuffle_indices = np.random.permutation(np.arange(len(train_sent)))

    sent_shuffled = train_sent_[shuffle_indices]
    dev_sample_index = -1 * int(FLAGS.dev_percentage * float(len(train_sent)))
    train_sent_new, dev_sent = sent_shuffled[:dev_sample_index], sent_shuffled[
        dev_sample_index:]

    train_data = prepare_dataset(train_sent_new, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sent, char_to_id, tag_to_id, FLAGS.lower)

    print("%i / %i sentences in train." % (len(train_data), len(dev_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = FLAGS.log_file
    logger = get_logger(log_path)
    print_config(config, logger)

    # allow TensorFlow to grow GPU memory usage on demand
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:

        fig = plt.figure()
        ax = fig.add_subplot(211)
        ax2 = fig.add_subplot(212)
        plt.grid(True)
        plt.ion()

        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)

                if step % 20 == 0:
                    ax.scatter(step, np.mean(loss), c='b', marker='.')
                    plt.pause(0.001)

                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best, f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag,
                                logger)
            ax2.scatter(i + 1, f1, c='b', marker='.')
            plt.pause(0.001)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger, "best")
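
示例#31 builds its dev set by shuffling the training sentences with a fixed seed and splitting off the last dev_percentage of them. The same split as a small reusable helper, assuming dev_percentage is a fraction in (0, 1) (the function name is illustrative and unrelated to the split_train_dev used in 示例#20):

import numpy as np

def shuffled_train_dev_split(sentences, dev_percentage, seed=10):
    # deterministic shuffle, then reserve the tail of the permutation as the dev set
    rng = np.random.RandomState(seed)
    order = rng.permutation(len(sentences))
    cut = -int(dev_percentage * len(sentences))
    shuffled = [sentences[i] for i in order]
    return shuffled[:cut], shuffled[cut:]
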