def get_sentences_dict(self):
        """
        Load the sentences in the dataset, store each sentence's characters and tags as lists, then build bidirectional mapping dictionaries between characters/tags and index ids.
        :return:
        """
        # Load the sentences in the dataset; store each sentence's characters and tags as lists
        self.train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                              FLAGS.zeros)
        self.dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower,
                                            FLAGS.zeros)
        # print("dev_sentences:", self.dev_sentences)

        # If the tagging scheme of the raw data differs from the required one, use update_tag_scheme to convert it to the specified IOB or IOBES scheme
        # update_tag_scheme(train_sentences, FLAGS.tag_schema)
        # update_tag_scheme(test_sentences, FLAGS.tag_schema)

        if not os.path.isfile(FLAGS.map_file):
            # If map_file does not exist, initialize the mapping dictionaries from the dataset and the pretrained embedding file
            # If pretrained embeddings are used
            if FLAGS.pre_emb:
                # Build a dictionary of the characters in train_sentences, mapping word -> frequency
                dico_chars_train = char_mapping(self.train_sentences,
                                                FLAGS.lower)[0]
                # Extend the dictionary with the pretrained embedding file (so that as many characters as possible can be initialized from pretrained vectors) and build the word <-> id bidirectional mappings.
                dico_chars, self.char_to_id, self.id_to_char = augment_with_pretrained(
                    dico_chars_train.copy(), FLAGS.emb_file,
                    list(
                        itertools.chain.from_iterable(
                            [[w[0] for w in s] for s in self.dev_sentences])))
            else:  # no pretrained embeddings
                _c, self.char_to_id, self.id_to_char = char_mapping(
                    self.train_sentences, FLAGS.lower)
            _t, self.tag_to_id, self.id_to_tag = tag_mapping(
                self.train_sentences)  # bidirectional mappings between tags and indices
            print("tag_to_id", self.tag_to_id, len(self.tag_to_id))
            # Save the mapping dictionaries to a file to avoid re-initializing them
            with open(FLAGS.map_file, "wb") as f:
                pickle.dump([
                    self.char_to_id, self.id_to_char, self.tag_to_id,
                    self.id_to_tag
                ], f)
        else:
            # If map_file exists, restore the mapping dictionaries directly from the file
            with open(FLAGS.map_file, "rb") as f:
                self.char_to_id, self.id_to_char, self.tag_to_id, self.id_to_tag = pickle.load(
                    f)
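
For reference, a minimal sketch of how the pickled map_file produced above can be reloaded and applied to a raw sentence; the file name, the '<UNK>' key, and the sample sentence are assumptions, not taken from the project:

import pickle

MAP_FILE = "maps.pkl"  # assumed path; the real one comes from FLAGS.map_file

with open(MAP_FILE, "rb") as f:
    char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

# Convert a raw sentence to character ids, falling back to an assumed
# '<UNK>' entry (or id 0) for out-of-vocabulary characters.
sentence = "厦门与金门"
char_ids = [char_to_id.get(ch, char_to_id.get("<UNK>", 0)) for ch in sentence]

# Map predicted tag ids (dummy zeros here) back to tag strings.
predicted_tags = [id_to_tag[i] for i in [0] * len(char_ids)]
print(char_ids, predicted_tags)
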
Example #2
    def train(self,
              n_epochs=100,
              freq_eval=1000,
              verbose=True,
              eval_test_set=False):
        """
        :param n_epochs: number of epochs over the training set
        :param freq_eval: evaluate on dev every freq_eval steps
        :return: Saves the model with the best F1-Score, evaluated on the dev set
        """
        # Initialize model
        model = Model(parameters=self.parameters, models_path=models_path)
        print("Model location: %s" % model.model_path)

        # Data parameters
        lower = self.parameters['lower']
        zeros = self.parameters['zeros']
        tag_scheme = self.parameters['tag_scheme']

        # Load sentences
        train_sentences = loader.load_sentences(self.parameters['train'],
                                                lower, zeros)
        dev_sentences = loader.load_sentences(self.parameters['dev'], lower,
                                              zeros)
        test_sentences = loader.load_sentences(self.parameters['test'], lower,
                                               zeros)

        # Use selected tagging scheme (IOB / IOBES)
        update_tag_scheme(train_sentences, tag_scheme)
        update_tag_scheme(dev_sentences, tag_scheme)
        update_tag_scheme(test_sentences, tag_scheme)

        # Create a dictionary / mapping of words
        # If we use pretrained embeddings, we add them to the dictionary.
        if self.parameters['pre_emb']:
            dico_words_train = word_mapping(train_sentences, lower)[0]
            dico_words, word_to_id, id_to_word = augment_with_pretrained(
                dico_words_train.copy(), self.parameters['pre_emb'],
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in dev_sentences +
                                                   test_sentences]))
                if not self.parameters['all_emb'] else None)
        else:
            dico_words, word_to_id, id_to_word = word_mapping(
                train_sentences, lower)
            dico_words_train = dico_words

        # Create a dictionary and a mapping for words / POS tags / tags
        dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
        dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        # Index data
        train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                     tag_to_id, lower)
        dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id,
                                   tag_to_id, lower)
        test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                    tag_to_id, lower)

        print("%i / %i / %i sentences in train / dev / test." %
              (len(train_data), len(dev_data), len(test_data)))

        # Save the mappings to disk
        print('Saving the mappings to disk...')
        model.save_mappings(id_to_word, id_to_char, id_to_tag)

        # Build the model
        f_train, f_eval = model.build(**self.parameters)

        # Reload previous model values
        if self.parameters['reload']:
            print('Reloading previous model...')
            model.reload()

        #
        # Train network
        #
        singletons = set(
            [word_to_id[k] for k, v in dico_words_train.items() if v == 1])
        best_dev = -np.inf
        best_test = -np.inf
        count = 0
        for epoch in range(n_epochs):
            epoch_costs = []
            print("Starting epoch %i at..." % epoch, time.ctime())
            for i, index in enumerate(np.random.permutation(len(train_data))):
                count += 1
                input = create_input(train_data[index], self.parameters, True,
                                     singletons)
                new_cost = f_train(*input)
                epoch_costs.append(new_cost)
                if i % 50 == 0 and i > 0 and verbose:
                    print("%i, cost average: %f" %
                          (i, np.mean(epoch_costs[-50:])))
                if count % freq_eval == 0:
                    dev_score = evaluate(self.parameters,
                                         f_eval,
                                         dev_sentences,
                                         dev_data,
                                         id_to_tag,
                                         verbose=verbose)
                    if eval_test_set:
                        test_score = evaluate(self.parameters,
                                              f_eval,
                                              test_sentences,
                                              test_data,
                                              id_to_tag,
                                              verbose=verbose)
                    print("Score on dev: %.5f" % dev_score)
                    if eval_test_set:
                        print("Score on test: %.5f" % test_score)
                    if dev_score > best_dev:
                        best_dev = dev_score
                        print("New best score on dev.")
                        print("Saving model to disk...")
                        model.save()
                    if eval_test_set:
                        if test_score > best_test:
                            best_test = test_score
                            print("New best score on test.")
            print(
                "Epoch %i done. Average cost: %f. Ended at..." %
                (epoch, np.mean(epoch_costs)), time.ctime())
        return best_dev
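
The singletons set built above is passed to create_input so that words seen only once in training are sometimes replaced by the unknown-word id, which gives the <UNK> embedding useful training signal. A minimal sketch of that replacement step, assuming a replacement probability of 0.5; the function name and signature are illustrative, not the project's actual create_input:

import numpy as np

def replace_singletons(word_ids, singletons, unk_id, p=0.5):
    """Randomly map words that occurred only once in training to the UNK id.

    word_ids:   word indices of one sentence
    singletons: set of word ids whose training frequency is 1
    unk_id:     index of the unknown-word entry in the vocabulary
    p:          probability of replacing each singleton occurrence
    """
    return [
        unk_id if w in singletons and np.random.uniform() < p else w
        for w in word_ids
    ]

# word id 7 is a singleton here, so it may be swapped for unk_id 0
print(replace_singletons([3, 7, 12], singletons={7}, unk_id=0))
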
Example #3
File: main.py  Project: aiedward/Cner_v1
def train():
    # Load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    # Check whether the mapping dictionaries have already been built
    if not os.path.isfile(FLAGS.map_file):
        # If pretrained embeddings are used
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        # Otherwise build the mappings from the training data only
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)
        # Build the tag mapping dictionaries
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences, dev_sentences,
                                               test_sentences)
        # Save the dictionaries
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    # If the dictionaries already exist, just load them
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    # Format the data into the standard inputs fed to the network
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    # Print dataset statistics
    print("{} / {} sentences in train / dev.".format(
        len(train_data), len(dev_data)))
    # Initialize the batch managers
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 10)
    # Create the required directories so later steps do not fail
    make_path(FLAGS)
    # Load the config file
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    # Set up logging
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)
    # Configure the GPU before training starts
    tf_config = tf.ConfigProto()
    # Number of training batches per epoch
    steps_per_epoch = train_manager.len_data
    # Within a TF session
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        # Start iterative training
        for i in range(FLAGS.max_epoch):
            loss = []
            total_loss = 0
            start = time.time()
            # Fetch batches
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                # Accumulate the batch loss
                loss.append(batch_loss)
                total_loss += batch_loss
                iteration = step // steps_per_epoch
                # Log every steps_check steps
                if (step + 1) % FLAGS.steps_check == 0:
                    logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration + 1, (step % steps_per_epoch) + 1,
                            steps_per_epoch, np.mean(loss)))
            # Evaluate the dev-set F1 every epoch (costs some training time)
            # evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            # Save the model every two epochs
            if (i + 1) % 2 == 0:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            logger.info('Epoch {} total Loss {:.4f}'.format(
                i + 1, total_loss / steps_per_epoch))
            logger.info('Time taken for 1 epoch {} sec\n'.format(time.time() -
                                                                 start))
Example #4
dev_sentences = loader.load_sentences(opts.dev, zeros)
test_sentences = loader.load_sentences(opts.test, zeros)

# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(),
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, lower
)
Example #5
File: train.py  Project: metpallyv/tagger
dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
test_sentences = loader.load_sentences(opts.test, lower, zeros)

# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(),
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, lower
)
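
What augment_with_pretrained is doing in the snippets above: every word from the pretrained embedding file that also occurs in the supplied word list (or every embedding word when that list is None) is added to the training dictionary, so it gets a pretrained vector instead of a random one. A rough sketch under those assumptions; the helper name and embedding-file format are illustrative:

import codecs

def augment_with_pretrained_sketch(dico_train, emb_path, words=None):
    """Add words covered by the pretrained embeddings to a frequency dict.

    dico_train: dict mapping word -> training-set frequency
    emb_path:   text embedding file with one "word v1 v2 ..." line per word
    words:      if given, only add pretrained words occurring in this list;
                if None, add every word found in the embedding file
    """
    with codecs.open(emb_path, "r", "utf-8") as f:
        pretrained = set(line.rstrip().split(" ")[0] for line in f if line.strip())

    wanted = None if words is None else set(words)
    for word in pretrained:
        if word not in dico_train and (wanted is None or word in wanted):
            dico_train[word] = 0  # covered by embeddings, unseen in training

    # Rebuild the id mappings from the augmented dictionary, most frequent first.
    sorted_items = sorted(dico_train.items(), key=lambda x: (-x[1], x[0]))
    word_to_id = {w: i for i, (w, _) in enumerate(sorted_items)}
    id_to_word = {i: w for w, i in word_to_id.items()}
    return dico_train, word_to_id, id_to_word
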
Example #6
# Load sentences
train_sentences = loader.load_sentences(opts.train, lower, zeros)
dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
test_sentences = loader.load_sentences(opts.test, lower, zeros)

# Use selected tagging scheme (IOB / IOBES)
##update_tag_scheme(train_sentences, tag_scheme)
##update_tag_scheme(dev_sentences, tag_scheme)
##update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(), parameters['pre_emb'], None)
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                             tag_to_id, lower)

dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id,
                           lower)
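
char_mapping and tag_mapping above boil down to a frequency count over the training sentences plus two index dictionaries sorted by decreasing frequency. A minimal sketch that follows the [['char', 'tag'], ...] sentence format used in these examples; the reserved '<PAD>'/'<UNK>' entries are assumptions:

def create_mapping(counts):
    """Sort items by decreasing frequency and assign consecutive ids."""
    sorted_items = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
    item_to_id = {item: i for i, (item, _) in enumerate(sorted_items)}
    id_to_item = {i: item for item, i in item_to_id.items()}
    return item_to_id, id_to_item

def char_mapping_sketch(sentences, lower=False):
    counts = {}
    for sentence in sentences:
        for word in sentence:
            ch = word[0].lower() if lower else word[0]
            counts[ch] = counts.get(ch, 0) + 1
    counts["<PAD>"] = 10000001  # assumed reserved entries with huge counts
    counts["<UNK>"] = 10000000  # so they always receive the smallest ids
    char_to_id, id_to_char = create_mapping(counts)
    return counts, char_to_id, id_to_char

def tag_mapping_sketch(sentences):
    counts = {}
    for sentence in sentences:
        for word in sentence:
            counts[word[-1]] = counts.get(word[-1], 0) + 1
    tag_to_id, id_to_tag = create_mapping(counts)
    return counts, tag_to_id, id_to_tag
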
Example #7
zeros = parameters['zeros']
#tag_scheme = parameters['tag_scheme']

# Load sentences
train_sentences, train_m = load_ner2line_sentences(opts.train, lower, zeros)
dev_sentences, dev_m = load_ner2line_sentences(opts.dev, lower, zeros)
test_sentences, test_m = load_ner2line_sentences(opts.test, lower, zeros)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word, word_embeddings = augment_with_pretrained(
        dico_words_train.copy(), parameters['pre_emb'],
        list(
            itertools.chain.from_iterable(
                [s['tokens'] for s in dev_sentences +
                 test_sentences])) if not parameters['all_emb'] else None,
        parameters['word_dim'])
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words
    logging.info("Random initialize the word embeddings.")
    word_embeddings = np.random.rand(len(id_to_word), parameters['word_dim'])

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences,
                                              parameters['mode'])

# Index data
Example #8
File: main.py  Project: DCdream/DA-CRF
def train():
    # load data sets
    # train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    all_train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                         FLAGS.zeros)
    train_sentences, dev_sentences = split_train_dev(all_train_sentences)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            # dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars_train = char_mapping(all_train_sentences,
                                            FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(all_train_sentences,
                                                      FLAGS.lower)
        # _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(all_train_sentences)
        # _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # nlp = StanfordCoreNLP(r'E:\DC\dataset\泰一指尚评测数据\stanford-corenlp-full-2017-06-09')
    # l_sorted_lexcion = load_lexcion(FLAGS.lexcion_file, nlp)
    l_sorted_lexcion = []
    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 l_sorted_lexcion, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               l_sorted_lexcion, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                l_sorted_lexcion, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    max_len = max(
        [len(sentence[0]) for sentence in train_data + test_data + dev_data])

    train_manager = BatchManager(train_data, FLAGS.batch_size, max_len)
    dev_manager = BatchManager(dev_data, 800, max_len)
    test_manager = BatchManager(test_data, 800, max_len)

    # random.shuffle(train_data)

    # pad_test_data = pad_data(test_data)
    # pad_dev_data = pad_data(dev_data)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id, max_len)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            random.shuffle(train_data)
            pad_train_data = pad_data(train_data, max_len)
            strings, chars, lexcion_features, pos_ids, dep_ids, head_ids, targets = pad_train_data
            for j in range(0, len(strings), FLAGS.batch_size):
                batch = [
                    strings[j:j + FLAGS.batch_size],
                    chars[j:j + FLAGS.batch_size],
                    lexcion_features[j:j + FLAGS.batch_size],
                    pos_ids[j:j + FLAGS.batch_size],
                    dep_ids[j:j + FLAGS.batch_size],
                    head_ids[j:j + FLAGS.batch_size],
                    targets[j:j + FLAGS.batch_size]
                ]
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "AS loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger, i)
                evaluate(sess, model, "test", test_manager, id_to_tag, logger)
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #9
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []

        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                #print batch
                step, batch_loss = model.run_step(sess, True, batch)
                #print step
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #10
                                        con.zeros)
dev_sentences = loader.load_sentences(con.dataset['devdata'], con.lower,
                                      con.zeros)
test_sentences = loader.load_sentences(con.dataset['testdata'], con.lower,
                                       con.zeros)

dev_oov_sentences = loader.load_sentences(con.dev_oov, con.lower, con.zeros)
test_oov_sentences = loader.load_sentences(con.test_oov, con.lower, con.zeros)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if con.model_para['emb_path'] is not None:
    dico_words_train = word_mapping(train_sentences, con.lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(), con.model_para['emb_path'],
        list(
            itertools.chain.from_iterable(
                [[w[0] for w in s] for s in dev_sentences + test_sentences])))
    embedding_matrix = loader.get_lample_embedding(con.model_para['emb_path'],
                                                   id_to_word,
                                                   con.model_para['input_dim'])
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences,
                                                      con.lower)
    dico_words_train = dico_words
    embedding_matrix = None

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
def evaluate_testDataSet():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        for i in range(100):
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #12
def train():
    # load data sets
    # sentences format: ['在', 'O'], ['厦', 'B-LOC'], ['门', 'I-LOC']
    # train_sentences = loader.load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # dev_sentences = loader.load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    # test_sentences = loader.load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    train_sentences = loader.load_folder_sentences(FLAGS.train_file,
                                                   FLAGS.lower, FLAGS.zeros)
    dev_sentences = loader.load_folder_sentences(FLAGS.dev_file, FLAGS.lower,
                                                 FLAGS.zeros)
    test_sentences = loader.load_folder_sentences(FLAGS.test_file, FLAGS.lower,
                                                  FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # the sentences do not change much after update_tag_scheme
    loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # create maps if not exist
    # If maps.pkl does not exist, read the training data to build char_to_id and tag_to_id
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = loader.char_mapping(train_sentences,
                                                   FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = loader.augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = loader.char_mapping(
                train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = loader.tag_mapping(train_sentences)

        print('tag_to_id: ', tag_to_id)

        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # print('tag_to_id: ', tag_to_id)

    print('tag_to_id: ', tag_to_id)
    # prepare data, get a collection of list containing index
    train_data = loader.prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                        FLAGS.lower)
    dev_data = loader.prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                                      FLAGS.lower)
    test_data = loader.prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                       FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, 100)
    test_manager = data_utils.BatchManager(test_data, 100)

    # make path for store log and model if not exist
    utils.make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = utils.load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        utils.save_config(config, FLAGS.config_file)
    utils.make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)  # ./log/train.log
    logger = utils.get_logger(log_path)
    utils.print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = utils.create_model(sess, Model, FLAGS.ckpt_path,
                                   data_utils.load_word2vec, config,
                                   id_to_char, logger)
        logger.info("start training")
        loss = []

        for i in range(FLAGS.iterations):
            # for i in range(10):
            logger.info('epoch: {}'.format(i))
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #13
def train():
    """
    train function: load the data, preprocess it, train the model, and report the test-set F1
    :return:
    """
    # load data sets: basic preprocessing includes lowercasing, replacing digits with 0, stripping whitespace and extracting words; training words and tags are stored in lists. dev_file is used for cross validation
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)  # FLAGS.zeros = False
    # train_sentences format: ['厦', 'B-LOC'], ['门', 'I-LOC'], ['与', 'O'], ['金', 'B-LOC'], ['门', 'I-LOC']
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES): convert IOB tags to IOBES. I: inside, O: other, B: begin | E: end, S: single
    # update_tag_scheme in loader.py performs the conversion; internally it calls iob_iobes in data_utils.py
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if not exist: build the word-mapping dictionaries
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # augment the training dictionary with the pretrained embedding vocabulary
            dico_chars_train = char_mapping(
                train_sentences, FLAGS.lower
            )[0]  # char_mapping from loader.py; keep only the (lowercased) frequency dictionary, sorted by descending frequency
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(  # augment_with_pretrained from loader.py
                # add pretrained characters missing from the original dictionary; they must appear in the test set
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences
                                                   ])  # the test set decides which pretrained characters are added
                ))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)
            # _c is the raw frequency dictionary (key -> count). char_to_id is ordered by descending frequency, but its values are consecutive ids rather than frequencies

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(
            train_sentences)  # tag_mapping from loader.py builds the tag dictionaries; _t is the dictionary of distinct tags
        # tag_to_id: {'O': 0, 'S-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'E-ORG': 4, 'E-PER': 5, 'S-LOC': 6, 'S-ORG': 7, 'I-PER': 8, 'S-PER': 9}
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag],
                        f)  # save the dictionaries above to the map file
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(  # prepare_dataset from loader.py builds, per sentence: the characters, their ids, the segmentation-based word features, and the tag ids
        train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # Build batches of batch_size; the manager exposes the batch_data and len_data members
    # BatchManager pads the input arrays so every batch has a uniform length
    train_manager = BatchManager(
        train_data, FLAGS.batch_size)  # BatchManager class from data_utils.py
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)  # write the generated config to config_file
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)  # print the config and store it in the log directory

    # Training loop: how the loss is produced
    # limit GPU memory
    tf_config = tf.compat.v1.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # English data: steps_per_epoch = 703, i.e. the number of training batches; steps_per_epoch * 20 = total number of sentences
    # Chinese data: steps_per_epoch = 1044
    steps_per_epoch = train_manager.len_data
    # Start training the model
    with tf.compat.v1.Session(
            config=tf_config
    ) as sess:  # open a tf.Session with the config above; create_model in utils.py runs the initialization via session.run
        # Build the model graph (the layers and parameters defined in __init__), starting with a num_chars x 100 word-embedding matrix
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # create_model in utils.py builds the Model with the training dictionaries and loads embeddings via load_word2vec from data_utils.py
        logger.info("start training")
        loss = []
        # Train for 100 epochs, repeatedly feeding the train and dev sets to tune the parameters for the best F1; range(100) is a tunable choice
        for i in range(100):
            # Feed the training data to the model
            # For each batch (20 sentences), build the feed_dict: each sentence's word ids, word features, and tag ids
            # Run the model layer by layer, starting with the embedding layer
            # Look up embeddings per batch (char ids, features, and the predefined sentence-dimension flag), producing 120-dim vectors for the batch
            # Apply dropout to part of the embeddings to reduce overfitting, then feed them to the CNN for convolution
            for batch in train_manager.iter_batch(
                    shuffle=True):  # iter_batch from data_utils.py
                # batch yields the sentences in random order as the arrays described above
                # Each batch consists of 4 lists:
                # 1. the sentences themselves, e.g. ['Fairview', ',', 'Texas', ',', '$', '1.82', 'million', 'deal', 'Baa1', '-'],
                # 2. each word's position in the dictionary,
                # 3. per sentence, the list of word-length (segmentation) features,
                # 4. per sentence, each tag's position in the tag dictionary
                step, batch_loss = model.run_step(sess, True, batch)
                # loss: 60.648315 76.53908 54.006336 108.96472
                # step counts up from 1; the current loss is logged every 100 steps
                loss.append(batch_loss)
                # Log the loss every FLAGS.steps_check batches
                if step % FLAGS.steps_check == 0:  # log the loss every 100 iterations
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #14
update_tag_scheme(test_sentences, tag_scheme)

# load external features if provided
features = []
if params['feat']:
    features = load_features(params['feat'])

# prepare mappings
all_sentences = train_sentences + dev_sentences + test_sentences
mappings = prepare_mapping_bi(all_sentences, bi_train_sentences, features,
                              **params)

# If pretrained embeddings is used and all_emb flag is on,
# we augment the words by pretrained embeddings.
# if parameters['pre_emb'] and parameters['all_emb']:
updated_word_mappings = augment_with_pretrained(all_sentences,
                                                params['pre_emb'])
mappings.update(updated_word_mappings)

updated_word_mappings = augment_with_pretrained_bi(bi_train_sentences,
                                                   params['bi_pre_emb'])
mappings.update(updated_word_mappings)

# compute vocab size
params['label_size'] = len(mappings['id_to_tag'])
params['word_vocab_size'] = len(mappings['id_to_word'])
params['bi_word_vocab_size'] = len(mappings['bi_id_to_word'])
params['char_vocab_size'] = len(mappings['id_to_char'])
params['feat_vocab_size'] = [len(item) for item in mappings['id_to_feat_list']]

print("word vocab size: ", params['word_vocab_size'])
print("bi word vocab size: ", params['bi_word_vocab_size'])
Example #15
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist, otherwise load them
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # Set up the training summary (log) directory
    train_log = os.path.join(FLAGS.logdir, "train")
    if not os.path.exists(train_log):
        os.makedirs(train_log)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data  # the number of batches per epoch
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # Write out the computation graph for inspection
        train_writer = tf.summary.FileWriter(train_log, sess.graph)
        logger.info("start training")
        loss = []
        dev_f1 = []
        test_f1 = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss, merged = model.run_step(
                    sess, True, batch)  # step is the global step
                # write the summary at each iteration
                train_writer.add_summary(merged, step)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            # use dev data to validate the model
            best, dev_f1_value = evaluate(sess, model, "dev", dev_manager,
                                          id_to_tag, logger)
            # store the dev f1
            dev_f1.append(dev_f1_value)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            # use the current model on the test set
            _, test_f1_value = evaluate(sess, model, "test", test_manager,
                                        id_to_tag, logger)
            #   store the test f1
            test_f1.append(test_f1_value)
        # write the dev_f1 and test_f1 to file
        f1_result = {}
        f1_result["dev_f1"] = dev_f1
        f1_result["test_f1"] = test_f1
        write_data_to_file(f1_result, "f1_result")
Example #16
def train():
    # load data sets: returns the corpus as [['char', 'tag'], ...] pairs
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # loader.py handles the data processing

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):

        # create dictionary for word
        if FLAGS.pre_emb:  # whether to use pretrained embeddings
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # dico_chars_train receives only the dico (note the trailing [0]), i.e. the distinct characters of the training data with their counts

            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences]))
                # chain.from_iterable(iterables): an alternate chain constructor that flattens the iterables into one sequence
                # so this list contains the characters appearing in test_sentences
            )
            # dico_chars here is the train-set dictionary extended with the test-set characters covered by wiki_100
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
            # pickle serialization lets us persist these in-memory objects to a file
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    # xxx_data stores, per sentence: [characters, character ids, tag ids (or an all-'O' id list of len(chars) when train=False), tags]
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data,
                                 FLAGS.batch_size)  # the default batch_size is 20
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # Three BatchManager instances are created; each exposes batch_data and len_data
    # batch_data is the data sorted by sentence length and split into batches, with every batch padded to a uniform length
    # len_data is the number of batches

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto(
    )  # tf.ConfigProto is typically used when creating a session to configure its parameters
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #17
File: main.py  Project: ypycsy/CDTL-PSE
def train(X_train,X_dev,X_test):
    # load data sets
    train_sentences = X_train
    dev_sentences = X_dev
    test_sentences = X_test

    train_sentences_loc = load_sentences(FLAGS.train_file_loc, FLAGS.lower, FLAGS.zeros)
    dev_sentences_loc = load_sentences(FLAGS.dev_file_loc, FLAGS.lower, FLAGS.zeros)
    test_sentences_loc = load_sentences(FLAGS.test_file_loc, FLAGS.lower, FLAGS.zeros)
    train_sentences_org = load_sentences(FLAGS.train_file_org, FLAGS.lower, FLAGS.zeros)
    dev_sentences_org = load_sentences(FLAGS.dev_file_org, FLAGS.lower, FLAGS.zeros)
    test_sentences_org = load_sentences(FLAGS.test_file_org, FLAGS.lower, FLAGS.zeros)
    train_sentences_per = load_sentences(FLAGS.train_file_per, FLAGS.lower, FLAGS.zeros)
    dev_sentences_per = load_sentences(FLAGS.dev_file_per, FLAGS.lower, FLAGS.zeros)
    test_sentences_per = load_sentences(FLAGS.test_file_per, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    update_tag_scheme(train_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_org, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_org, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
            dico_chars_train_loc = char_mapping(train_sentences_loc, FLAGS.lower)[0]
            dico_chars_loc, char_to_id_loc, id_to_char_loc = augment_with_pretrained(
                dico_chars_train_loc.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_loc])
                )
            )
            dico_chars_train_per = char_mapping(train_sentences_per, FLAGS.lower)[0]
            dico_chars_per, char_to_id_per, id_to_char_per = augment_with_pretrained(
                dico_chars_train_per.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_per])
                )
            )
            dico_chars_train_org = char_mapping(train_sentences_org, FLAGS.lower)[0]
            dico_chars_org, char_to_id_org, id_to_char_org = augment_with_pretrained(
                dico_chars_train_org.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_org])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
            _c_loc, char_to_id_loc, id_to_char_loc = char_mapping(train_sentences_loc, FLAGS.lower)
            _c_per, char_to_id_per, id_to_char_per = char_mapping(train_sentences_per, FLAGS.lower)
            _c_org, char_to_id_org, id_to_char_org = char_mapping(train_sentences_org, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        _t_loc, tag_to_id_loc, id_to_tag_loc = tag_mapping(train_sentences_loc)
        _t_per, tag_to_id_per, id_to_tag_per = tag_mapping(train_sentences_per)
        _t_org, tag_to_id_org, id_to_tag_org = tag_mapping(train_sentences_org)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag,
                         char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,
                         char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,
                         char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            (char_to_id, id_to_char, tag_to_id, id_to_tag,
             char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,
             char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,
             char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org) = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data),len(dev_data), len(test_data)))
    train_data_loc = prepare_dataset_ner(
        train_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    dev_data_loc = prepare_dataset_ner(
        dev_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    test_data_loc = prepare_dataset_ner(
        test_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    print("%i / %i / %i sentences_loc in train / dev / test." % (
        len(train_data_loc), len(dev_data_loc), len(test_data_loc)))
    train_data_per = prepare_dataset_ner(
        train_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    dev_data_per = prepare_dataset_ner(
        dev_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    test_data_per = prepare_dataset_ner(
        test_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    print("%i / %i / %i sentences_per in train / dev / test." % (
        len(train_data_per), len(dev_data_per), len(test_data_per)))
    train_data_org = prepare_dataset_ner(
        train_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    dev_data_org = prepare_dataset_ner(
        dev_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    test_data_org = prepare_dataset_ner(
        test_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    print("%i / %i / %i sentences_org in train / dev / test." % (
        len(train_data_org), len(dev_data_org), len(test_data_org)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    train_manager_loc = BatchManager(train_data_loc, FLAGS.batch_size)
    train_manager_per = BatchManager(train_data_per, FLAGS.batch_size)
    train_manager_org = BatchManager(train_data_org, FLAGS.batch_size)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id, char_to_id_loc, tag_to_id_loc,
                              char_to_id_per, tag_to_id_per, char_to_id_org, tag_to_id_org)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    steps_per_epoch_loc = train_manager_loc.len_data
    steps_per_epoch_per = train_manager_per.len_data
    steps_per_epoch_org = train_manager_org.len_data
    model = create_model(Model, FLAGS.ckpt_path, load_word2vec, config,
                         id_to_char, id_to_char_loc, id_to_char_per, id_to_char_org, logger)

    with tf.Session(config=tf_config, graph=model.graph) as sess:

        sess.run(tf.global_variables_initializer())
        if config["pre_emb"]:
            emb_weights = sess.run(model.char_lookup.read_value())
            emb_weights_ner = sess.run(model.char_lookup.read_value())
            emb_weights, emb_weights_ner = load_word2vec(config["emb_file"], id_to_char, id_to_char_loc,id_to_char_per,id_to_char_org, config["char_dim"],
                                                    emb_weights, emb_weights_ner)
            sess.run(model.char_lookup.assign(emb_weights))
            logger.info("Load pre-trained embedding.")
        logger.info("start training")
        loss = []
        loss_loc = []
        loss_per = []
        loss_org = []
        for i in range(100):
            for batch_loc in train_manager_loc.iter_batch(shuffle=True):
                    step_loc, batch_loss_loc = model.run_step_ner(sess, True, batch_loc)
                    loss_loc.append(batch_loss_loc)
                    if step_loc % FLAGS.steps_check == 0:
                        iteration_loc = step_loc // steps_per_epoch_loc + 1
                        logger.info("iteration:{} step_loc:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration_loc, step_loc % steps_per_epoch_loc, steps_per_epoch_loc, np.mean(loss_loc)))
                        loss_loc = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_1 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                            "SKILL loss:{:>9.6f}".format(
                        iteration_1, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_loc_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_loc_test = model.precision(sess, test_manager, id_to_tag)
            for batch_per in train_manager_per.iter_batch(shuffle=True):
                    step_per, batch_loss_per = model.run_step_ner(sess, True, batch_per)
                    loss_per.append(batch_loss_per)
                    if step_per % FLAGS.steps_check == 0:
                        iteration_per = step_per // steps_per_epoch_per + 1
                        logger.info("iteration:{} step_per:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration_per, step_per % steps_per_epoch_per, steps_per_epoch_per, np.mean(loss_per)))
                        loss_per = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_2 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                            "SKILL loss:{:>9.6f}".format(
                        iteration_2, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_per_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_per_test = model.precision(sess, test_manager, id_to_tag)
            for batch_org in train_manager_org.iter_batch(shuffle=True):
                    step_org, batch_loss_org = model.run_step_ner(sess, True, batch_org)
                    loss_org.append(batch_loss_org)
                    if step_org % FLAGS.steps_check == 0:
                        iteration_org = step_org // steps_per_epoch_org + 1
                        logger.info("iteration:{} step_org:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration_org, step_org % steps_per_epoch_org, steps_per_epoch_org, np.mean(loss_org)))
                        loss_org = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_3 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                            "SKILL loss:{:>9.6f}".format(
                        iteration_3, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_org_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_org_test = model.precision(sess, test_manager, id_to_tag)
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag,
                            precision_loc_dev, precision_per_dev, precision_org_dev, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                best_test, results = evaluate(sess, model, "test", test_manager, id_to_tag,
                                              precision_loc_test, precision_per_test, precision_org_test, logger)
                with open("CDTL_PSE-result.csv", "a", encoding='utf-8') as st_re:
                    st_re.write(str(results).replace("[", "").replace("]", ""))
                    st_re.write("\n")
예제 #18
0
def main(argv=None):  # pylint: disable=unused-argument

  # if tf.gfile.Exists(FLAGS.eval_dir):
  #   tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  # tf.gfile.MakeDirs(FLAGS.eval_dir)

  # Read parameters from command line
  opts = read_args(evaluation=True)

  # Parse parameters
  parameters = form_parameters_dict(opts)

  # Check parameters validity
  assert os.path.isfile(opts.train)
  assert os.path.isfile(opts.dev)
  assert os.path.isfile(opts.test)
  assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
  assert 0. <= parameters['dropout'] < 1.0
  assert parameters['t_s'] in ['iob', 'iobes']
  assert not parameters['all_emb'] or parameters['pre_emb']
  assert not parameters['pre_emb'] or parameters['word_dim'] > 0
  assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])

  # Check evaluation script / folders
  if not os.path.isfile(eval_script):
      raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
  if not os.path.exists(eval_temp):
      os.makedirs(eval_temp)
  if not os.path.exists(models_path):
      os.makedirs(models_path)
  event_logs_path = os.path.join(eval_temp, "eval_logs")
  # if not os.path.exists(event_logs_path):
  #     os.makedirs(event_logs_path)

  # Initialize model
  model = MainTaggerModel(parameters=parameters, models_path=models_path,
                          overwrite_mappings=opts.overwrite_mappings)
  print "MainTaggerModel location: %s" % model.model_path

  # Data parameters
  lower = parameters['lower']
  zeros = parameters['zeros']
  tag_scheme = parameters['t_s']

  max_sentence_lengths = {}
  max_word_lengths = {}

  # Load sentences
  train_sentences, max_sentence_lengths['train'], max_word_lengths['train'] = \
      loader.load_sentences(opts.train, lower, zeros)
  dev_sentences, max_sentence_lengths['dev'], max_word_lengths['dev'] = loader.load_sentences(
      opts.dev, lower, zeros)
  test_sentences, max_sentence_lengths['test'], max_word_lengths['test'] = loader.load_sentences(
      opts.test, lower, zeros)

  global_max_sentence_length, global_max_char_length = \
      calculate_global_maxes(max_sentence_lengths, max_word_lengths)

  # Use selected tagging scheme (IOB / IOBES)
  update_tag_scheme(train_sentences, tag_scheme)
  update_tag_scheme(dev_sentences, tag_scheme)
  update_tag_scheme(test_sentences, tag_scheme)

  # Create a dictionary / mapping of words
  # If we use pretrained embeddings, we add them to the dictionary.
  if parameters['pre_emb']:
      dico_words_train = word_mapping(train_sentences, lower)[0]
      dico_words, word_to_id, id_to_word = augment_with_pretrained(
          dico_words_train.copy(),
          parameters['pre_emb'],
          list(itertools.chain.from_iterable(
              [[w[0] for w in s] for s in dev_sentences + test_sentences])
          ) if not parameters['all_emb'] else None
      )
  else:
      dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
      dico_words_train = dico_words

  # Create a dictionary and a mapping for words / POS tags / tags (a generic mapping sketch follows this example)
  dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
  dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

  if opts.overwrite_mappings:
      print 'Saving the mappings to disk...'
      model.save_mappings(id_to_word, id_to_char, id_to_tag)

  model.reload_mappings()

  # Index data
  train_buckets, train_stats, train_unique_words = prepare_dataset(
      train_sentences, word_to_id, char_to_id, tag_to_id,
      global_max_sentence_length, global_max_char_length,
      lower
  )
  dev_buckets, dev_stats, dev_unique_words = prepare_dataset(
      dev_sentences, word_to_id, char_to_id, tag_to_id,
      global_max_sentence_length, global_max_char_length,
      lower
  )
  test_buckets, test_stats, test_unique_words = prepare_dataset(
      test_sentences, word_to_id, char_to_id, tag_to_id,
      global_max_sentence_length, global_max_char_length,
      lower
  )

  print "%i / %i / %i sentences in train / dev / test." % (
      len(train_stats), len(dev_stats), len(test_stats))

  print "%i / %i / %i words in train / dev / test." % (
      sum([x[0] for x in train_stats]), sum([x[0] for x in dev_stats]),
      sum([x[0] for x in test_stats]))

  print "%i / %i / %i longest sentences in train / dev / test." % (
      max([x[0] for x in train_stats]), max([x[0] for x in dev_stats]),
      max([x[0] for x in test_stats]))

  print "%i / %i / %i shortest sentences in train / dev / test." % (
      min([x[0] for x in train_stats]), min([x[0] for x in dev_stats]),
      min([x[0] for x in test_stats]))

  for i, label in [[2, 'char']]:
      print "%i / %i / %i total %s in train / dev / test." % (
          sum([sum(x[i]) for x in train_stats]), sum([sum(x[i]) for x in dev_stats]),
          sum([sum(x[i]) for x in test_stats]),
          label)

      print "%i / %i / %i max. %s lengths in train / dev / test." % (
          max([max(x[i]) for x in train_stats]), max([max(x[i]) for x in dev_stats]),
          max([max(x[i]) for x in test_stats]),
          label)

      print "%i / %i / %i min. %s lengths in train / dev / test." % (
          min([min(x[i]) for x in train_stats]), min([min(x[i]) for x in dev_stats]),
          min([min(x[i]) for x in test_stats]),
          label)

  print "Max. sentence lengths: %s" % max_sentence_lengths
  print "Max. char lengths: %s" % max_word_lengths

  for label, bin_stats, n_unique_words in [['train', train_stats, train_unique_words],
                                           ['dev', dev_stats, dev_unique_words],
                                           ['test', test_stats, test_unique_words]]:
      int32_items = len(train_stats) * (
          max_sentence_lengths[label] * (5 + max_word_lengths[label]) + 1)
      float32_items = n_unique_words * parameters['word_dim']
      total_size = int32_items + float32_items
      logging.info("Input ids size of the %s dataset is %d" % (label, int32_items))
      logging.info("Word embeddings (unique: %d) size of the %s dataset is %d" % (
          n_unique_words, label, float32_items))
      logging.info("Total size of the %s dataset is %d" % (label, total_size))

  batch_size = 5

  # Build the model
  cost, train_step, tag_scores, tag_ids, word_ids, \
  crf_transition_params, sentence_lengths, enqueue_op, placeholders = model.build(
      max_sentence_length_scalar=global_max_sentence_length,
      max_word_length_scalar=global_max_char_length,
      batch_size_scalar=batch_size,
      **parameters)

  FLAGS = tf.app.flags.FLAGS

  tf.app.flags.DEFINE_string('eval_dir', event_logs_path,
                             """Directory where to write event logs.""")
  tf.app.flags.DEFINE_string('eval_data', 'test',
                             """Either 'test' or 'train_eval'.""")
  tf.app.flags.DEFINE_string('checkpoint_dir', model.model_path,
                             """Directory where to read model checkpoints.""")
  tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                              """How often to run the eval.""")
  tf.app.flags.DEFINE_integer('num_examples', 10000,
                              """Number of examples to run.""")
  tf.app.flags.DEFINE_boolean('run_once', False,
                              """Whether to run eval only once.""")

  evaluate(model,
           dev_buckets, test_buckets,
           FLAGS, opts,
           id_to_tag,
           batch_size,
           placeholders,
           enqueue_op, tag_scores, tag_ids, word_ids, crf_transition_params, sentence_lengths,
           FLAGS.eval_dir,
           tag_scheme)
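
word_mapping, char_mapping and tag_mapping above all follow the same pattern: build a frequency dictionary and derive bidirectional item/id maps from it. A rough, hypothetical sketch of that pattern (the real loaders likely add padding/unknown entries that are omitted here):

def simple_item_mapping(sentences, index=0):
    # Hypothetical sketch of the frequency-dict + bidirectional-map pattern;
    # index 0 selects the word/character column, the last column the tag.
    freq = {}
    for s in sentences:
        for w in s:
            item = w[index]
            freq[item] = freq.get(item, 0) + 1
    sorted_items = sorted(freq.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = dict(enumerate(item for item, _ in sorted_items))
    item_to_id = {item: i for i, item in id_to_item.items()}
    return freq, item_to_id, id_to_item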
예제 #19
0
def runModelInLoop(dropout, char_dim, char_lstm_dim, word_dim, word_lstm_dim):
    #results File
    resultsPath = "/Users/Ehsan/Documents/Ehsan_General/HMQ/HMQ_Projects/DNR2/COLING-2016-Code/i2b2-2010/results/"
    for u_dropout in dropout:
        for v_char_dim in char_dim:
            for w_char_lstm_dim in char_lstm_dim:
                for x_word_dim in word_dim:
                    for y_word_lstm_dim in word_lstm_dim:
                        for dataset in datasets:
                            print "+++++++++++++++"
                            print u_dropout,v_char_dim,w_char_lstm_dim,x_word_dim,y_word_lstm_dim,dataset
                            parameters['dropout'] = u_dropout

                            parameters['char_dim'] = v_char_dim
                            parameters['char_lstm_dim'] = w_char_lstm_dim
                            parameters['word_dim'] = x_word_dim
                            parameters['word_lstm_dim'] = y_word_lstm_dim

                            # If the dataset is i2b2-2010, assign its predefined paths

                            if(dataset == "i2b2-2010"):
                                opts.train = i2b2BasePath+"train.txt"
                                opts.dev = i2b2BasePath+ "dev.txt"
                                opts.test = i2b2BasePath+ "test.txt"
                                resultsFile = resultsPath +"i2b2_2010_Results.txt"



                            # Initialize model
                            model = Model(parameters=parameters, models_path=models_path)
                            print "Model location: %s" % model.model_path

                            # Data parameters
                            lower = parameters['lower']
                            zeros = parameters['zeros']
                            tag_scheme = parameters['tag_scheme']

                            # Load sentences
                            train_sentences = loader.load_sentences(opts.train, lower, zeros)
                            dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
                            test_sentences = loader.load_sentences(opts.test, lower, zeros)

                            # Use selected tagging scheme (IOB / IOBES)
                            update_tag_scheme(train_sentences, tag_scheme)
                            update_tag_scheme(dev_sentences, tag_scheme)
                            update_tag_scheme(test_sentences, tag_scheme)

                            # Create a dictionary / mapping of words
                            # If we use pretrained embeddings, we add them to the dictionary.
                            if parameters['pre_emb']:
                                dico_words_train = word_mapping(train_sentences, lower)[0]
                                dico_words, word_to_id, id_to_word = augment_with_pretrained(
                                    dico_words_train.copy(),
                                    parameters['pre_emb'],
                                    list(itertools.chain.from_iterable(
                                        [[w[0] for w in s] for s in dev_sentences + test_sentences])
                                    ) if not parameters['all_emb'] else None
                                )
                            else:
                                dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
                                dico_words_train = dico_words

                            # Create a dictionary and a mapping for words / POS tags / tags
                            dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
                            dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

                            print "Calling the prepare_dataset :--"
                            # Index data
                            train_data = prepare_dataset(
                                train_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            dev_data = prepare_dataset(
                                dev_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            test_data = prepare_dataset(
                                test_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )

                            print "%i / %i / %i sentences in train / dev / test." % (
                                len(train_data), len(dev_data), len(test_data))

                            # Save the mappings to disk
                            print 'Saving the mappings to disk...'
                            model.save_mappings(id_to_word, id_to_char, id_to_tag)

                            # Build the model
                            f_train, f_eval = model.build(**parameters)

                            # Reload previous model values
                            if opts.reload:
                                print 'Reloading previous model...'
                                model.reload()


                            # Train network
                            #
                            singletons = set([word_to_id[k] for k, v
                                              in dico_words_train.items() if v == 1])
                            n_epochs = 2  # number of epochs over the training set
                            freq_eval = 1000  # evaluate on dev every freq_eval steps
                            best_dev = -np.inf
                            best_test = -np.inf
                            count = 0
                            for epoch in xrange(n_epochs):
                                epoch_costs = []
                                print "Starting epoch %i..." % epoch
                                for i, index in enumerate(np.random.permutation(len(train_data))):
                                    count += 1
                                    input = create_input(train_data[index], parameters, True, singletons)
                                    new_cost = f_train(*input)
                                    epoch_costs.append(new_cost)
                                    #if i % 50 == 0 and i > 0 == 0:
                                    #    print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
                                    if count % freq_eval == 0:
                                        dev_score = evaluate(parameters, f_eval, dev_sentences,
                                                             dev_data, id_to_tag, dico_tags)
                                        test_score = evaluate(parameters, f_eval, test_sentences,
                                                              test_data, id_to_tag, dico_tags)
                                        print "Score on dev: %.5f" % dev_score
                                        print "Score on test: %.5f" % test_score
                                        if dev_score > best_dev:
                                            best_dev = dev_score
                                            print "New best score on dev."+str(best_dev)
                                            # print "Saving model to disk..."
                                            # model.save()
                                        if test_score > best_test:
                                            best_test = test_score
                                            print "New best score on test."+str(best_test)
                                        # print "Config values used are : "


                                print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))
                            # Write the best dev and test scores to the file
                            del model


                            with open(resultsFile, 'a') as f:
                                    f.write("dropout: "+ str(parameters['dropout'] ) +"| char_dim:  |"+str(parameters['char_dim'])+ "| char_lstm_dim:  "+str(parameters['char_lstm_dim']) +" word_dim: "+ str(parameters['word_dim']) +" |word_lstm_dim: "+ str( parameters['word_lstm_dim'] )+" | Best Dev Score: "+str(best_dev) + " | Best Test Score: "+str(best_test) +"\n")


    return
예제 #20
0
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # Check and normalize the tag annotations of the data sets
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if not exist
    # Build the char_to_id, id_to_char, tag_to_id, id_to_tag dictionaries from the data sets and store them as a pkl file
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # Augment (extend) the character dictionary with the pre-trained embedding file, then return the char/index mappings
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        # Get the tag/index mappings
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        #with open('maps.txt','w',encoding='utf8') as f1:
        #f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n')
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Extract sentence features
    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # Split the data into single training batches (see the BatchManager sketch after this example)
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # Number of steps in one full pass over the training set
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:

        # This model creation is the core of the project
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        with tf.device("/gpu:0"):
            for i in range(100):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                                        iteration, step % steps_per_epoch,
                                        steps_per_epoch, np.mean(loss)))
                        loss = []

            # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                if i % 7 == 0:
                    save_model(sess, model, FLAGS.ckpt_path, logger)
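
BatchManager is only used as a black box above. A minimal sketch of the batching behaviour it is assumed to provide (sort by sentence length, split into fixed-size batches, pad within each batch; the field layout is hypothetical and the real class may differ):

import math
import random

class SimpleBatchManager(object):
    # Hypothetical stand-in for BatchManager: sort the data by sentence length,
    # split it into batches and pad every field to the batch maximum, giving
    # roughly a BatchNum * NumFields * BatchSize * MaxLen layout.
    def __init__(self, data, batch_size):
        data = sorted(data, key=lambda x: len(x[0]))
        num_batches = int(math.ceil(len(data) / float(batch_size)))
        self.batch_data = [self._pad(data[i * batch_size:(i + 1) * batch_size])
                           for i in range(num_batches)]
        self.len_data = len(self.batch_data)

    @staticmethod
    def _pad(batch):
        # Pad each per-token field with 0 up to the longest sentence in the
        # batch and regroup field-wise: [num_fields][batch_size][max_len].
        max_len = max(len(fields[0]) for fields in batch)
        num_fields = len(batch[0])
        return [[fields[f] + [0] * (max_len - len(fields[f])) for fields in batch]
                for f in range(num_fields)]

    def iter_batch(self, shuffle=False):
        order = list(range(self.len_data))
        if shuffle:
            random.shuffle(order)
        for idx in order:
            yield self.batch_data[idx]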
예제 #21
0
def train():
    # load data sets
    train_sentences = load_sentences(
        FLAGS.train_file, FLAGS.lower,
        FLAGS.zeros)  # dimension:num_sentence*len_sentence*2
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(
        train_sentences,
        FLAGS.tag_schema)  # dimension:num_sentence*len_sentence*2
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # if pre-trained character embeddings are used
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[
                0]  # dico_chars_train: frequency dict over characters seen in the training set
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(  # supplement dico_chars_train with characters from the test set
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:  # create the map_file
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id,
        FLAGS.lower)  # dimension: NumSentence*4*LenSentence
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(
        train_data, FLAGS.batch_size
    )  # batch_data dimension: BatchNum*4*BatchSize*MaxLenSentence
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):  # load config_file if it already exists
        config = load_config(FLAGS.config_file)
    else:  # otherwise build a new config and save it to file
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)  # write the config to the log file

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # allocate GPU memory on demand
    steps_per_epoch = train_manager.len_data  # len_data: ceil(NumSentence/BatchSize)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):  # FLAGS.max_epoch is the number of training epochs
            for batch in train_manager.iter_batch(
                    shuffle=True
            ):  # take one batch from batch_data at a time; shuffle=True randomizes the batch order
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                evaluate(sess, model, "test", test_manager, id_to_tag, logger)

    # View the TensorBoard graph by running the following code and then, in a terminal:
    # tensorboard --logdir=tensorboard_logs
    merged = tf.summary.merge_all()
    if not os.path.exists('tensorboard_logs/'):
        os.makedirs('tensorboard_logs/')
    my_writer = tf.summary.FileWriter('tensorboard_logs/', sess.graph)
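
Note that the FileWriter above is created after the `with tf.Session(...)` block has already exited. A small sketch of doing the graph export from inside the session context instead (TF 1.x API, assumed to match the version used throughout these examples):

import os
import tensorflow as tf  # TF 1.x is assumed here

def write_graph_for_tensorboard(sess, logdir='tensorboard_logs/'):
    # Sketch: export the session graph while the session is still open, then
    # inspect it with `tensorboard --logdir=tensorboard_logs`.
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    writer = tf.summary.FileWriter(logdir, sess.graph)
    writer.flush()
    writer.close()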
예제 #22
0
File: main2.py Project: lzx00000/mynew
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)  # training set: 101218 sentences
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower,
                                   FLAGS.zeros)  # dev set: 7827 sentences
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower,
                                    FLAGS.zeros)  # test set: 16804 sentences

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)  # convert IOB tags to IOBES
    update_tag_scheme(test_sentences, FLAGS.tag_schema)  # convert IOB tags to IOBES
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)  # convert IOB tags to IOBES
    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):  # check whether maps.pkl already exists
        # create dictionary for word
        if FLAGS.pre_emb:  # whether to use pre-trained character embeddings; the test set may contain characters not seen in training
            dico_chars_train = char_mapping(train_sentences,
                                            FLAGS.lower)[0]  # character-frequency dict dico_chars
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable(  # flatten into a single list
                        [[w[0] for w in s] for s in test_sentences])  # w[0] is a character
                ))  # build a dictionary entry for every character/word
        else:
            # an id for every character and every tag
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags (an id for every tag)
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)  # count tag frequencies, sort, write to file
        #with open('maps.txt','w',encoding='utf8') as f1:
        #f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n')
        with open(FLAGS.map_file, "wb") as f:  # persist the mappings to disk
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(  # convert characters/words into numeric features
        train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)  # iterate over the training set in batches of 60 sentences
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)  # create the log, result and ckpt directories
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)  # character ids and tag ids
        save_config(config, FLAGS.config_file)  # a new config_file is generated whenever the data changes
    make_path(FLAGS)  # create the log, result and ckpt directories used by the model

    log_path = os.path.join("log", FLAGS.log_file)  # path of the log file
    logger = get_logger(log_path)  # set up the log format
    print_config(config, logger)  # write the config to the log

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # let GPU memory grow on demand
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5 would instead cap the fraction of GPU memory used
    steps_per_epoch = train_manager.len_data  # number of batches per epoch
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # model initialization finished
        logger.info("start training")
        loss = []
        # with tf.device("/gpu:0"):  comment this out when no GPU is available; the CNN layers require equal-length sentences
        for i in range(100):  # number of epochs; each epoch iterates over the full data
            for batch in train_manager.iter_batch(shuffle=True):  # draw batches in random order
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

        # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)  # save the model if it beats the previous best
            if i % 7 == 0:
                save_model(sess, model, FLAGS.ckpt_path, logger)
예제 #23
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {"ner": NerProcessor}
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)  # load the BERT model configuration

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:  # NER max_seq_length must not exceed BERT's position-embedding limit (512)
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    label_list = processor.get_labels(
    )  # get the label list ["O", "B-DIS", "I-DIS", "X", "[CLS]", "[SEP]"]

    tokenizer = tokenization.FullTokenizer(  # set up the vocab: word-to-id mapping, lower-casing, etc.
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:  # use_tpu defaults to False
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.
        save_checkpoints_steps,  # how often to save the model checkpoint. 1000
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,  # 1000
            num_shards=FLAGS.num_tpu_cores,  # 8
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None  # warmup_proportion is the fraction of steps used for warm-up: e.g. with 100 training
    # steps and warmup_proportion=0.1, the first 10 steps run with a reduced learning rate
    # (lr = global_step / num_warmup_steps * init_lr); afterwards the normal (or decayed) rate is used
    # (see the warm-up sketch after this example).

    ##################
    train_sentences = load_sentences(
        os.path.join(FLAGS.data_dir, "ner.train"), FLAGS.lower,
        FLAGS.zeros)  # load the training data as a nested list: the outer list holds sentences, the inner lists hold each character and its tag
    dev_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.dev"),
                                   FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.dev"),
                                    FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences,
                      FLAGS.tag_schema)  # IOBES by default: convert IOB tags to IOBES
    update_tag_scheme(dev_sentences,
                      FLAGS.tag_schema)  # IOBES by default: convert IOB tags to IOBES
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # use pre-trained embedding
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(  # so that test-set characters unseen in training can at least use pre-trained embeddings
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences
                                                   ])  # flatten the nested lists
                ))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        # run mark_mapping
        _c, mark_to_id, id_to_mark = mark_mapping(train_sentences)

        entropy_dict = load_entropy_dict(FLAGS.entropy_dict)

        with open(map_file, "wb") as f:
            pickle.dump([
                char_to_id, id_to_char, tag_to_id, id_to_tag, mark_to_id,
                id_to_mark, entropy_dict
            ], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag, mark_to_id, id_to_mark, entropy_dict = pickle.load(
                f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 mark_to_id, entropy_dict, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               mark_to_id, entropy_dict, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                mark_to_id, entropy_dict, FLAGS.lower)

    ###############

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(
            FLAGS.data_dir, train_data)  # each returned element is an InputExample object
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.
        init_checkpoint,  # load the pre-trained BERT weights as the initialization for fine-tuning
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    filed_based_convert_examples_to_features(train_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer,
                                             train_file)

    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    filed_based_convert_examples_to_features(eval_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer,
                                             eval_file)

    token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
    with open(FLAGS.output_dir + '/label2id.pkl', 'rb') as rf:
        label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}
    if os.path.exists(token_path):
        os.remove(token_path)
    predict_examples = processor.get_test_examples(FLAGS.data_dir)

    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    # batch_labels is organized per sentence, e.g. [[1,2,0,0,1,2], [...]]
    batch_tokens, batch_labels = filed_based_convert_examples_to_features(
        predict_examples,
        label_list,
        FLAGS.max_seq_length,
        tokenizer,
        predict_file,
        mode="test")

    for actual_train_step in list(range(1000, num_train_steps,
                                        2000)) + [num_train_steps]:

        if FLAGS.do_train:
            start = time.clock()
            tf.logging.info("start training time: %f", start)
            tf.logging.info("***** Running training *****")
            tf.logging.info("  Num examples = %d", len(train_examples))
            tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
            tf.logging.info("  Num steps = %d", actual_train_step)
            train_input_fn = file_based_input_fn_builder(
                input_file=train_file,
                seq_length=FLAGS.max_seq_length,
                is_training=True,
                drop_remainder=True)
            estimator.train(input_fn=train_input_fn,
                            max_steps=actual_train_step)

            end = time.clock()
            tf.logging.info("end training time: %f", end)
            tf.logging.info("training time: %f", end - start)

        if FLAGS.do_eval:
            start = time.clock()
            tf.logging.info("start evaluation time: %f", start)

            tf.logging.info("***** Running evaluation *****")
            tf.logging.info("  Num examples = %d", len(eval_examples))
            tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
            eval_steps = None
            if FLAGS.use_tpu:
                eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
            eval_drop_remainder = True if FLAGS.use_tpu else False
            eval_input_fn = file_based_input_fn_builder(
                input_file=eval_file,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=eval_drop_remainder)
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps)
            output_eval_file = os.path.join(FLAGS.output_dir,
                                            "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                tf.logging.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

            end = time.clock()
            tf.logging.info("end evaluation time: %f", end)
            tf.logging.info("evaluation time: %f", end - start)

        if FLAGS.do_predict:
            start = time.clock()
            tf.logging.info("start predict time: %f", start)
            tf.logging.info("***** Running prediction *****")
            tf.logging.info("  Num examples = %d", len(predict_examples))
            tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
            if FLAGS.use_tpu:
                # Warning: According to tpu_estimator.py Prediction on TPU is an
                # experimental feature and hence not supported here
                raise ValueError("Prediction in TPU not supported")
            predict_drop_remainder = True if FLAGS.use_tpu else False
            predict_input_fn = file_based_input_fn_builder(
                input_file=predict_file,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=predict_drop_remainder)

            result = estimator.predict(input_fn=predict_input_fn)

            _result = []
            for prediction in result:
                _result += [prediction_id for prediction_id in prediction]

            output_predict_file = os.path.join(
                FLAGS.output_dir + "/label_test/",
                "label_test.txt-" + str(actual_train_step))
            Writer(output_predict_file, _result, batch_tokens, batch_labels,
                   id2label)

            end = time.clock()
            tf.logging.info("end predict time: %f", end)
            tf.logging.info("predict time: %f", end - start)
예제 #24
0
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    # "sentences[0]:[['我', 'O'], ['要', 'O'], ['看', 'O'], ['乌', 'B-SLOC'], ['鲁', 'I-SLOC'], ['木', 'I-SLOC'], ['齐', 'I-SLOC'], ['市', 'I-SLOC'], ['第', 'I-SLOC'], ['四', 'I-SLOC'], ['十', 'I-SLOC'], ['九', 'I-SLOC'], ['中', 'I-SLOC'], ['学', 'I-SLOC'], ['东', 'I-SLOC'], ['门', 'I-SLOC'], ['去', 'O'], ['乌', 'B-ELOC'], ['鲁', 'I-ELOC'], ['木', 'I-ELOC'], ['齐', 'I-ELOC'], ['推', 'I-ELOC'], ['拿', 'I-ELOC'], ['职', 'I-ELOC'], ['业', 'I-ELOC'], ['学', 'I-ELOC'], ['校', 'I-ELOC'], ['南', 'I-ELOC'], ['门', 'I-ELOC'], ['沿', 'O'], ['西', 'B-ROAD'], ['虹', 'I-ROAD'], ['东', 'I-ROAD'], ['路', 'I-ROAD'], ['的', 'O'], ['监', 'B-TYPE'], ['控', 'I-TYPE']]"
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    # print("train_sentences[0]:{}".format(train_sentences[0]))
    # "train_sentences[0]:[['我', 'O'], ['要', 'O'], ['看', 'O'], ['乌', 'B-SLOC'], ['鲁', 'I-SLOC'], ['木', 'I-SLOC'], ['齐', 'I-SLOC'], ['市', 'I-SLOC'], ['第', 'I-SLOC'], ['四', 'I-SLOC'], ['十', 'I-SLOC'], ['九', 'I-SLOC'], ['中', 'I-SLOC'], ['学', 'I-SLOC'], ['东', 'I-SLOC'], ['门', 'E-SLOC'], ['去', 'O'], ['乌', 'B-ELOC'], ['鲁', 'I-ELOC'], ['木', 'I-ELOC'], ['齐', 'I-ELOC'], ['推', 'I-ELOC'], ['拿', 'I-ELOC'], ['职', 'I-ELOC'], ['业', 'I-ELOC'], ['学', 'I-ELOC'], ['校', 'I-ELOC'], ['南', 'I-ELOC'], ['门', 'E-ELOC'], ['沿', 'O'], ['西', 'B-ROAD'], ['虹', 'I-ROAD'], ['东', 'I-ROAD'], ['路', 'E-ROAD'], ['的', 'O'], ['监', 'B-TYPE'], ['控', 'E-TYPE']]"
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    # print("map_file:{}".format(FLAGS.map_file))
    # print("pre_emb:{}".format(FLAGS.pre_emb))
    # map_file: maps.pkl
    # pre_emb: False
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(
                train_sentences, FLAGS.lower)[0]  # character -> count dict
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))
    # '3027 / 0 / 361 sentences in train / dev / test.'

    # print("batch_size:{}".format(FLAGS.batch_size))
    # batch_size: 20
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    # print("config_file:{}".format(FLAGS.config_file))
    # config_file: config_file
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    # log_path:log/train.log
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    # print("steps_per_epoch:{}".format(steps_per_epoch))
    # steps_per_epoch: 152
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                # print("steps_check:{}".format(FLAGS.steps_check))
                # steps_check: 100
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
            export(model, sess, "ner", "export_model")
예제 #25
0
def train():
    # load the data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Choose the tag schema (IOB / IOBES): I = inside, O = outside/other, B = begin | E = end, S = single
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
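    # Illustrative only: a two-character person name is tagged B-PER I-PER under IOB,
    # but B-PER E-PER under IOBES; a single-character entity becomes S-PER.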
    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):  # map file storing char_to_id, id_to_char, tag_to_id, id_to_tag
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences, FLAGS.id_to_tag_path, FLAGS.tag_to_id_path)
        # with open('maps.txt','w',encoding='utf8') as f1:
        # f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n')
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of lists containing indices
    # train_data[0][0]: the characters of one sentence;
    # train_data[0][1]: the id of each character;
    # train_data[0][2]: word-segmentation features: 0 for a single-character word,
    #                   and 1, 2, ..., 2, 3 for words of two or more characters;
    # train_data[0][3]: the tag of each character
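    # For example, if a (hypothetical) sentence segments into a 1-character word
    # followed by a 3-character word, its segmentation features are [0, 1, 2, 3].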
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)  # split the data into batches of size batch_size
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make paths for storing logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        # tf.device("/cpu:0") pins the ops below to the CPU (without it, TensorFlow places them on the default device, GPU:0)
        with tf.device("/cpu:0"):
            for i in range(100):
                # Train the model batch by batch. Training starts here; trace backwards from run_step to see how the whole network is trained.
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    # Log progress: iteration approximates the current epoch (derived from
                    # the global step), and the NER loss is averaged over the batches seen
                    # since the last check.
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                        loss = []

                # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                if i % 7 == 0:
                    save_model(sess, model, FLAGS.ckpt_path, logger)
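

# Hypothetical driver, not part of the original example: in this kind of project the
# FLAGS used above are defined with tf.app.flags, and train() is typically dispatched
# from an entry point roughly like the following sketch.
def main(_):
    train()


if __name__ == "__main__":
    tf.app.run(main)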
예제 #26
0
def train():
    train_sentences = load_sentences(FLAGS.train_file)
    dev_sentences = load_sentences(FLAGS.dev_file)
    test_sentences = load_sentences(FLAGS.test_file)

    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]

            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)
    train_manager = BatchManager(train_data, FLAGS.batch_size, FLAGS.num_steps)

    dev_manager = BatchManager(dev_data, 100, FLAGS.num_steps)
    test_manager = BatchManager(test_data, 100, FLAGS.num_steps)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)
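    # Restrict this process to GPU 3 and let TensorFlow use at most 90% of its
    # memory, allocating it incrementally rather than all at once.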
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(75):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{},".format(
                        iteration, step % steps_per_epoch, steps_per_epoch))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
예제 #27
0
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make paths for storing logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

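            # Dev/test evaluation is commented out in this variant, so a checkpoint is
            # simply written at the end of every epoch.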
            #best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            #if best:
            save_model(sess, model, FLAGS.ckpt_path, logger)
예제 #28
0
def main():
    # load data sets
    global args
    args = parser.parse_args()
    pp.pprint(vars(args))
    running_name = 'X'
    use_cuda = cuda_model.ifUseCuda(args.gpu_id, args.multiGpu)
    # use_cuda = False

    train_file = 'data/example.train'
    dev_file = 'data/example.dev'
    test_file = 'data/example.test'
    embedding_file = 'data/vec.txt'
    map_file = 'map.pkl'
    config_file = 'config_file_pytorch'
    tag_file = 'tag.pkl'
    embedding_easy_file = 'data/easy_embedding.npy'
    train_sentences = load_sentences(train_file)
    dev_sentences = load_sentences(dev_file)
    test_sentences = load_sentences(test_file)
    # train_sentences = dev_sentences
    update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    update_tag_scheme(dev_sentences, args.tag_schema)

    if not os.path.isfile(tag_file):
        _, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(tag_file, "wb") as f:
            pickle.dump([tag_to_id, id_to_tag], f)
    else:
        with open(tag_file, 'rb') as t:
            tag_to_id, id_to_tag = pickle.load(t)

    if not os.path.isfile(map_file):
        # create dictionary for word
        dico_chars_train = char_mapping(train_sentences)[0]
        dico_chars, char_to_id, id_to_char = augment_with_pretrained(
            dico_chars_train.copy(), embedding_file,
            list(
                itertools.chain.from_iterable([[w[0] for w in s]
                                               for s in test_sentences])))
        # _, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        with open(map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char = pickle.load(f)

    # prepare data: get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)

    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, args.batch_size)
    dev_manager = BatchManager(dev_data, 50)
    test_manager = BatchManager(test_data, 50)
    # make paths for storing logs and models if they do not exist
    # make_path(FLAGS)
    if os.path.isfile(config_file):
        config = load_config(config_file)
    else:
        config = config_model(char_to_id, tag_to_id, args)
        save_config(config, config_file)
    # make_path(running_name)

    save_places = dir_utils.save_places(running_name)

    # log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(
        os.path.join(save_places.log_save_dir,
                     '{:s}.txt'.format(dir_utils.get_date_str())))
    print_config(config, logger)

    logger.info("start training")
    # loss = []

    #Update: create model and embedding!
    model = NERModel.CNERPointer(char_dim=args.char_dim,
                                 seg_dim=args.seg_dim,
                                 hidden_dim=args.hidden_dim,
                                 max_length=15,
                                 embedding_path=embedding_file,
                                 id_to_word=id_to_char,
                                 easy_load=embedding_easy_file)
    print("Number of Params\t{:d}".format(
        sum([p.data.nelement() for p in model.parameters()])))

    #Update: this won't work!
    # model = cuda_model.convertModel2Cuda(model, gpu_id=args.gpu_id, multiGpu=args.multiGpu)
    if use_cuda:
        model = model.cuda()

    model_optim = optim.Adam(filter(lambda p: p.requires_grad,
                                    model.parameters()),
                             lr=float(args.lr))
    optim_scheduler = optim.lr_scheduler.ReduceLROnPlateau(model_optim,
                                                           'min',
                                                           patience=10)
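    # Adam is applied only to parameters that require gradients, and ReduceLROnPlateau
    # lowers the learning rate once the epoch's average training loss (passed to
    # optim_scheduler.step() below) has stopped improving for 10 epochs.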

    for epoch in range(args.start_epoch, args.nof_epoch + args.start_epoch):
        total_losses = AverageMeter()
        loc_losses = AverageMeter()
        cls_losses = AverageMeter()
        Accuracy = AverageMeter()
        IOU = AverageMeter()
        ordered_IOU = AverageMeter()
        model.train()
        pbar = progressbar.ProgressBar(max_value=train_manager.len_data)

        for batch_idx, batch in enumerate(
                train_manager.iter_batch(shuffle=True)):
            pbar.update(batch_idx)
            word_vectors = torch.LongTensor(batch[1])
            seg_vectors = torch.LongTensor(batch[2])

            batch_size = word_vectors.shape[0]
            input_length = word_vectors.shape[1]

            word_input = Variable(word_vectors)
            seg_input = Variable(seg_vectors)

            if use_cuda:
                word_input = word_input.cuda()
                seg_input = seg_input.cuda()

            tagging_BIOUS = batch[3]
            segments, max_len = convertBIOU2SegmentsBatch(
                tagging_BIOUS, id_to_tag)
            gt_positions, gt_valids = createPytorchLabels(segments, max_len)

            head_pointer_probs, head_positions, tail_pointer_probs, tail_positions, cls_scores, _ = model(
                word_input, seg_input, max_len)

            pred_positions = torch.stack([head_positions, tail_positions],
                                         dim=-1)

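            # Match each predicted (head, tail) span to a ground-truth segment
            # (presumably a Hungarian-style assignment thresholded by args.hassign_thres),
            # yielding a 0/1 score and a target location for every prediction slot.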
            assigned_scores, assigned_locations = h_assign.Assign_Batch(
                gt_positions,
                pred_positions,
                gt_valids,
                thres=args.hassign_thres)

            if np.sum(assigned_scores) >= 1:
                iou_rate, effective_positives = Metrics.get_avg_iou2(
                    np.reshape(pred_positions.data.cpu().numpy(), (-1, 2)),
                    np.reshape(assigned_locations, (-1, 2)),
                    np.reshape(
                        assigned_scores,
                        assigned_scores.shape[0] * assigned_scores.shape[1]))

                IOU.update(iou_rate / (effective_positives),
                           effective_positives)
                # ordered_IOU.update(ordered_iou_rate/(args.batch_size*args.n_outputs),args.batch_size*args.n_outputs)

                # n_effective_batches += 1

            assigned_scores = Variable(torch.LongTensor(assigned_scores),
                                       requires_grad=False)
            assigned_locations = Variable(torch.LongTensor(assigned_locations),
                                          requires_grad=False)
            if use_cuda:
                assigned_scores = assigned_scores.cuda()
                assigned_locations = assigned_locations.cuda()

            cls_scores = cls_scores.contiguous().view(-1,
                                                      cls_scores.size()[-1])
            assigned_scores = assigned_scores.contiguous().view(-1)

            cls_loss = F.cross_entropy(cls_scores, assigned_scores)

            if torch.sum(assigned_scores) > 0:
                # print("HAHA")
                assigned_head_positions = assigned_locations[:, :, 0]
                assigned_head_positions = assigned_head_positions.contiguous(
                ).view(-1)
                #
                assigned_tail_positions = assigned_locations[:, :, 1]
                assigned_tail_positions = assigned_tail_positions.contiguous(
                ).view(-1)

                head_pointer_probs = head_pointer_probs.contiguous().view(
                    -1,
                    head_pointer_probs.size()[-1])
                tail_pointer_probs = tail_pointer_probs.contiguous().view(
                    -1,
                    tail_pointer_probs.size()[-1])

                # mask here: if there are no positives in assigned_scores, there is nothing to compute ...

                assigned_head_positions = torch.masked_select(
                    assigned_head_positions, assigned_scores.byte())
                assigned_tail_positions = torch.masked_select(
                    assigned_tail_positions, assigned_scores.byte())

                head_pointer_probs = torch.index_select(
                    head_pointer_probs,
                    dim=0,
                    index=assigned_scores.nonzero().squeeze(1))
                tail_pointer_probs = torch.index_select(
                    tail_pointer_probs,
                    dim=0,
                    index=assigned_scores.nonzero().squeeze(1))

                assigned_head_positions = to_one_hot(assigned_head_positions,
                                                     input_length)
                assigned_tail_positions = to_one_hot(assigned_tail_positions,
                                                     input_length)

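                # Compare each pointer distribution against the one-hot ground-truth
                # position with an EMD-style loss (softmax applied inside EMD_L2).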
                prediction_head_loss = EMD_L2(head_pointer_probs,
                                              assigned_head_positions,
                                              needSoftMax=True)
                prediction_tail_loss = EMD_L2(tail_pointer_probs,
                                              assigned_tail_positions,
                                              needSoftMax=True)
                loc_losses.update(
                    prediction_head_loss.data.item() +
                    prediction_tail_loss.data.item(), batch_size)
                total_loss = args.alpha * (prediction_head_loss +
                                           prediction_tail_loss) + cls_loss
            else:
                total_loss = cls_loss

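            # Standard update: reset gradients, backpropagate the combined loss, clip the
            # global gradient norm to 1.0, then take an Adam step.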
            model_optim.zero_grad()
            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
            model_optim.step()
            cls_losses.update(cls_loss.data.item(), batch_size)
            total_losses.update(total_loss.item(), batch_size)

        logger.info(
            "Train -- Epoch :{:06d}, LR: {:.6f},\tloss={:.4f}, \t c-loss:{:.4f}, \tloc-loss:{:.4f}\tcls-Accuracy:{:.4f}\tloc-Avg-IOU:{:.4f}\t topIOU:{:.4f}"
            .format(epoch, model_optim.param_groups[0]['lr'], total_losses.avg,
                    cls_losses.avg, loc_losses.avg, Accuracy.avg, IOU.avg,
                    ordered_IOU.avg))

        optim_scheduler.step(total_losses.avg)

        total_losses = AverageMeter()
        loc_losses = AverageMeter()
        cls_losses = AverageMeter()
        Accuracy = AverageMeter()
        IOU = AverageMeter()
        ordered_IOU = AverageMeter()
        model.eval()
        pbar = progressbar.ProgressBar(max_value=dev_manager.len_data)

        for batch_idx, batch in enumerate(
                dev_manager.iter_batch(shuffle=True)):
            pbar.update(batch_idx)
            word_vectors = torch.LongTensor(batch[1])
            seg_vectors = torch.LongTensor(batch[2])

            batch_size = word_vectors.shape[0]
            input_length = word_vectors.shape[1]

            word_input = Variable(word_vectors)
            seg_input = Variable(seg_vectors)

            if use_cuda:
                word_input = word_input.cuda()
                seg_input = seg_input.cuda()

            tagging_BIOUS = batch[3]
            segments, max_len = convertBIOU2SegmentsBatch(
                tagging_BIOUS, id_to_tag)

            head_pointer_probs, head_positions, tail_pointer_probs, tail_positions, cls_scores, _ = model(
                word_input, seg_input, max_len)

            pred_positions = torch.stack([head_positions, tail_positions],
                                         dim=-1)
            gt_positions, gt_valids = createPytorchLabels(segments, max_len)

            assigned_scores, assigned_locations = h_assign.Assign_Batch(
                gt_positions,
                pred_positions,
                gt_valids,
                thres=args.hassign_thres)

            if np.sum(assigned_scores) >= 1:
                iou_rate, effective_positives = Metrics.get_avg_iou2(
                    np.reshape(pred_positions.data.cpu().numpy(), (-1, 2)),
                    np.reshape(assigned_locations, (-1, 2)),
                    np.reshape(
                        assigned_scores,
                        assigned_scores.shape[0] * assigned_scores.shape[1]))

                IOU.update(iou_rate / (effective_positives),
                           effective_positives)
                # ordered_IOU.update(ordered_iou_rate/(args.batch_size*args.n_outputs),args.batch_size*args.n_outputs)

                # n_effective_batches += 1

            assigned_scores = Variable(torch.LongTensor(assigned_scores),
                                       requires_grad=False)
            assigned_locations = Variable(torch.LongTensor(assigned_locations),
                                          requires_grad=False)
            if use_cuda:
                assigned_scores = assigned_scores.cuda()
                assigned_locations = assigned_locations.cuda()

            cls_scores = cls_scores.contiguous().view(-1,
                                                      cls_scores.size()[-1])
            assigned_scores = assigned_scores.contiguous().view(-1)

            cls_loss = F.cross_entropy(cls_scores, assigned_scores)

            if torch.sum(assigned_scores) > 0:
                # print("HAHA")
                assigned_head_positions = assigned_locations[:, :, 0]
                assigned_head_positions = assigned_head_positions.contiguous(
                ).view(-1)
                #
                assigned_tail_positions = assigned_locations[:, :, 1]
                assigned_tail_positions = assigned_tail_positions.contiguous(
                ).view(-1)

                head_pointer_probs = head_pointer_probs.contiguous().view(
                    -1,
                    head_pointer_probs.size()[-1])
                tail_pointer_probs = tail_pointer_probs.contiguous().view(
                    -1,
                    tail_pointer_probs.size()[-1])

                # mask here: if there are no positives in assigned_scores, there is nothing to compute ...

                assigned_head_positions = torch.masked_select(
                    assigned_head_positions, assigned_scores.byte())
                assigned_tail_positions = torch.masked_select(
                    assigned_tail_positions, assigned_scores.byte())

                head_pointer_probs = torch.index_select(
                    head_pointer_probs,
                    dim=0,
                    index=assigned_scores.nonzero().squeeze(1))
                tail_pointer_probs = torch.index_select(
                    tail_pointer_probs,
                    dim=0,
                    index=assigned_scores.nonzero().squeeze(1))

                assigned_head_positions = to_one_hot(assigned_head_positions,
                                                     input_length)
                assigned_tail_positions = to_one_hot(assigned_tail_positions,
                                                     input_length)

                prediction_head_loss = EMD_L2(head_pointer_probs,
                                              assigned_head_positions,
                                              needSoftMax=True)
                prediction_tail_loss = EMD_L2(tail_pointer_probs,
                                              assigned_tail_positions,
                                              needSoftMax=True)
                loc_losses.update(
                    prediction_head_loss.data.item() +
                    prediction_tail_loss.data.item(), batch_size)
                total_loss = args.alpha * (prediction_head_loss +
                                           prediction_tail_loss) + cls_loss
            else:
                total_loss = cls_loss

            # model_optim.zero_grad()
            # total_loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
            # model_optim.step()
            cls_losses.update(cls_loss.data.item(), batch_size)
            total_losses.update(total_loss.item(), batch_size)

        logger.info(
            "Val -- Epoch :{:06d}, LR: {:.6f},\tloss={:.4f}, \t c-loss:{:.4f}, \tloc-loss:{:.4f}\tcls-Accuracy:{:.4f}\tloc-Avg-IOU:{:.4f}\t topIOU:{:.4f}"
            .format(epoch, model_optim.param_groups[0]['lr'], total_losses.avg,
                    cls_losses.avg, loc_losses.avg, Accuracy.avg, IOU.avg,
                    ordered_IOU.avg))

        if epoch % 1 == 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'loss': total_losses.avg,
                    'cls_loss': cls_losses.avg,
                    'loc_loss': loc_losses.avg,
                    'IoU': IOU.avg
                }, (epoch + 1),
                file_direcotry=save_places.model_save_dir)