Example #1
import itertools
import os
import pickle

import numpy as np
import tensorflow as tf

# load_sentences, update_tag_scheme, char_mapping, tag_mapping, pos_mapping,
# augment_with_pretrained, prepare_dataset, BatchManager, make_path,
# load_config, save_config, get_logger, print_config, create_model,
# save_model, evaluate, load_word2vec, Model and FLAGS are assumed to be
# provided by the surrounding project (its loader / utils / model modules).


def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create the mapping file if it does not exist
    if not os.path.isfile(FLAGS.map_file):
        # create a dictionary and a mapping for characters
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        # Create a dictionary and a mapping for POS tags (author: wn)
        _t_pos, pos_to_id, id_to_pos = pos_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([
                char_to_id, id_to_char, tag_to_id, id_to_tag, pos_to_id,
                id_to_pos
            ], f)
    else:
        # reload all six mappings, including the POS maps (author: wn)
        with open(FLAGS.map_file, "rb") as f:
            (char_to_id, id_to_char, tag_to_id, id_to_tag, pos_to_id,
             id_to_pos) = pickle.load(f)

    print(tag_to_id)
    print(pos_to_id)
    # prepare data: convert each sentence into parallel lists of indices

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 pos_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, pos_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                pos_to_id, FLAGS.lower)

    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    train_dev_manager = BatchManager(train_data, 100)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # create directories for logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        # author: wn (config_model now also takes the POS vocabulary)
        config = config_model(char_to_id, tag_to_id, pos_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        epoch_loss = []
        current_epoch = FLAGS.current_epoch
        while current_epoch < FLAGS.max_epoch:
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                epoch_loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.debug("iteration:{} step:{}/{}, "
                                 "NER loss:{:>9.6f}".format(
                                     iteration, step % steps_per_epoch,
                                     steps_per_epoch, np.mean(loss)))
                    # reset the running window so the next check logs a
                    # fresh average
                    loss = []
            logger.info("epoch-{} NER loss:{:>9.6f}".format(
                current_epoch, np.mean(epoch_loss)))
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess,
                           model,
                           FLAGS.ckpt_path,
                           logger,
                           current_epoch,
                           np.mean(epoch_loss),
                           remark='best_dev')
            evaluate(sess, model, "train", train_dev_manager, id_to_tag,
                     logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
            epoch_loss = []
            current_epoch += 1
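The mapping helpers used above (char_mapping, tag_mapping, pos_mapping) come from the project and are not shown on this page. As a rough sketch of the contract they appear to follow, a frequency dictionary plus a pair of id maps can be built like this (build_mapping is a hypothetical stand-in, not the project's function):

from collections import Counter

def build_mapping(items):
    """Return (frequency dict, item_to_id, id_to_item), frequent items first."""
    dico = Counter(items)
    # deterministic order: descending frequency, then lexicographic
    ranked = sorted(dico.items(), key=lambda kv: (-kv[1], kv[0]))
    item_to_id = {item: i for i, (item, _) in enumerate(ranked)}
    id_to_item = {i: item for item, i in item_to_id.items()}
    return dict(dico), item_to_id, id_to_item

# sentences shaped like the loader output: lists of [char, ..., tag] rows
sentences = [[["上", "B-LOC"], ["海", "I-LOC"], ["。", "O"]]]
_d, tag_to_id, id_to_tag = build_mapping(w[-1] for s in sentences for w in s)
print(tag_to_id)  # {'B-LOC': 0, 'I-LOC': 1, 'O': 2}

Building the id maps once and pickling them, as the example does, keeps train-time and inference-time vocabularies identical.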
Example #2
# Create a dictionary / mapping of words, optionally augmented with
# pretrained embeddings
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(), parameters['pre_emb'],
        list(
            itertools.chain.from_iterable([[w[0] for w in s]
                                           for s in dev_sentences +
                                           test_sentences]))
        if not parameters['all_emb'] else None)
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create dictionaries and mappings for chars / NER tags / POS tags
if opts.reload is None:
    if opts.train_true:
        dico_chars, char_to_id, id_to_char = char_mapping(train_sentences +
                                                          train_true_sentences)
        dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences +
                                                      train_true_sentences)
        dico_POSs, POS_to_id, id_to_POS = pos_mapping(train_sentences +
                                                      train_true_sentences)
    else:
        dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
        dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        dico_POSs, POS_to_id, id_to_POS = pos_mapping(train_sentences)

if opts.reload is not None:
    word_to_id, char_to_id, tag_to_id, POS_to_id = [
        {v: k
         for k, v in x.items()} for x in
        [model.id_to_word, model.id_to_char, model.id_to_tag, model.id_to_POS]
    ]

    id_to_tag = model.id_to_tag
    id_to_char = model.id_to_char
    id_to_word = model.id_to_word
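On reload, the forward maps are rebuilt by inverting the stored id_to_* dictionaries with a single list comprehension. The idiom in isolation (the values below are made up for illustration):

# invert several id -> item maps into item -> id maps in one pass
id_to_word = {0: '<UNK>', 1: 'the', 2: 'cat'}
id_to_tag = {0: 'O', 1: 'B-PER'}

word_to_id, tag_to_id = [
    {v: k for k, v in m.items()} for m in (id_to_word, id_to_tag)
]
assert word_to_id['cat'] == 2 and tag_to_id['B-PER'] == 1

The inversion is safe because each id map is a bijection: every id points at exactly one item.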
Example #3
train_sentences = loader.load_sentences(opts.train, lower, zeros)
dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
test_sentences = loader.load_sentences(opts.test, lower, zeros)

# Select the tagging scheme (IOB / IOBES)
#update_tag_scheme(train_sentences, 'iobes')
#update_tag_scheme(dev_sentences, 'iobes')
#update_tag_scheme(test_sentences, 'iobes')

# Build dictionaries and mappings for words / chars / NER tags / POS tags
# from the training set
dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
dico_words_train = dico_words
id_to_char = {}
if opts.char_dim:
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
dico_pos_tags, pos_tag_to_id, id_to_pos_tag = pos_mapping(train_sentences)

n_tag = len(id_to_tag)
n_pos_tag = len(id_to_pos_tag)

# Index data
if opts.char_dim:
    train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                 tag_to_id, lower)
    dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id,
                               tag_to_id, lower)
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                tag_to_id, lower)
else:
    train_data = prepare_dataset_(train_sentences, word_to_id, tag_to_id,
                                  pos_tag_to_id, lower)
    dev_data = prepare_dataset_(dev_sentences, word_to_id, tag_to_id,
                                pos_tag_to_id, lower)
    test_data = prepare_dataset_(test_sentences, word_to_id, tag_to_id,
                                 pos_tag_to_id, lower)