Example #1
    parser.add_argument(
        "--gaz", default=[], nargs="+",
        help="paths to gazetteer files."
    )
    args = parser.parse_args()

    # external features
    parameters = dict()
    parameters['lower'] = args.lower == 1
    parameters['upenn_stem'] = args.upenn_stem
    parameters['pos_model'] = args.pos_model
    parameters['cluster'] = args.cluster
    parameters['ying_stem'] = args.ying_stem
    parameters['gaz'] = args.gaz

    sentences = load_sentences(args.bio_input)

    feats, stem = generate_features(sentences, parameters)

    # output bio with features
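    # The generated features are spliced into each token's columns at index args.feat_column.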
    if feats:
        bio = []
        for i, s in enumerate(sentences):
            bio_s = []
            for j, w in enumerate(s):
                bio_s.append(' '.join(w[:args.feat_column] + feats[i][j] +
                                      w[args.feat_column:]))
            bio.append('\n'.join(bio_s))
        with open(args.bio_output, 'w') as f:
            f.write('\n\n'.join(bio))
    else:
Example #2
if not os.path.isfile(eval_script):
    raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
if not os.path.exists(eval_temp):
    os.makedirs(eval_temp)
if not os.path.exists(models_path):
    os.makedirs(models_path)

# Initialize model
model = Model(parameters=parameters, models_path=models_path)
print("Model location: %s" % model.model_path)

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['tag_scheme']
# Load sentences
train_sentences = loader.load_sentences(opts.train, lower, zeros)
# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme)
# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
dico_words_train = dico_words
# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Save the mappings to disk
print('Saving the mappings to disk...')
# How does this work? Should the mappings live on disk and be loaded every time?
model.save_mappings(id_to_word, id_to_char, id_to_tag)
Example #3
logging.basicConfig(format="[%(levelname)s] %(asctime)-15s: %(message)s",
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize model
model = Model(parameters=parameters, models_path=models_path)
logging.info("Model location: %s" % model.model_path)

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
##tag_scheme = parameters['tag_scheme']

# Load sentences
train_sentences = loader.load_sentences(opts.train, lower, zeros)
dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
test_sentences = loader.load_sentences(opts.test, lower, zeros)

# Use selected tagging scheme (IOB / IOBES)
##update_tag_scheme(train_sentences, tag_scheme)
##update_tag_scheme(dev_sentences, tag_scheme)
##update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(), parameters['pre_emb'], None)
else:
Example #4
if not os.path.isfile(eval_script):
    raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
if not os.path.exists(eval_temp):
    os.makedirs(eval_temp)
if not os.path.exists(models_path):
    os.makedirs(models_path)

args.train_crfFeaturesFile = args.train
from thyme_code_oct2016 import preprocessingscript_thyme
preprocessingscript_thyme.loadFeatures2Tokens(args)

lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['tag_scheme']

train_sentences = loader.load_sentences(args.train, lower, zeros)
dev_sentences = loader.load_sentences(args.dev, lower, zeros)
test_sentences = loader.load_sentences(args.test, lower, zeros)
test_train_sentences = loader.load_sentences(args.test_train, lower, zeros)

update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)
update_tag_scheme(test_train_sentences, tag_scheme)

dico_words_train = word_mapping(train_sentences, lower)[0]

dico_words, word_to_id, id_to_word = augment_with_pretrained(
    dico_words_train.copy(), parameters['pre_emb'],
    list(
        itertools.chain.from_iterable([[w[0] for w in s]
Example #5
                            overwrite_mappings=opts.overwrite_mappings)
else:
    # Initialize model
    model = MainTaggerModel(parameters=parameters, models_path=models_path, overwrite_mappings=opts.overwrite_mappings)
print("MainTaggerModel location: %s" % model.model_path)

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['t_s']

max_sentence_lengths = {}
max_word_lengths = {}

# Load sentences
train_sentences, max_sentence_lengths['train'], max_word_lengths['train'] = loader.load_sentences(opts.train, lower, zeros)
dev_sentences, max_sentence_lengths['dev'], max_word_lengths['dev'] = loader.load_sentences(opts.dev, lower, zeros)
test_sentences, max_sentence_lengths['test'], max_word_lengths['test'] = loader.load_sentences(opts.test, lower, zeros)

if parameters['test_with_yuret'] or parameters['train_with_yuret']:
    # train.merge and test.merge
    yuret_train_sentences, max_sentence_lengths['yuret_train'], max_word_lengths['yuret_train'] = \
        loader.load_sentences(opts.yuret_train, lower, zeros)
    yuret_test_sentences, max_sentence_lengths['yuret_test'], max_word_lengths['yuret_test'] = \
        loader.load_sentences(opts.yuret_test, lower, zeros)
    update_tag_scheme(yuret_train_sentences, tag_scheme)
    update_tag_scheme(yuret_test_sentences, tag_scheme)
else:
    yuret_train_sentences = []
    yuret_test_sentences = []
Example #6
    # Initialize model
    model = Model(parameters=parameters, models_path=models_path)
    print("Model location: %s" % model.model_path)
    
    # Data parameters
    lower = parameters['lower']
    zeros = parameters['zeros']
    tag_scheme = parameters['tag_scheme']
    

    train_path = "iob/training_data_iob_" + str(x) + ".txt"
    vali_path = "iob/validation_data_iob_" + str(x) + ".txt"     
    test_path = "iob/test_data_iob_" + str(x) + ".txt"    
    
    # Load sentences
    train_sentences = loader.load_sentences(train_path, lower, zeros)
    dev_sentences = loader.load_sentences(vali_path, lower, zeros)
    test_sentences = loader.load_sentences(test_path, lower, zeros)
    
    # check that the tags use the right scheme, just (I, O, B)
    #check_tag_scheme(train_sentences, tag_scheme)
    #check_tag_scheme(dev_sentences, tag_scheme)
    #check_tag_scheme(test_sentences, tag_scheme)
        
    # Create a dictionary / mapping of words
    # If we use pretrained embeddings, we add them to the dictionary.
    if parameters['pre_emb']:
        dico_words_train = word_mapping(train_sentences, lower)[0]
        dico_words, word_to_id, id_to_word = augment_with_pretrained(
            dico_words_train.copy(),
            parameters['pre_emb'],
Example #7
def train():
    train_sentences = load_sentences(
        FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    # select the tagging scheme: IOB or IOBES
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(
                train_sentences, FLAGS.lower)
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
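        # Persist the mappings so evaluation/inference can rebuild the exact same vocabulary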
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)
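    # limit GPU memory: allocate on demand instead of reserving the whole card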
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path,
                             load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #8
sys.stdout = Tee(sys.stdout, f)

print('Training data: %s' % args.train)
print('Dev data: %s' % args.dev)
print('Test data: %s' % args.test)
print("Model location: %s" % model_dir)
print('Model parameters:')
for k, v in parameters.items():
    print('%s=%s' % (k, v))

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']

# Load sentences
train_sentences = load_sentences(args.train, lower, zeros)
dev_sentences = load_sentences(args.dev, lower, zeros)
test_sentences = load_sentences(args.test, lower, zeros)

# train_sentences = train_sentences[:50]
# dev_sentences = dev_sentences[:50]
# test_sentences = test_sentences[:50]

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(), parameters['pre_emb'],
        list(
            itertools.chain.from_iterable([[w[0] for w in s]
Example #9
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist, load data if exists maps
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # set up the training log directory
    train_log = os.path.join(FLAGS.logdir, "train")
    if not os.path.exists(train_log):
        os.makedirs(train_log)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data  # the nums of batch data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # write the graph so it can be inspected in TensorBoard
        train_writer = tf.summary.FileWriter(train_log, sess.graph)
        logger.info("start training")
        loss = []
        dev_f1 = []
        test_f1 = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss, merged = model.run_step(
                    sess, True, batch)  # step is the global step
                # write this step's summaries to the training log
                train_writer.add_summary(merged, step)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            # use dev data to validate the model
            best, dev_f1_value = evaluate(sess, model, "dev", dev_manager,
                                          id_to_tag, logger)
            # store the dev f1
            dev_f1.append(dev_f1_value)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            # use the current model on the test set
            _, test_f1_value = evaluate(sess, model, "test", test_manager,
                                        id_to_tag, logger)
            # store the test f1
            test_f1.append(test_f1_value)
        # write the dev_f1 and test_f1 to file
        f1_result = {}
        f1_result["dev_f1"] = dev_f1
        f1_result["test_f1"] = test_f1
        write_data_to_file(f1_result, "f1_result")
Example #10
print('Reloading previous model...')
_, f_eval = model.build(training=False, **parameters)
model.reload()


print("--------")
print(opts.test_folder)
arr = os.listdir(opts.test_folder)
arr.sort()  # sort in place; list.sort() returns None, so don't print it

for test_file in arr:
    print(test_file)
    if test_file.endswith(".conll") or test_file.endswith(".txt"):
        out_file = test_file.replace(".conll", "") + ".txt"

        test_sentences = loader.load_sentences(opts.test_folder + "/" + test_file, lower, zeros)

        update_tag_scheme(test_sentences, tag_scheme)

        test_data = prepare_dataset2(
            test_sentences, word_to_id, char_to_id, tag_to_id, model.feature_maps, lower
        )

        print("input: ", test_file, ":", len(test_sentences), len(test_data))
        score_file = opts.out_folder + "/" + opts.prefix + out_file
        print("output: ", score_file)

        test_score, iob_test_score, s_result_test, eval_lines, log = evaluate(parameters, f_eval, test_sentences,
                                  test_data, model.id_to_tag, blog=opts.log, eval_script=eval_script)

Example #11
parameters['all_emb'] = opts.all_emb == 1
parameters['cap_dim'] = opts.cap_dim
parameters['crf'] = opts.crf == 1
parameters['dropout'] = opts.dropout
parameters['reload'] = opts.reload == 1
parameters['name'] = opts.name
parameters['char_mode'] = opts.char_mode

parameters['use_gpu'] = opts.use_gpu == 1 and torch.cuda.is_available()
use_gpu = parameters['use_gpu']

lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['tag_scheme']

train_sentences = loader.load_sentences(opts.train, lower, zeros)
dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
test_sentences = loader.load_sentences(opts.test, lower, zeros)
test_train_sentences = loader.load_sentences(opts.test_train, lower, zeros)
test_single_out_sentences = loader.load_sentences(
    "./dataset/origin_single_out.txt", lower, zeros)

name = parameters['name']
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)
update_tag_scheme(test_train_sentences, tag_scheme)
update_tag_scheme(test_single_out_sentences, tag_scheme)

dico_words_train = word_mapping(train_sentences, lower)[0]
Example #12
    def train(self,
              n_epochs=100,
              freq_eval=1000,
              verbose=True,
              eval_test_set=False):
        """
        :param n_epochs: number of epochs over the training set
        :param freq_eval: evaluate on dev every freq_eval steps
        :return: Saves the model with the best F1-Score, evaluated on the dev set
        """
        # Initialize model
        model = Model(parameters=self.parameters, models_path=models_path)
        print("Model location: %s" % model.model_path)

        # Data parameters
        lower = self.parameters['lower']
        zeros = self.parameters['zeros']
        tag_scheme = self.parameters['tag_scheme']

        # Load sentences
        train_sentences = loader.load_sentences(self.parameters['train'],
                                                lower, zeros)
        dev_sentences = loader.load_sentences(self.parameters['dev'], lower,
                                              zeros)
        test_sentences = loader.load_sentences(self.parameters['test'], lower,
                                               zeros)

        # Use selected tagging scheme (IOB / IOBES)
        update_tag_scheme(train_sentences, tag_scheme)
        update_tag_scheme(dev_sentences, tag_scheme)
        update_tag_scheme(test_sentences, tag_scheme)

        # Create a dictionary / mapping of words
        # If we use pretrained embeddings, we add them to the dictionary.
        if self.parameters['pre_emb']:
            dico_words_train = word_mapping(train_sentences, lower)[0]
            dico_words, word_to_id, id_to_word = augment_with_pretrained(
                dico_words_train.copy(), self.parameters['pre_emb'],
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in dev_sentences +
                                                   test_sentences]))
                if not self.parameters['all_emb'] else None)
        else:
            dico_words, word_to_id, id_to_word = word_mapping(
                train_sentences, lower)
            dico_words_train = dico_words

        # Create a dictionary and a mapping for words / POS tags / tags
        dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
        dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        # Index data
        train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                     tag_to_id, lower)
        dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id,
                                   tag_to_id, lower)
        test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                    tag_to_id, lower)

        print("%i / %i / %i sentences in train / dev / test." %
              (len(train_data), len(dev_data), len(test_data)))

        # Save the mappings to disk
        print('Saving the mappings to disk...')
        model.save_mappings(id_to_word, id_to_char, id_to_tag)

        # Build the model
        f_train, f_eval = model.build(**self.parameters)

        # Reload previous model values
        if self.parameters['reload']:
            print('Reloading previous model...')
            model.reload()

        #
        # Train network
        #
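        # Words that occur only once in the training data are singletons;
        # create_input can randomly replace them with the unknown token so
        # the model learns a representation for out-of-vocabulary words.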
        singletons = set(
            [word_to_id[k] for k, v in dico_words_train.items() if v == 1])
        best_dev = -np.inf
        best_test = -np.inf
        count = 0
        for epoch in range(n_epochs):
            epoch_costs = []
            print("Starting epoch %i at..." % epoch, time.ctime())
            for i, index in enumerate(np.random.permutation(len(train_data))):
                count += 1
                inputs = create_input(train_data[index], self.parameters, True,
                                      singletons)
                new_cost = f_train(*inputs)
                epoch_costs.append(new_cost)
                if i % 50 == 0 and i > 0 and verbose:
                    print("%i, cost average: %f" %
                          (i, np.mean(epoch_costs[-50:])))
                if count % freq_eval == 0:
                    dev_score = evaluate(self.parameters,
                                         f_eval,
                                         dev_sentences,
                                         dev_data,
                                         id_to_tag,
                                         verbose=verbose)
                    if eval_test_set:
                        test_score = evaluate(self.parameters,
                                              f_eval,
                                              test_sentences,
                                              test_data,
                                              id_to_tag,
                                              verbose=verbose)
                    print("Score on dev: %.5f" % dev_score)
                    if eval_test_set:
                        print("Score on test: %.5f" % test_score)
                    if dev_score > best_dev:
                        best_dev = dev_score
                        print("New best score on dev.")
                        print("Saving model to disk...")
                        model.save()
                    if eval_test_set:
                        if test_score > best_test:
                            best_test = test_score
                            print("New best score on test.")
            print(
                "Epoch %i done. Average cost: %f. Ended at..." %
                (epoch, np.mean(epoch_costs)), time.ctime())
        return best_dev
Example #13
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    # test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    # update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    )
    # test_data = prepare_dataset(
    #     test_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower
    # )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), 0))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, FLAGS.batch_size)
    # test_manager = BatchManager(test_data, FLAGS.batch_size)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, FLAGS.bilstm_ckpt_path, config, logger)

        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)

                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger, global_steps=step)
Example #14
    {v: k for k, v in x.items()}
    for x in [model.id_to_word, model.id_to_char]
]

print('Reloading previous model...')
_, f_eval = model.build(training=False, **parameters)
model.reload()


assert os.path.isfile(opts.test_file)
test_file = opts.test_file

out_txt = opts.out_txt
out_json = opts.out_json

test_sentences = loader.load_sentences(test_file, lower, zeros)
update_tag_scheme(test_sentences, tag_scheme)

test_data = prepare_dataset3(
    test_sentences, word_to_id, char_to_id, model.tag_maps, model.feature_maps, lower
)


print("input: ", test_file)

from pprint import pprint
pprint(model.tag_maps)

test_score, iob_test_score, result_test, _ = evaluate_multilayer(parameters, f_eval, test_sentences, test_data, model.tag_maps)
Example #15
# Use selected tagging scheme (IOB / IOBES)

# Save the mappings to disk

# Build the model
_, f_eval, f_eval_softmax = model.build(**parameters)

# Reload previous model values
print('Reloading previous model...')
model.reload()
#
# Train network
#
if opts.dev:
    dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
    dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id,
                               tag_to_id, POS_to_id, lower)
    dev_score = evaluate(parameters, f_eval, dev_sentences, dev_data,
                         id_to_tag, eval_temp, -1, 'valid')

dico_tags = []
for line in open(opts.test):
    fn = line.rstrip().split('\t')[0]

    try:
        test_sentences = loader.load_sentences(fn, lower, zeros)
        test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                    tag_to_id, POS_to_id, lower)
        outfn = line.rstrip().split('\t')[1]
        print(fn)
Example #16
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        while True:
            line = input("Enter a test sentence: ")
            result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
            print('result:', result)
            break

def main():
    if FLAGS.train:
        if FLAGS.clean:
            clean(FLAGS)
        train(X_train, X_dev, X_test)


if __name__ == "__main__":
    sentences = load_sentences("target_data/All_skill.train", True, False)
    kf = RepeatedKFold(n_splits=5, n_repeats=1)
    for train_index, test_index in kf.split(sentences):
        X_trainall, X_test = np.array(sentences)[train_index], np.array(sentences)[test_index]
        kt = RepeatedKFold(n_splits=10, n_repeats=1)
        for train_index1, test_index1 in kt.split(X_trainall):
            X_train, X_dev = np.array(X_trainall)[train_index1], np.array(X_trainall)[test_index1]
            print(' X_train,X_test,X_dev', len(X_train), len(X_test), len(X_dev))
            main()
Example #17
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        # with open(FLAGS.map_file, "wb") as f:
        #     pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)

        # author : wn
        _t_pos, pos_to_id, id_to_pos = pos_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([
                char_to_id, id_to_char, tag_to_id, id_to_tag, pos_to_id,
                id_to_pos
            ], f)
    else:
        # with open(FLAGS.map_file, "rb") as f:
        #     char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

        # author : wn
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag, pos_to_id, id_to_pos = pickle.load(
                f)

    print(tag_to_id)
    print(pos_to_id)
    # prepare data, get a collection of list containing index
    # train_data = prepare_dataset(
    #     train_sentences, char_to_id, tag_to_id, FLAGS.lower
    # )
    # dev_data = prepare_dataset(
    #     dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    # )
    # test_data = prepare_dataset(
    #     test_sentences, char_to_id, tag_to_id, FLAGS.lower
    # )

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 pos_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, pos_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                pos_to_id, FLAGS.lower)

    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    train_dev_manager = BatchManager(train_data, 100)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        # config = config_model(char_to_id, tag_to_id)

        # author : wn
        config = config_model(char_to_id, tag_to_id, pos_to_id)

        save_config(config, FLAGS.config_file)

    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        current_epoch = FLAGS.current_epoch
        while current_epoch < FLAGS.max_epoch:
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.debug("iteration:{} step:{}/{}, "
                                 "NER loss:{:>9.6f}".format(
                                     iteration, step % steps_per_epoch,
                                     steps_per_epoch, np.mean(loss)))
                    loss = []
            logger.info(
                "\n\n *******************epoch-{} NER loss:{:>9.6f}************************"
                .format(current_epoch, np.mean(loss)))
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                # save_model(sess, model, FLAGS.ckpt_path, logger)
                save_model(sess,
                           model,
                           FLAGS.ckpt_path,
                           logger,
                           current_epoch,
                           np.mean(loss),
                           remark='best_dev')
            # elif current_epoch%10 ==0 :
            #     save_model(sess, model, FLAGS.ckpt_path, logger, current_epoch, np.mean(loss))
            evaluate(sess, model, "train", train_dev_manager, id_to_tag,
                     logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
            current_epoch += 1
Example #18
def train(X_train, X_dev, X_test):
    # load data sets
    train_sentences = X_train
    dev_sentences = X_dev
    test_sentences = X_test

    train_sentences_loc = load_sentences(FLAGS.train_file_loc, FLAGS.lower, FLAGS.zeros)
    dev_sentences_loc = load_sentences(FLAGS.dev_file_loc, FLAGS.lower, FLAGS.zeros)
    test_sentences_loc = load_sentences(FLAGS.test_file_loc, FLAGS.lower, FLAGS.zeros)
    train_sentences_org = load_sentences(FLAGS.train_file_org, FLAGS.lower, FLAGS.zeros)
    dev_sentences_org = load_sentences(FLAGS.dev_file_org, FLAGS.lower, FLAGS.zeros)
    test_sentences_org = load_sentences(FLAGS.test_file_org, FLAGS.lower, FLAGS.zeros)
    train_sentences_per = load_sentences(FLAGS.train_file_per, FLAGS.lower, FLAGS.zeros)
    dev_sentences_per = load_sentences(FLAGS.dev_file_per, FLAGS.lower, FLAGS.zeros)
    test_sentences_per = load_sentences(FLAGS.test_file_per, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    update_tag_scheme(train_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_org, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_org, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
            dico_chars_train_loc = char_mapping(train_sentences_loc, FLAGS.lower)[0]
            dico_chars_loc, char_to_id_loc, id_to_char_loc = augment_with_pretrained(
                dico_chars_train_loc.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_loc])
                )
            )
            dico_chars_train_per = char_mapping(train_sentences_per, FLAGS.lower)[0]
            dico_chars_per, char_to_id_per, id_to_char_per = augment_with_pretrained(
                dico_chars_train_per.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_per])
                )
            )
            dico_chars_train_org = char_mapping(train_sentences_org, FLAGS.lower)[0]
            dico_chars_org, char_to_id_org, id_to_char_org = augment_with_pretrained(
                dico_chars_train_org.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_org])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
            _c_loc, char_to_id_loc, id_to_char_loc = char_mapping(train_sentences_loc, FLAGS.lower)
            _c_per, char_to_id_per, id_to_char_per = char_mapping(train_sentences_per, FLAGS.lower)
            _c_org, char_to_id_org, id_to_char_org = char_mapping(train_sentences_org, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        _t_loc, tag_to_id_loc, id_to_tag_loc = tag_mapping(train_sentences_loc)
        _t_per, tag_to_id_per, id_to_tag_per = tag_mapping(train_sentences_per)
        _t_org, tag_to_id_org, id_to_tag_org = tag_mapping(train_sentences_org)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag,
                         char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,
                         char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,
                         char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            (char_to_id, id_to_char, tag_to_id, id_to_tag,
             char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,
             char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,
             char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org) = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))
    train_data_loc = prepare_dataset_ner(
        train_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    dev_data_loc = prepare_dataset_ner(
        dev_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    test_data_loc = prepare_dataset_ner(
        test_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    print("%i / %i / %i sentences_loc in train / dev / test." % (
        len(train_data_loc), len(dev_data_loc), len(test_data_loc)))
    train_data_per = prepare_dataset_ner(
        train_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    dev_data_per = prepare_dataset_ner(
        dev_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    test_data_per = prepare_dataset_ner(
        test_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    print("%i / %i / %i sentences_per in train / dev / test." % (
        len(train_data_per), len(dev_data_per), len(test_data_per)))
    train_data_org = prepare_dataset_ner(
        train_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    dev_data_org = prepare_dataset_ner(
        dev_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    test_data_org = prepare_dataset_ner(
        test_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    print("%i / %i / %i sentences_org in train / dev / test." % (
        len(train_data_org), len(dev_data_org), len(test_data_org)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    train_manager_loc = BatchManager(train_data_loc, FLAGS.batch_size)
    train_manager_per = BatchManager(train_data_per, FLAGS.batch_size)
    train_manager_org = BatchManager(train_data_org, FLAGS.batch_size)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id, char_to_id_loc, tag_to_id_loc, char_to_id_per, tag_to_id_per, char_to_id_org, tag_to_id_org)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    steps_per_epoch_loc = train_manager_loc.len_data
    steps_per_epoch_per = train_manager_per.len_data
    steps_per_epoch_org = train_manager_org.len_data
    model = create_model(Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, id_to_char_loc, id_to_char_per, id_to_char_org, logger)

    with tf.Session(config=tf_config, graph=model.graph) as sess:

        sess.run(tf.global_variables_initializer())
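        # Overwrite the randomly initialized character embeddings with pretrained vectors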
        if config["pre_emb"]:
            emb_weights = sess.run(model.char_lookup.read_value())
            emb_weights_ner = sess.run(model.char_lookup.read_value())
            emb_weights, emb_weights_ner = load_word2vec(config["emb_file"], id_to_char, id_to_char_loc,id_to_char_per,id_to_char_org, config["char_dim"],
                                                    emb_weights, emb_weights_ner)
            sess.run(model.char_lookup.assign(emb_weights))
            logger.info("Load pre-trained embedding.")
        logger.info("start training")
        loss = []
        loss_loc = []
        loss_per = []
        loss_org = []
        for i in range(100):
            for batch_loc in train_manager_loc.iter_batch(shuffle=True):
                    step_loc, batch_loss_loc = model.run_step_ner(sess, True, batch_loc)
                    loss_loc.append(batch_loss_loc)
                    if step_loc % FLAGS.steps_check == 0:
                        iteration_loc = step_loc // steps_per_epoch_loc + 1
                        logger.info("iteration:{} step_loc:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration_loc, step_loc % steps_per_epoch_loc, steps_per_epoch_loc, np.mean(loss_loc)))
                        loss_loc = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_1 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                            "SKILL loss:{:>9.6f}".format(
                        iteration_1, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_loc_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_loc_test = model.precision(sess, test_manager, id_to_tag)
            for batch_per in train_manager_per.iter_batch(shuffle=True):
                    step_per, batch_loss_per = model.run_step_ner(sess, True, batch_per)
                    loss_per.append(batch_loss_per)
                    if step_per % FLAGS.steps_check == 0:
                        iteration_per = step_per // steps_per_epoch_per + 1
                        logger.info("iteration:{} step_per:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration_per, step_per % steps_per_epoch_per, steps_per_epoch_per, np.mean(loss_per)))
                        loss_per = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_2 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                            "SKILL loss:{:>9.6f}".format(
                        iteration_2, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_per_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_per_test = model.precision(sess, test_manager, id_to_tag)
            for batch_org in train_manager_org.iter_batch(shuffle=True):
                    step_org, batch_loss_org = model.run_step_ner(sess, True, batch_org)
                    loss_org.append(batch_loss_org)
                    if step_org % FLAGS.steps_check == 0:
                        iteration_org = step_org // steps_per_epoch_org + 1
                        logger.info("iteration:{} step_org:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration_org, step_org % steps_per_epoch_org, steps_per_epoch_org, np.mean(loss_org)))
                        loss_org = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_3 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                            "SKILL loss:{:>9.6f}".format(
                        iteration_3, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_org_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_org_test = model.precision(sess, test_manager, id_to_tag)
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag,precision_loc_dev,precision_per_dev,precision_org_dev, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                best_test, results = evaluate(sess, model, "test", test_manager, id_to_tag, precision_loc_test, precision_per_test, precision_org_test, logger)
                with open("CDTL_PSE-result.csv", "a", encoding='utf-8') as st_re:
                    st_re.write(str(results).replace("[", "").replace("]", ""))
                    st_re.write("\n")
Example #19
model = Model(parameters=parameters,
              models_path=models_path,
              overwrite_mappings=opts.overwrite_mappings)
print("Model location: %s" % model.model_path)

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['t_s']

max_sentence_lengths = {}
max_word_lengths = {}

# Load sentences
train_sentences, max_sentence_lengths['train'], max_word_lengths['train'] = \
    loader.load_sentences(opts.train, lower, zeros)
dev_sentences, max_sentence_lengths['dev'], max_word_lengths['dev'] = \
    loader.load_sentences(opts.dev, lower, zeros)
test_sentences, max_sentence_lengths['test'], max_word_lengths['test'] = \
    loader.load_sentences(opts.test, lower, zeros)

global_max_sentence_length, global_max_char_length = \
    calculate_global_maxes(max_sentence_lengths, max_word_lengths)

# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
Example #20
def load_data(pre_emb):

    train_sentences = loader.load_sentences('dataset/eng.train', lower, zeros)
    dev_sentences = loader.load_sentences("dataset/eng.testa", lower, zeros)
    test_sentences = loader.load_sentences("dataset/eng.testb", lower, zeros)

    update_tag_scheme(train_sentences, tag_scheme)
    update_tag_scheme(dev_sentences, tag_scheme)
    update_tag_scheme(test_sentences, tag_scheme)

    all_emb = 1

    dico_words_train = word_mapping(train_sentences, lower)[0]

    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(), pre_emb,
        list(
            itertools.chain.from_iterable([[w[0] for w in s]
                                           for s in dev_sentences +
                                           test_sentences]))
        if not all_emb else None)

    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    all_word_embeds = {}
    for line in codecs.open(pre_emb, 'r', 'utf-8'):
        s = line.strip().split()
        if len(s) == word_dim + 1:
            all_word_embeds[s[0]] = np.array([float(x) for x in s[1:]])

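    # Random uniform init; entries are overwritten below wherever a pretrained
    # vector exists (exact match first, then lowercased).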
    word_embeds = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06),
                                    (len(word_to_id), word_dim))

    for w in word_to_id:
        if w in all_word_embeds:
            word_embeds[word_to_id[w]] = all_word_embeds[w]
        elif w.lower() in all_word_embeds:
            word_embeds[word_to_id[w]] = all_word_embeds[w.lower()]

    print('Loaded %i pretrained embeddings.' % len(all_word_embeds))

    pretrained_word_list = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(pre_emb, 'r', 'utf-8') if len(pre_emb) > 0
    ])

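    # Keep only ASCII pretrained words whose characters all appear in the
    # training character vocabulary (dico_chars).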
    filter_ppdb_list = []
    for i in pretrained_word_list:
        if re.match('[^\x00-\x7F]+', i):
            continue
        else:
            if all(w in dico_chars for w in i):
                filter_ppdb_list.append(i)

    filter_ppdb_list = set(filter_ppdb_list) | set(dico_words_train.keys())

    train_sentences_packed = packed_data(train_sentences)
    dev_sentences_packed = packed_data(dev_sentences)
    test_sentences_packed = packed_data(test_sentences)

    return train_sentences_packed, dev_sentences_packed, test_sentences_packed, word_embeds, word_to_id, filter_ppdb_list
Example #21
mapping_file = 'models/mapping.pkl'
print(parameters)
eval_script = "./evaluation/conlleval.pl"
eval_temp = "./evaluation/temp"
print("eval_script = ", eval_script)
print("eval_temp = ", eval_temp)

lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['tag_scheme']

train_sentences = loader.load_sentences(parameters["train"], lower, zeros)
dev_sentences = loader.load_sentences(parameters["dev"], lower, zeros)
test_sentences = loader.load_sentences(parameters["test"], lower, zeros)
test_train_sentences = loader.load_sentences(parameters["test_train"], lower, zeros)

update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

dico_words_train = word_mapping(train_sentences, lower)[0]
dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(),
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
Example #22
def test_inference_performance():
    from sklearn.metrics import f1_score
    from torchtext.datasets import SequenceTaggingDataset
    from torchtext.data import Field, NestedField

    WORD = Field(init_token='<bos>', eos_token='<eos>')
    CHAR_NESTING = Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
    CHAR = NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>')
    ENTITY = Field(init_token='<bos>', eos_token='<eos>')

    data_file = tempfile.NamedTemporaryFile(delete=True)

    # TODO Need to be decoded in Python 3
    data_file.write(requests.get(CORA_URL).content)

    fields = [(('text', 'char'),
               (WORD, CHAR))] + [(None, None)] * 22 + [('entity', ENTITY)]

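    # The dataset has 24 whitespace-separated columns: the token comes first,
    # the entity tag last; the 22 columns in between are ignored.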
    dataset = SequenceTaggingDataset(data_file.name, fields, separator=" ")

    model = Model(model_path='models/neuralParsCit')
    model.parameters['pre_emb'] = os.path.join(os.getcwd(),
                                               'vectors_with_unk.kv')
    f = model.build(training=False, **model.parameters)

    model.reload()

    word_to_id = {v: i for i, v in model.id_to_word.items()}
    char_to_id = {v: i for i, v in model.id_to_char.items()}
    tag_to_id = {tag: i for i, tag in model.id_to_tag.items()}

    tf = tempfile.NamedTemporaryFile(delete=False)
    tf.write("\n\n".join(
        ["\n".join(example.text) for example in dataset.examples]).encode('utf-8'))
    tf.close()

    train_sentences = load_sentences(tf.name, model.parameters['lower'],
                                     model.parameters['zeros'])

    train_inputs = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                   model.parameters['lower'], True)

    preds = []

    for citation in train_inputs:
        inputs = create_input(citation, model.parameters, False)
        y_pred = np.array(f[1](*inputs))[1:-1]

        preds.append([(w, y_pred[i])
                      for i, w in enumerate(citation['str_words'])])

    assert len(preds) == len(dataset.examples)

    results = []

    for P, T in zip(preds, dataset.examples):
        for p, t in zip(P, zip(T.text, T.entity)):
            results.append((p[1], tag_to_id[t[1]]))

    pred, true = zip(*results)

    eval_metrics = {
        'micro_f1': f1_score(true, pred, average='micro'),
        'macro_f1': f1_score(true, pred, average='macro')
    }

    data_file.close()

    assert eval_metrics == pytest.approx(
        {'macro_f1': 0.984, 'micro_f1': 0.993}, abs=0.001)
Example #23
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        #with open('maps.txt','w',encoding='utf8') as f1:
        #f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n')
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

        # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
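            # no dev-based model selection here; checkpoint unconditionally every 7th epoch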
            if i % 7 == 0:
                save_model(sess, model, FLAGS.ckpt_path, logger)
Exemplo n.º 24
def train():
    # load data sets
    # train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    all_train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                         FLAGS.zeros)
    train_sentences, dev_sentences = split_train_dev(all_train_sentences)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            # dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars_train = char_mapping(all_train_sentences,
                                            FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(all_train_sentences,
                                                      FLAGS.lower)
        # _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(all_train_sentences)
        # _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # nlp = StanfordCoreNLP(r'E:\DC\dataset\泰一指尚评测数据\stanford-corenlp-full-2017-06-09')
    # l_sorted_lexcion = load_lexcion(FLAGS.lexcion_file, nlp)
    l_sorted_lexcion = []
    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 l_sorted_lexcion, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               l_sorted_lexcion, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                l_sorted_lexcion, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

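    # Pad to the longest sentence across train/dev/test so every batch has a fixed width.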
    max_len = max(
        [len(sentence[0]) for sentence in train_data + test_data + dev_data])

    train_manager = BatchManager(train_data, FLAGS.batch_size, max_len)
    dev_manager = BatchManager(dev_data, 800, max_len)
    test_manager = BatchManager(test_data, 800, max_len)

    # random.shuffle(train_data)

    # pad_test_data = pad_data(test_data)
    # pad_dev_data = pad_data(dev_data)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id, max_len)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
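            # Re-shuffle and re-pad the training data each epoch, then slice the
            # padded arrays into mini-batches by hand instead of using BatchManager.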
            random.shuffle(train_data)
            pad_train_data = pad_data(train_data, max_len)
            strings, chars, lexcion_teatures, pos_ids, dep_ids, head_ids, targets = pad_train_data
            for j in range(0, len(strings), FLAGS.batch_size):
                batch = [
                    strings[j:j + FLAGS.batch_size],
                    chars[j:j + FLAGS.batch_size],
                    lexcion_teatures[j:j + FLAGS.batch_size],
                    pos_ids[j:j + FLAGS.batch_size],
                    dep_ids[j:j + FLAGS.batch_size],
                    head_ids[j:j + FLAGS.batch_size],
                    targets[j:j + FLAGS.batch_size]
                ]
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "AS loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger, i)
                evaluate(sess, model, "test", test_manager, id_to_tag, logger)
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Exemplo n.º 25
    os.makedirs(models_path)

# Initialize model
start_time = time.time()
model = Model(parameters=parameters, models_path=models_path)
print "Model location: %s" % model.model_path

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['tag_scheme']
use_gaze = parameters['use_gaze']
batch_size = opts.batch_size

# Load sentences
train_sentences = loader.load_sentences(opts.train, zeros, lower)
dev_sentences = loader.load_sentences(opts.dev, zeros, lower)
test_sentences = loader.load_sentences(opts.test, zeros, lower)

# Use selected tagging scheme (IOB / IOESB / IOESB1B2)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of chars
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_chars_train = char_mapping(train_sentences)[0]
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        dico_chars_train.copy(), parameters['pre_emb'],
        list(
            itertools.chain.from_iterable(
                [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None)
Exemplo n.º 26
def train_new():
    train_sent = load_sentences(FLAGS.filepath)

    update_tag_scheme(train_sent, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        _c, char_to_id, id_to_char = char_mapping(train_sent, FLAGS.lower)
        print("random embedding")

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sent)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare the data: split into training and validation sets
    np.random.seed(10)
    train_sent_ = np.array(train_sent)
    shuffle_indices = np.random.permutation(np.arange(len(train_sent)))

    sent_shuffled = train_sent_[shuffle_indices]
    dev_sample_index = -1 * int(FLAGS.dev_percentage * float(len(train_sent)))
    train_sent_new, dev_sent = sent_shuffled[:dev_sample_index], sent_shuffled[
        dev_sample_index:]

    train_data = prepare_dataset(train_sent_new, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sent, char_to_id, tag_to_id, FLAGS.lower)

    print("%i / %i sentences in train." % (len(train_data), len(dev_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = FLAGS.log_file
    logger = get_logger(log_path)
    print_config(config, logger)

    # Allow TensorFlow to grow its GPU memory allocation on demand
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:

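        # Live-plot the running loss (top panel) and per-epoch dev F1 (bottom panel).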
        fig = plt.figure()
        ax = fig.add_subplot(211)
        ax2 = fig.add_subplot(212)
        plt.grid(True)
        plt.ion()

        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)

                if step % 20 == 0:
                    ax.scatter(step, np.mean(loss), c='b', marker='.')
                    plt.pause(0.001)

                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best, f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag,
                                logger)
            ax2.scatter(i + 1, f1, c='b', marker='.')
            plt.pause(0.001)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger, "best")
Exemplo n.º 27
def train():
    # load data sets
    train_sentences = load_sentences(
        FLAGS.train_file, FLAGS.lower,
        FLAGS.zeros)  # dimension:num_sentence*len_sentence*2
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(
        train_sentences,
        FLAGS.tag_schema)  # dimension:num_sentence*len_sentence*2
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # if pre-trained embeddings are used
            # dico_chars_train: one entry per distinct character in the training set
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # augment dico_chars_train with characters from the test set
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:  # 创建map_file文件
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id,
        FLAGS.lower)  # dimension: NumSentence*4*LenSentence
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(
        train_data, FLAGS.batch_size
    )  # batch_data dimension: BatchNum*4*BatchSize*MaxLenSentence
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):  # load the config file if it already exists
        config = load_config(FLAGS.config_file)
    else:  # otherwise build a new config and save it to file
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)  # write the config to the log file

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # grow GPU memory allocation on demand
    steps_per_epoch = train_manager.len_data  # len_data: ceil(NumSentence/BatchSize)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):  # the range bound is the number of epochs
            for batch in train_manager.iter_batch(
                    shuffle=True
            ):  # take one batch from batch_data at a time; shuffle=True randomizes batch order
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                evaluate(sess, model, "test", test_manager, id_to_tag, logger)

    # View the tensorboard graph by running the following code and then going to the terminal and typing:
    # tensorboard --logdir=tensorboard_logs
    merged = tf.summary.merge_all()
    if not os.path.exists('tensorboard_logs/'):
        os.makedirs('tensorboard_logs/')
    my_writer = tf.summary.FileWriter('tensorboard_logs/', sess.graph)
Exemplo n.º 28
def train():
    # load data sets
    datasets = load_sentences(FLAGS.train_file, FLAGS.lower)
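    # Shuffle once, then use a fixed 14000-sentence cut as the train/test split.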
    random.shuffle(datasets)
    train_sentences = datasets[:14000]
    test_sentences = datasets[14000:]

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        char_to_id, _ = elmo_char_mapping(FLAGS.elmo_vocab)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i sentences in train / dev." %
          (len(train_data), len(test_data)))

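    # The ELMo batcher is shared by the train and test managers so both produce
    # inputs in the format the ELMo encoder expects.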
    elmo_batcher = get_batcher()
    train_manager = BatchManager(train_data, FLAGS.batch_size, elmo_batcher)
    test_manager = BatchManager(test_data, FLAGS.batch_size, elmo_batcher)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        elmo_model = load_elmo()
        model = create_model(sess, Model, FLAGS.ckpt_path, elmo_model, config,
                             logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch,
                            np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "test", test_manager, id_to_tag,
                            logger)
            # evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
Exemplo n.º 29
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {"ner": NerProcessor}
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)  # load the BERT model configuration

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:  # the NER max_seq_length must not exceed BERT's 512-position limit
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    label_list = processor.get_labels()  # the labels, e.g. ["O", "B-DIS", "I-DIS", "X", "[CLS]", "[SEP]"]

    tokenizer = tokenization.FullTokenizer(  # initial vocab processing: word-to-id mapping, casing, etc.
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:  # use_tpu defaults to False
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,  # how often to save the model checkpoint (1000)
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,  # 1000
            num_shards=FLAGS.num_tpu_cores,  # 8
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None  # warmup_proportion is the fraction of steps spent warming up: with 100
    # training steps and warmup_proportion=0.1, the first 10 steps warm up at a reduced
    # learning rate (lr = global_step / num_warmup_steps * init_lr); after that the
    # normal (or decayed) rate applies.
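    # e.g. with init_lr = 5e-5 and num_warmup_steps = 10:
    #   step 1 -> 5e-6, step 5 -> 2.5e-5, step 10 -> 5e-5 (full rate from then on)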

    ##################
    train_sentences = load_sentences(
        os.path.join(FLAGS.data_dir, "ner.train"), FLAGS.lower,
        FLAGS.zeros)  # nested list: one list per sentence, each item a character and its tag
    dev_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.dev"),
                                   FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.dev"),
                                    FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences,
                      FLAGS.tag_schema)  # default IOBES: convert IOB tags to IOBES
    update_tag_scheme(dev_sentences,
                      FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # use pre-trained embedding
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(  # ensure test-set characters unseen in training still get pre-trained embeddings
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences
                                                   ])  # flatten the nested list
                ))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        # run mark_mapping
        _c, mark_to_id, id_to_mark = mark_mapping(train_sentences)

        entropy_dict = load_entropy_dict(FLAGS.entropy_dict)

        with open(map_file, "wb") as f:
            pickle.dump([
                char_to_id, id_to_char, tag_to_id, id_to_tag, mark_to_id,
                id_to_mark, entropy_dict
            ], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag, mark_to_id, id_to_mark, entropy_dict = pickle.load(
                f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 mark_to_id, entropy_dict, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               mark_to_id, entropy_dict, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                mark_to_id, entropy_dict, FLAGS.lower)

    ###############

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(
            FLAGS.data_dir, train_data)  # each element is an InputExample object
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        # initialize fine-tuning from the pre-trained BERT checkpoint
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    filed_based_convert_examples_to_features(train_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer,
                                             train_file)

    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    filed_based_convert_examples_to_features(eval_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer,
                                             eval_file)

    token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
    with open(FLAGS.output_dir + '/label2id.pkl', 'rb') as rf:
        label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}
    if os.path.exists(token_path):
        os.remove(token_path)
    predict_examples = processor.get_test_examples(FLAGS.data_dir)

    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    # batch_labels is grouped per sentence, e.g. [[1,2,0,0,1,2],[...]]
    batch_tokens, batch_labels = filed_based_convert_examples_to_features(
        predict_examples,
        label_list,
        FLAGS.max_seq_length,
        tokenizer,
        predict_file,
        mode="test")

    for actual_train_step in list(range(1000, num_train_steps,
                                        2000)) + [num_train_steps]:

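        # Train in increments (1000, 3000, ... up to num_train_steps), running
        # eval and predict at each stopping point before resuming training.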
        if FLAGS.do_train:
            start = time.clock()
            tf.logging.info("start training time: %f", start)
            tf.logging.info("***** Running training *****")
            tf.logging.info("  Num examples = %d", len(train_examples))
            tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
            tf.logging.info("  Num steps = %d", actual_train_step)
            train_input_fn = file_based_input_fn_builder(
                input_file=train_file,
                seq_length=FLAGS.max_seq_length,
                is_training=True,
                drop_remainder=True)
            estimator.train(input_fn=train_input_fn,
                            max_steps=actual_train_step)

            end = time.clock()
            tf.logging.info("end training time: %f", end)
            tf.logging.info("training time: %f", end - start)

        if FLAGS.do_eval:
            start = time.clock()
            tf.logging.info("start evaluation time: %f", start)

            tf.logging.info("***** Running evaluation *****")
            tf.logging.info("  Num examples = %d", len(eval_examples))
            tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
            eval_steps = None
            if FLAGS.use_tpu:
                eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
            eval_drop_remainder = True if FLAGS.use_tpu else False
            eval_input_fn = file_based_input_fn_builder(
                input_file=eval_file,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=eval_drop_remainder)
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps)
            output_eval_file = os.path.join(FLAGS.output_dir,
                                            "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                tf.logging.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

            end = time.clock()
            tf.logging.info("end evaluation time: %f", end)
            tf.logging.info("evaluation time: %f", end - start)

        if FLAGS.do_predict:
            start = time.clock()
            tf.logging.info("start predict time: %f", start)
            tf.logging.info("***** Running prediction *****")
            tf.logging.info("  Num examples = %d", len(predict_examples))
            tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
            if FLAGS.use_tpu:
                # Warning: According to tpu_estimator.py Prediction on TPU is an
                # experimental feature and hence not supported here
                raise ValueError("Prediction in TPU not supported")
            predict_drop_remainder = True if FLAGS.use_tpu else False
            predict_input_fn = file_based_input_fn_builder(
                input_file=predict_file,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=predict_drop_remainder)

            result = estimator.predict(input_fn=predict_input_fn)

            _result = []
            for prediction in result:
                _result += [prediction_id for prediction_id in prediction]

            output_predict_file = os.path.join(
                FLAGS.output_dir + "/label_test/",
                "label_test.txt-" + str(actual_train_step))
            Writer(output_predict_file, _result, batch_tokens, batch_labels,
                   id2label)

            end = time.clock()
            tf.logging.info("end predict time: %f", end)
            tf.logging.info("predict time: %f", end - start)
Exemplo n.º 30
def runModelInLoop(dropout,char_dim,char_lstm_dim,word_dim,word_lstm_dim):
    #results File
    resultsPath = "/Users/Ehsan/Documents/Ehsan_General/HMQ/HMQ_Projects/DNR2/COLING-2016-Code/i2b2-2010/results/"
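    # Grid search: train and score every combination of dropout, char/word
    # embedding size and LSTM size on each dataset.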
    for u_dropout in dropout:
        for v_char_dim in char_dim:
            for w_char_lstm_dim in char_lstm_dim:
                for x_word_dim in word_dim:
                    for y_word_lstm_dim in word_lstm_dim:
                        for dataset in datasets:
                            print "+++++++++++++++"
                            print u_dropout,v_char_dim,w_char_lstm_dim,x_word_dim,y_word_lstm_dim,dataset
                            parameters['dropout'] = u_dropout

                            parameters['char_dim'] = v_char_dim
                            parameters['char_lstm_dim'] =w_char_lstm_dim
                            parameters['word_dim'] = x_word_dim
                            parameters['word_lstm_dim'] = y_word_lstm_dim

                            # Assign the predefined paths for the selected dataset
                            if dataset == "i2b2-2010":
                                opts.train = i2b2BasePath+"train.txt"
                                opts.dev = i2b2BasePath+ "dev.txt"
                                opts.test = i2b2BasePath+ "test.txt"
                                resultsFile = resultsPath +"i2b2_2010_Results.txt"



                            # Initialize model
                            model = Model(parameters=parameters, models_path=models_path)
                            print "Model location: %s" % model.model_path

                            # Data parameters
                            lower = parameters['lower']
                            zeros = parameters['zeros']
                            tag_scheme = parameters['tag_scheme']

                            # Load sentences
                            train_sentences = loader.load_sentences(opts.train, lower, zeros)
                            dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
                            test_sentences = loader.load_sentences(opts.test, lower, zeros)

                            # Use selected tagging scheme (IOB / IOBES)
                            update_tag_scheme(train_sentences, tag_scheme)
                            update_tag_scheme(dev_sentences, tag_scheme)
                            update_tag_scheme(test_sentences, tag_scheme)

                            # Create a dictionary / mapping of words
                            # If we use pretrained embeddings, we add them to the dictionary.
                            if parameters['pre_emb']:
                                dico_words_train = word_mapping(train_sentences, lower)[0]
                                dico_words, word_to_id, id_to_word = augment_with_pretrained(
                                    dico_words_train.copy(),
                                    parameters['pre_emb'],
                                    list(itertools.chain.from_iterable(
                                        [[w[0] for w in s] for s in dev_sentences + test_sentences])
                                    ) if not parameters['all_emb'] else None
                                )
                            else:
                                dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
                                dico_words_train = dico_words

                            # Create a dictionary and a mapping for words / POS tags / tags
                            dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
                            dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

                            print "Calling the prepare_dataset :--"
                            # Index data
                            train_data = prepare_dataset(
                                train_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            dev_data = prepare_dataset(
                                dev_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            test_data = prepare_dataset(
                                test_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )

                            print "%i / %i / %i sentences in train / dev / test." % (
                                len(train_data), len(dev_data), len(test_data))

                            # Save the mappings to disk
                            print 'Saving the mappings to disk...'
                            model.save_mappings(id_to_word, id_to_char, id_to_tag)

                            # Build the model
                            f_train, f_eval = model.build(**parameters)

                            # Reload previous model values
                            if opts.reload:
                                print 'Reloading previous model...'
                                model.reload()


                            # Train network
                            #
                            singletons = set([word_to_id[k] for k, v
                                              in dico_words_train.items() if v == 1])
                            n_epochs = 2  # number of epochs over the training set
                            freq_eval = 1000  # evaluate on dev every freq_eval steps
                            best_dev = -np.inf
                            best_test = -np.inf
                            count = 0
                            for epoch in xrange(n_epochs):
                                epoch_costs = []
                                print "Starting epoch %i..." % epoch
                                for i, index in enumerate(np.random.permutation(len(train_data))):
                                    count += 1
                                    input = create_input(train_data[index], parameters, True, singletons)
                                    new_cost = f_train(*input)
                                    epoch_costs.append(new_cost)
                                    #if i % 50 == 0 and i > 0:
                                    #    print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
                                    if count % freq_eval == 0:
                                        dev_score = evaluate(parameters, f_eval, dev_sentences,
                                                             dev_data, id_to_tag, dico_tags)
                                        test_score = evaluate(parameters, f_eval, test_sentences,
                                                              test_data, id_to_tag, dico_tags)
                                        print "Score on dev: %.5f" % dev_score
                                        print "Score on test: %.5f" % test_score
                                        if dev_score > best_dev:
                                            best_dev = dev_score
                                            print "New best score on dev."+str(best_dev)
                                            # print "Saving model to disk..."
                                            # model.save()
                                        if test_score > best_test:
                                            best_test = test_score
                                            print "New best score on test."+str(best_test)
                                        # print "Config values used are : "


                                print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))
                            # Write the best dev and test scores to the file
                            del model


                            with open(resultsFile, 'a') as f:
                                f.write("dropout: " + str(parameters['dropout']) +
                                        " | char_dim: " + str(parameters['char_dim']) +
                                        " | char_lstm_dim: " + str(parameters['char_lstm_dim']) +
                                        " | word_dim: " + str(parameters['word_dim']) +
                                        " | word_lstm_dim: " + str(parameters['word_lstm_dim']) +
                                        " | Best Dev Score: " + str(best_dev) +
                                        " | Best Test Score: " + str(best_test) + "\n")


    return
Exemplo n.º 31
if not os.path.exists(eval_temp):
    os.makedirs(eval_temp)
if not os.path.exists(models_path):
    os.makedirs(models_path)

# Initialize model
model = Model(parameters=parameters, models_path=models_path)
print "Model location: %s" % model.model_path

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['tag_scheme']

# Load sentences
train_sentences = loader.load_sentences(opts.train, lower, zeros)
dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
test_sentences = loader.load_sentences(opts.test, lower, zeros)

# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(),
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
Exemplo n.º 32
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []

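        # Fixed 100-epoch loop: evaluate on dev after each epoch and checkpoint
        # whenever the dev score improves.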
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                #print batch
                step, batch_loss = model.run_step(sess, True, batch)
                #print step
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)