def train():
    # Load the training data
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    # Load the dev and test sets
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    # Use the selected tagging scheme (IOB / IOBES): I = inside, O = outside, B = begin | E = end, S = single
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    _c, char_to_id, id_to_char = char_mapping(
        train_sentences, FLAGS.lower)  # count each character's frequency and assign it an id
    _t, tag_to_id, id_to_tag = tag_mapping(
        train_sentences, FLAGS.id_to_tag_path,
        FLAGS.tag_to_id_path)  # count each entity tag's frequency and assign it an id
    # Write the mapping dictionaries to a pkl file
    with open(FLAGS.map_file, "wb") as f:
        pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    # Prepare the data: build the lists of indices that are fed into the network for training
    train_data = prepare_dataset(  # train_data[0][0]: the characters of a sentence; [0][1]: the id of each character; [0][2]: segmentation features after word segmentation (0 for a single-character word, otherwise 1, 2, ..., 2, 3); [0][3]: the tag of each character
        train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    train_manager = BatchManager(
        train_data, FLAGS.batch_size)  # split the data into batches of 60 sentences each, giving an iterable object
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    config = config_model(char_to_id, tag_to_id)  # complete the parameter configuration
    # Limit GPU usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, load_word2vec, config, id_to_char)
        saver = tf.train.Saver()  # used to save the model
        with tf.device("/cpu:0"):
            for i in range(100):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(
                        sess, True, batch)  # train the model batch by batch; training starts here, so the whole network can be traced backwards from this call
                # every 5 epochs, run validation and report the model's F1
                if (i + 1) % 5 == 0:
                    f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag)
                    print("F1 on the dev set:", f1)
                # save the model every 20 epochs
                if (i + 1) % 20 == 0:
                    saver.save(sess, save_path=FLAGS.ckpt_path)
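For reference, the IOB-to-IOBES conversion mentioned in the tag-scheme comment above can be sketched as follows. This is a minimal illustration of the scheme itself (a B with no following I becomes S, the last I of a span becomes E), not this project's update_tag_scheme implementation.

def iob_to_iobes(tags):
    # Illustrative only: convert a valid IOB tag sequence to IOBES.
    iobes = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else "O"
        if tag.startswith("B-"):
            # a begin tag not followed by an inside tag is a single-token entity
            iobes.append(tag if nxt.startswith("I-") else "S-" + tag[2:])
        elif tag.startswith("I-"):
            # the last inside tag of a span becomes an end tag
            iobes.append(tag if nxt.startswith("I-") else "E-" + tag[2:])
        else:
            iobes.append(tag)  # "O" is unchanged
    return iobes

# e.g. ["B-LOC", "I-LOC", "I-LOC", "O", "B-PER"] -> ["B-LOC", "I-LOC", "E-LOC", "O", "S-PER"]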
Example #2
File: main.py  Project: aiedward/Cner_v1
def test():
    # Load the config file
    config = load_config(FLAGS.config_file)
    # Set up the logger
    log_path = os.path.join("log", FLAGS.test_log_file)
    logger = get_logger(log_path)
    # Configure the GPU
    tf_config = tf.ConfigProto()
    # Load the dataset
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    # Read the mapping dictionaries
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    # Convert the test data to model inputs
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    # Build the test batches
    test_manager = BatchManager(test_data, 20)
    with tf.Session(config=tf_config) as sess:
        logger.info("start testing...")
        start = time.time()
        # Restore the model from the saved checkpoint
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # Run over the test batches
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
        logger.info("The best_f1 on test_dataset is {}".format(
            model.best_test_f1.eval()))
        logger.info('Time test for 10 batch is {} sec\n'.format(time.time() -
                                                                start))
Example #3
    def post(self):
        """
        Parse multiple string and return the associated entity for each token in each string.
        """
        args = self.parser.parse_args()
        ref_strings = args.get('strings')

        tokens = [[[token] for token in ref_string.split(" ")]
                  for ref_string in ref_strings]
        data = prepare_dataset(tokens, current_app.word_to_id,
                               current_app.char_to_id, {},
                               current_app.model.parameters['lower'], True)

        tagged = []

        for index, datum in enumerate(data):
            model_inputs = create_input(datum, current_app.model.parameters,
                                        False)
            y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1]
            tags = [
                current_app.model.id_to_tag[y_pred[i]]
                for i in range(len(y_pred))
            ]

            tagged.append([
                Entity(term=term, entity=entity)
                for term, entity in zip(ref_strings[index].split(" "), tags)
            ])

        response = ParseBatchResponse(reference_strings=ref_strings,
                                      data=tagged)
        return response
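A possible way to call this batch endpoint from a client. The URL below is a placeholder (the real route depends on how the resource is registered with the Flask app), and the request shape assumes the 'strings' argument is read from the JSON body:

import requests

# Hypothetical URL and port; adjust to wherever this resource is actually mounted.
resp = requests.post("http://localhost:5000/parse/batch",
                     json={"strings": ["Barack Obama visited Paris",
                                       "Apple opened a store in Berlin"]})
print(resp.json())  # expected: one list of (term, entity) pairs per input string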
Example #4
    def post(self):
        event = self.get_argument('event')
        lines = self.new_text_split(event)
        inputs = convert(prepare_dataset(lines, FLAGS.max_seq_len, tag_to_id, train=False))
        result = model.evaluate_lines(sess, inputs, id_to_tag)

        self.write(json.dumps(result, ensure_ascii=False))
Example #5
def predict():
    """
    Run named entity recognition over a dataset.
    :return:
    """
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # limit GPU memory
    # Restore the mapping dictionaries from the map_file produced during training
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower,
                                    FLAGS.zeros)
    test_data = prepare_dataset(test_sentences,
                                char_to_id,
                                tag_to_id,
                                FLAGS.lower,
                                train=False)
    test_manager = BatchManager(test_data, 1)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, config,
                             id_to_char, logger)
        logger.info("predict data......")
        ner_results = model.predict(sess, test_manager, id_to_tag)
        result_write_evaluate(ner_results, FLAGS.result_path, "test")
Example #6
    def post(self):
        """
        Parse a single string and return the associated entity for each token in the string.
        """
        args = self.parser.parse_args()
        ref_string = args.get('string')
        if ref_string is None or ref_string == "":
            # Hackish way as reqparse can't catch empty string
            abort(400, description='string is empty or not provided.')

        tokens = ref_string.split(" ")

        data = prepare_dataset([[[token] for token in tokens]],
                               current_app.word_to_id, current_app.char_to_id,
                               {}, current_app.model.parameters['lower'], True)

        model_inputs = create_input(data[0], current_app.model.parameters,
                                    False)
        y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1]
        tags = [
            current_app.model.id_to_tag[y_pred[i]] for i in range(len(y_pred))
        ]

        response = ParseResponse(reference_string=ref_string,
                                 data=[
                                     Entity(term=term, entity=entity)
                                     for term, entity in zip(tokens, tags)
                                 ])
        return response
Example #7
File: main.py  Project: wshzd/NER
def main(_):

    if FLAGS.train:
        if FLAGS.clean:
            clean(FLAGS)
        train()
    else:
        # Below, the test data is used to evaluate the model
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        log_path = os.path.join("log", FLAGS.log_file)
        config = load_config(FLAGS.config_file)
        logger = get_logger(log_path)
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   log_device_placement=True)
        test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower,
                                        FLAGS.zeros)
        test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                    FLAGS.lower)
        test_manager = BatchManager(test_data, 100)
        with tf.Session(config=tf_config) as sess:
            sess.run(tf.global_variables_initializer())
            model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                 config, id_to_char, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #8
 def get_batch_data(self):
     """
     Build the batch managers for the training and dev sets: the sentences of the training and dev
     sets are first converted, using the mapping dictionaries, into per-sentence feature lists and
     gold-label lists, and batch managers are then created from them to generate batch data.
     :return:
     """
     if not os.path.isfile(FLAGS.train_dev_file):
         train_data = prepare_dataset(self.train_sentences, self.char_to_id, self.tag_to_id, FLAGS.lower)
         dev_data = prepare_dataset(self.dev_sentences, self.char_to_id, self.tag_to_id, FLAGS.lower)
         with open(FLAGS.train_dev_file, "wb") as f:
             pickle.dump([train_data, dev_data], f)
     else:
         with open(FLAGS.train_dev_file, "rb") as f:
             train_data, dev_data = pickle.load(f)
     print("%i / %i  sentences in train / dev ." % (len(train_data), len(dev_data)))
     self.train_batch_manager = BatchManager(train_data, int(FLAGS.batch_size))
     self.dev_batch_manager = BatchManager(dev_data, int(FLAGS.batch_size))
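For readers unfamiliar with the BatchManager used throughout these examples, here is a minimal sketch of what such a class typically does: sort the prepared sentences by length, group them into fixed-size batches, pad every feature list in a batch to the longest sentence, and yield the batches, optionally shuffled. It is an assumption-based illustration, not this project's actual class.

import math
import random


class SimpleBatchManager(object):
    # Illustrative only: groups prepared data into padded batches.
    def __init__(self, data, batch_size):
        data = sorted(data, key=lambda x: len(x[0]))  # sort sentences by length
        num_batch = int(math.ceil(len(data) / float(batch_size)))
        self.batch_data = [self._pad(data[i * batch_size:(i + 1) * batch_size])
                           for i in range(num_batch)]
        self.len_data = len(self.batch_data)

    @staticmethod
    def _pad(batch):
        # pad every feature column to the length of the longest sentence in the batch
        max_len = max(len(sent[0]) for sent in batch)
        columns = [[] for _ in batch[0]]  # one list per feature (chars, ids, segs, tags, ...)
        for sent in batch:
            padding = [0] * (max_len - len(sent[0]))
            for col, feature in enumerate(sent):
                columns[col].append(list(feature) + padding)
        return columns

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch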
Example #9
def main():
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    train_sentences = load_sentences(args.train_file)
    dev_sentences = load_sentences(args.dev_file)
    test_sentences = load_sentences(args.test_file)

    update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    update_tag_scheme(dev_sentences, args.tag_schema)

    with open(args.map_file, 'rb') as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)

    train_manager = BatchManager(train_data, args.batch_size, args.num_steps)
    dev_manager = BatchManager(dev_data, 100, args.num_steps)
    test_manager = BatchManager(test_data, 100, args.num_steps)

    if args.cuda >= 0:
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        device = torch.device(args.cuda)
    else:
        device = torch.device('cpu')
    print("device: ", device)

    if args.train:
        train(id_to_char, id_to_tag, train_manager, dev_manager, device)
    f1, res_info = eval_model(id_to_char, id_to_tag, test_manager, device,
                              args.log_name)
    log_handler.info("\n resinfo {} \v F1: {} ".format(res_info, f1))
Example #10
def load_gramcnn():
    #load parameters
    #print '------params----'
    opts, parameters, model_name = load_object('main_params.pkl')

    #prep for gram-cnn
    #print '------gram-cnn params----'
    lower = parameters['lower']
    zeros = parameters['zeros']
    tag_scheme = parameters['tag_scheme']
    word_to_id, char_to_id, tag_to_id, pt_to_id, dico_words, id_to_tag = reload_mappings(
        os.path.join(models_path, model_name, 'mappings.pkl'))

    if os.path.isfile(opts.test):
        test_sentences = loader.load_sentences(opts.test, lower, zeros)
        update_tag_scheme(test_sentences, tag_scheme)

    if os.path.isfile(opts.test):
        test_data, m3 = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                        tag_to_id, pt_to_id, lower)

    max_seq_len = m3 if m3 > 200 else 200
    word_emb_weight = np.zeros((len(dico_words), parameters['word_dim']))
    n_words = len(dico_words)

    #print '------gramcnn model----'
    print ' [*] Loading GRAMCNN tensorflow model (3min)...'
    gramcnn = GRAMCNN(
        n_words,
        len(char_to_id),
        len(pt_to_id),
        use_word=parameters['use_word'],
        use_char=parameters['use_char'],
        use_pts=parameters['pts'],
        num_classes=len(tag_to_id),
        word_emb=parameters['word_dim'],
        drop_out=0,
        word2vec=word_emb_weight,
        feature_maps=parameters['num_kernels'],  #,200,200, 200,200],
        kernels=parameters['kernels'],
        hidden_size=parameters['word_lstm_dim'],
        hidden_layers=parameters['hidden_layer'],
        padding=parameters['padding'],
        max_seq_len=max_seq_len)

    #print '------gramcnn load----'
    gramcnn.load(models_path, model_name)
    compilation = [
        opts, id_to_tag, word_to_id, char_to_id, tag_to_id, pt_to_id, lower,
        max_seq_len
    ]

    print ' [*] Finished loading.'
    return compilation, parameters, gramcnn
Example #11
def test():
    make_path(FLAGS)
    config = load_config(FLAGS.config_file)
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)
    test_manager = BatchManager(test_data, 100)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #12
def evaluate_sentence(sentence):

    tokens = nltk.word_tokenize(sentence)
    test_sentences = [[[unicode(w), unicode('O')] for w in tokens]]

    if os.path.isfile(opts.test):
        test_data, m3 = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                        tag_to_id, pt_to_id, lower)

    arr_results = evaluate(parameters,
                           gramcnn,
                           test_sentences,
                           test_data,
                           id_to_tag,
                           remove=False,
                           max_seq_len=max_seq_len,
                           padding=parameters['padding'],
                           use_pts=parameters['pts'])

    return process_results(arr_results, sentence)
Example #13
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_buckets, train_stats, train_unique_words = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id,
    global_max_sentence_length, global_max_char_length,
    lower
)
dev_buckets, dev_stats, dev_unique_words = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id,
    global_max_sentence_length, global_max_char_length,
    lower
)
test_buckets, test_stats, test_unique_words = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id,
    global_max_sentence_length, global_max_char_length,
    lower
)

print "%i / %i / %i sentences in train / dev / test." % (
    len(train_stats), len(dev_stats), len(test_stats))
Example #14
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []

        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                #print batch
                step, batch_loss = model.run_step(sess, True, batch)
                #print step
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #15
                                                       binary=False)

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
tag_scheme = parameters['tag_scheme']

if os.path.isfile(opts.test):
    test_sentences = loader.load_sentences(opts.test, lower, zeros)
    update_tag_scheme(test_sentences, tag_scheme)

word_to_id, char_to_id, tag_to_id, pt_to_id, dico_words, id_to_tag = reload_mappings(
    os.path.join(models_path, model_name, 'mappings.pkl'))

if os.path.isfile(opts.test):
    test_data, m3 = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                    tag_to_id, pt_to_id, lower)

print "%i   sentences in test." % (len(test_data))

n_epochs = 100  # number of epochs over the training set
freq_eval = 2000  # evaluate on dev every freq_eval steps
best_dev = -np.inf
best_test = -np.inf
count = 0
max_seq_len = m3 if m3 > 200 else 200

# initialize the embedding matrix
word_emb_weight = np.zeros((len(dico_words), parameters['word_dim']))
n_words = len(dico_words)

gramcnn = GRAMCNN(
Example #16
File: train.py  Project: metpallyv/tagger
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, lower
)
dev_data = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id, lower
)
test_data = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id, lower
)

print "%i / %i / %i sentences in train / dev / test." % (
    len(train_data), len(dev_data), len(test_data))

if parameters['gaz_dim']:
    '''1: read from the gazetteers file with the format: <gazetteer <list of categories>>
       2: once we read the gazetteers, we create a one-hot-encoded gazetteer vector
       for every word in the sentence. The length of the vector equals the number of categories
Example #17
def main():
    # load data sets
    global args
    args = parser.parse_args()
    pp.pprint(vars(args))
    # running_name = 'X'
    use_cuda = cuda_model.ifUseCuda(args.gpu_id, args.multiGpu)
    # use_cuda = False

    # train_file = 'data/example.train'
    # dev_file = 'data/example.dev'
    test_file = 'data/example.test'
    # embedding_file = 'data/vec.txt'
    map_file = 'map.pkl'
    # config_file = 'config_file_pytorch'
    tag_file = 'tag.pkl'
    # embedding_easy_file = 'data/easy_embedding.npy'
    # train_sentences = load_sentences(train_file)
    # dev_sentences = load_sentences(dev_file)
    test_sentences = load_sentences(test_file)
    # train_sentences = dev_sentences
    # update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    # update_tag_scheme(dev_sentences, args.tag_schema)

    if not os.path.isfile(tag_file):
        print("Tag file {:s} Not found".format(tag_file))
        sys.exit(-1)
    else:
        with open(tag_file, 'rb') as t:
            tag_to_id, id_to_tag = pickle.load(t)

    if not os.path.isfile(map_file):
        print("Map file {:s} Not found".format(map_file))
        # create dictionary for word
        # dico_chars_train = char_mapping(train_sentences)[0]
        # dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        #     dico_chars_train.copy(),
        #     embedding_file,
        #     list(itertools.chain.from_iterable(
        #         [[w[0] for w in s] for s in test_sentences])
        #     )
        # )
        # # _, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        #
        # with open(map_file, "wb") as f:
        #     pickle.dump([char_to_id, id_to_char], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char = pickle.load(f)

    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)

    print("{:d} sentences in  test.".format(len(test_data)))

    test_manager = BatchManager(test_data, 1)

    save_places = dir_utils.save_places(args.eval)

    # log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(
        os.path.join(save_places.log_save_dir,
                     'evaluation-{:d}.txt'.format(args.fileid)))
    config = config_model(char_to_id, tag_to_id, args)
    print_config(config, logger)

    logger.info("start training")

    #Update: create model and embedding!
    model = NERModel.CNERPointer(char_dim=args.char_dim,
                                 seg_dim=args.seg_dim,
                                 hidden_dim=args.hidden_dim,
                                 max_length=15,
                                 output_classes=4,
                                 dropout=args.dropout,
                                 embedding_path=None,
                                 id_to_word=id_to_char,
                                 easy_load=None)
    print("Number of Params\t{:d}".format(
        sum([p.data.nelement() for p in model.parameters()])))

    #Update: this won't work!
    # model = cuda_model.convertModel2Cuda(model, gpu_id=args.gpu_id, multiGpu=args.multiGpu)
    if use_cuda:
        model = model.cuda()

    model.eval()
    if args.eval is not None:
        # if os.path.isfile(args.resume):
        ckpt_filename = os.path.join(
            save_places.model_save_dir,
            'checkpoint_{:04d}.pth.tar'.format(args.fileid))
        assert os.path.isfile(
            ckpt_filename), 'Error: no checkpoint directory found!'

        checkpoint = torch.load(ckpt_filename,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        train_iou = checkpoint['IoU']
        print("=> loading checkpoint '{}', current iou: {:.04f}".format(
            ckpt_filename, train_iou))

    ner_results = evaluate(model, test_manager, id_to_tag, use_cuda, max_len=5)
    eval_lines = test_ner(ner_results, save_places.summary_save_dir)
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])
    return f1
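The final F1 extraction above assumes conlleval-style output from test_ner, where the second line ends with the overall FB1 score. A small illustration of that assumption (the sample lines are made up, not taken from this project):

eval_lines = [
    "processed 4365 tokens with 217 phrases; found: 213 phrases; correct: 189.",
    "accuracy:  97.51%; precision:  88.73%; recall:  87.10%; FB1:  87.91",
]
f1 = float(eval_lines[1].strip().split()[-1])  # -> 87.91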
Example #18
def train():
    # load data sets
    datasets = load_sentences(FLAGS.train_file, FLAGS.lower)
    random.shuffle(datasets)
    train_sentences = datasets[:14000]
    test_sentences = datasets[14000:]

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        char_to_id, _ = elmo_char_mapping(FLAGS.elmo_vocab)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i sentences in train / dev." %
          (len(train_data), len(test_data)))

    elmo_batcher = get_batcher()
    train_manager = BatchManager(train_data, FLAGS.batch_size, elmo_batcher)
    test_manager = BatchManager(test_data, FLAGS.batch_size, elmo_batcher)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        elmo_model = load_elmo()
        model = create_model(sess, Model, FLAGS.ckpt_path, elmo_model, config,
                             logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch,
                            np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "test", test_manager, id_to_tag,
                            logger)
            # evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
Example #19
    for k, v in x.items()
} for x in [model.id_to_word, model.id_to_char, model.id_to_tag]]

logging.info("Reading test data from %s..." % opts.input)

lower = parameters['lower']
zeros = parameters['zeros']

test_sentences, len_mention = load_ner2line_sentences(opts.input, lower, zeros)
raw_sentences, _ = load_ner2line_sentences(opts.input,
                                           lower=False,
                                           zeros=False)

test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id,
                            parameters['mode'], lower,
                            parameters['overlap_rate'],
                            parameters['negative_ratio'],
                            parameters['max_len'])

logging.info("%d sentences find in test dataset" % len(test_data))
logging.info("%d mentions find in test dataset" % len_mention)

t_time = time.time()
logging.info("Tagging...")

_, _, fb, _, _, preds, _ = model.eval(test_data)

f_output = codecs.open(opts.output, 'w', 'utf-8')

logging.info("Time used for tagging:%s" % time_used(t_time))
Example #20
def get_ne(sentence):
    test_sentences = loader.load_test_sentence(sentence, lower, zeros)
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, lower)
    return evaluate(parameters, f_eval, test_sentences, test_data, id_to_tag,
                    dico_tags)
Example #21
def train():
    # load data sets
    train_sentences = load_sentences(
        FLAGS.train_file, FLAGS.lower,
        FLAGS.zeros)  # dimension:num_sentence*len_sentence*2
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(
        train_sentences,
        FLAGS.tag_schema)  # dimension:num_sentence*len_sentence*2
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # if pre-trained character embeddings are used
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[
                0]  # dico_chars_train dimension: (number of distinct characters in the training set) * 2
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(  # augment dico_chars_train with characters from the test set
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:  # 创建map_file文件
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id,
        FLAGS.lower)  # dimension: NumSentence*4*LenSentence
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(
        train_data, FLAGS.batch_size
    )  # batch_data dimension: BatchNum*4*BatchSize*MaxLenSentence
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):  # load the config_file if it already exists
        config = load_config(FLAGS.config_file)
    else:  # otherwise build a new config and save it to file
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)  # print the config to the log file

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # allocate GPU memory on demand
    steps_per_epoch = train_manager.len_data  # len_data: ceil(NumSentence/BatchSize)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):  # FLAGS.max_epoch is the number of epochs
            for batch in train_manager.iter_batch(
                    shuffle=True
            ):  # take one batch from batch_data at a time; shuffle=True shuffles the batch order
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                evaluate(sess, model, "test", test_manager, id_to_tag, logger)

    # View the tensorboard graph by running the following code and then going to the terminal and typing:
    # tensorboard --logdir = tensorboard_logs
    merged = tf.summary.merge_all()
    if not os.path.exists('tensorboard_logs/'):
        os.makedirs('tensorboard_logs/')
    my_writer = tf.summary.FileWriter('tensorboard_logs/', sess.graph)
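The dimension comments in the example above describe what load_sentences returns: one list per sentence, each element being a [character, tag] pair. A minimal sketch of such a loader for a CoNLL-style file (one character and tag per line, blank lines between sentences); this is an illustration, not the project's exact code:

import codecs


def load_sentences_sketch(path):
    # e.g. a file containing "中 B-LOC\n国 E-LOC\n\n" yields [[["中", "B-LOC"], ["国", "E-LOC"]]]
    sentences, sentence = [], []
    for line in codecs.open(path, "r", "utf-8"):
        line = line.rstrip()
        if not line:
            if sentence:
                sentences.append(sentence)
                sentence = []
        else:
            parts = line.split()
            sentence.append([parts[0], parts[-1]])
    if sentence:
        sentences.append(sentence)
    return sentences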
Example #22
                                           for s in test_sentences +
                                           dev_sentences + test_sentences]))
        if not parameters['all_emb'] else None)
else:
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_chars_train = dico_chars

dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
dico_tags_train = dico_tags

list_prefix = read_list(opts.dictionary)
# Index data
label_cnt = len(dico_tags)
train_data = prepare_dataset(train_sentences,
                             char_to_id,
                             tag_to_id,
                             lower,
                             list_prefix=list_prefix,
                             label_cnt=label_cnt)
dev_data = prepare_dataset(dev_sentences,
                           char_to_id,
                           tag_to_id,
                           lower,
                           list_prefix=list_prefix,
                           label_cnt=label_cnt)
test_data = prepare_dataset(test_sentences,
                            char_to_id,
                            tag_to_id,
                            lower,
                            list_prefix=list_prefix,
                            label_cnt=label_cnt)
Example #23
# Save the mappings to disk
print 'Saving the mappings to disk...'
# how does this work? should there be a mapping on disk that is loaded every time?
model.save_mappings(id_to_word, id_to_char, id_to_tag)

# Build the model
f_train, f_eval = model.build(**parameters)

# Reload previous model values
if opts.reload:
    print 'Reloading previous model...'
    model.reload()


def get_ne(sentence):
    test_sentences = loader.load_test_sentence(sentence, lower, zeros)
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, lower)
    return evaluate(parameters, f_eval, test_sentences, test_data, id_to_tag,
                    dico_tags)


if __name__ == '__main__':
    while (True):
        sentence = raw_input("input >>> ")
        test_sentences = loader.load_test_sentence(sentence, lower, zeros)
        test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                    lower)
        print evaluate(parameters, f_eval, test_sentences, test_data,
                       id_to_tag, dico_tags)
Example #24
def train_new():
    train_sent = load_sentences(FLAGS.filepath)

    update_tag_scheme(train_sent, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        _c, char_to_id, id_to_char = char_mapping(train_sent, FLAGS.lower)
        print("random embedding")

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sent)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare the data and split it into training and dev sets
    np.random.seed(10)
    train_sent_ = np.array(train_sent)
    shuffle_indices = np.random.permutation(np.arange(len(train_sent)))

    sent_shuffled = train_sent_[shuffle_indices]
    dev_sample_index = -1 * int(FLAGS.dev_percentage * float(len(train_sent)))
    train_sent_new, dev_sent = sent_shuffled[:dev_sample_index], sent_shuffled[
        dev_sample_index:]

    train_data = prepare_dataset(train_sent_new, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sent, char_to_id, tag_to_id, FLAGS.lower)

    print("%i / %i sentences in train." % (len(train_data), len(dev_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = FLAGS.log_file
    logger = get_logger(log_path)
    print_config(config, logger)

    # Allocate GPU resources dynamically, as needed
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:

        fig = plt.figure()
        ax = fig.add_subplot(211)
        ax2 = fig.add_subplot(212)
        plt.grid(True)
        plt.ion()

        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)

                if step % 20 == 0:
                    ax.scatter(step, np.mean(loss), c='b', marker='.')
                    plt.pause(0.001)

                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best, f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag,
                                logger)
            ax2.scatter(i + 1, f1, c='b', marker='.')
            plt.pause(0.001)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger, "best")
Example #25
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, lower
)
dev_data = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id, lower
)
test_data = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id, lower
)

print "%i / %i / %i sentences in train / dev / test." % (
    len(train_data), len(dev_data), len(test_data))

# Save the mappings to disk
print 'Saving the mappings to disk...'
model.save_mappings(id_to_word, id_to_char, id_to_tag)
Example #26
    )
else:
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_chars_train = dico_chars

dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
# dico_pos, pos_to_id, id_to_pos = pos_mapping(train_sentences)

list_prefix = read_list(opts.dictionary)


# Index data
label_cnt = len(dico_tags)

train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, use_gaze, True, list_prefix=list_prefix, label_cnt=label_cnt, lower=lower, pos=pos
)  # False: set the entire one-hot data['gaze'] vector to 0; True: leave it as is
#print "train_data[0]['gaze']:", train_data[0]['gaze']
dev_data = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id, use_gaze, True, list_prefix=list_prefix, label_cnt=label_cnt, lower=lower, pos=pos
)
# for data in dev_data:
#     data['pos_one_hot']
test_data = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id, use_gaze, True, list_prefix=list_prefix, label_cnt=label_cnt, lower=lower, pos=pos
)

print "%i / %i / %i sentences in train / dev / test." % (
    len(train_data), len(dev_data), len(test_data))

# Save the mappings to disk
Example #27
    sentences = []
    for line in codecs.open(path, 'r', 'utf8'):
        sentence = []
        line = line.rstrip()
        if line:
            word = line.split()
            for elem in word:
                sentence.append([elem])
            sentences.append(sentence)
    return sentences


test_sentences = load_sentences(opts.input)
test_data = prepare_dataset(test_sentences,
                            None,
                            parameters,
                            parameters['lower'],
                            isTest=True)
f_output = codecs.open(opts.output, 'w', 'utf-8')
start = time.time()


def xmlformat(sentence, tags):
    #{{{
    assert len(sentence) == len(tags)
    res = []
    preTag = "drug"
    for i in range(len(tags)):
        if tags[i][0] == 'B':
            if len(preTag):
                res.append("</" + preTag + ">")
Example #28
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {"ner": NerProcessor}
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)  # load the BERT model's parameter configuration

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:  # the NER max_seq_length must not exceed BERT's maximum length of 512
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    label_list = processor.get_labels(
    )  # get the label list ["O", "B-DIS", "I-DIS", "X", "[CLS]", "[SEP]"]

    tokenizer = tokenization.FullTokenizer(  # initial processing of the vocab: word-to-id mapping, lower-casing, etc.
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:  # use_tpu defaults to False
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.
        save_checkpoints_steps,  # how often to save the model checkpoint. 1000
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,  # 1000
            num_shards=FLAGS.num_tpu_cores,  # 8
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None  # proportion of steps used for warm-up: e.g. with 100 training steps and warmup_proportion=0.1,
    # the first 10 steps are warm-up, run at a reduced learning rate (lr = global_step / num_warmup_steps * init_lr);
    # after that, training continues at the normal (or decayed) learning rate.

    ##################
    train_sentences = load_sentences(
        os.path.join(FLAGS.data_dir, "ner.train"), FLAGS.lower,
        FLAGS.zeros)  # load the training data as a nested list: the outer list holds sentences, the inner list holds each character of a sentence with its tag
    dev_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.dev"),
                                   FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.dev"),
                                    FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences,
                      FLAGS.tag_schema)  # IOBES by default: update the tag scheme, converting IOB to IOBES
    update_tag_scheme(dev_sentences,
                      FLAGS.tag_schema)  # IOBES by default: update the tag scheme, converting IOB to IOBES
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # use pre-trained embedding
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(  # ensure that test-set characters unseen in training can at least use a pre-trained word embedding
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences
                                                   ])  # flatten the nested lists
                ))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        # run mark_mapping
        _c, mark_to_id, id_to_mark = mark_mapping(train_sentences)

        entropy_dict = load_entropy_dict(FLAGS.entropy_dict)

        with open(map_file, "wb") as f:
            pickle.dump([
                char_to_id, id_to_char, tag_to_id, id_to_tag, mark_to_id,
                id_to_mark, entropy_dict
            ], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag, mark_to_id, id_to_mark, entropy_dict = pickle.load(
                f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 mark_to_id, entropy_dict, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               mark_to_id, entropy_dict, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                mark_to_id, entropy_dict, FLAGS.lower)

    ###############

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(
            FLAGS.data_dir, train_data)  # each returned element is an InputExample object
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.
        init_checkpoint,  # load the pre-trained BERT parameters into the model as the initialization for fine-tuning
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    filed_based_convert_examples_to_features(train_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer,
                                             train_file)

    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    filed_based_convert_examples_to_features(eval_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer,
                                             eval_file)

    token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
    with open(FLAGS.output_dir + '/label2id.pkl', 'rb') as rf:
        label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}
    if os.path.exists(token_path):
        os.remove(token_path)
    predict_examples = processor.get_test_examples(FLAGS.data_dir)

    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    # batch_labels is organized per sentence, e.g. [[1, 2, 0, 0, 1, 2], [...]]
    batch_tokens, batch_labels = filed_based_convert_examples_to_features(
        predict_examples,
        label_list,
        FLAGS.max_seq_length,
        tokenizer,
        predict_file,
        mode="test")

    for actual_train_step in list(range(1000, num_train_steps,
                                        2000)) + [num_train_steps]:

        if FLAGS.do_train:
            start = time.clock()
            tf.logging.info("start training time: %f", start)
            tf.logging.info("***** Running training *****")
            tf.logging.info("  Num examples = %d", len(train_examples))
            tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
            tf.logging.info("  Num steps = %d", actual_train_step)
            train_input_fn = file_based_input_fn_builder(
                input_file=train_file,
                seq_length=FLAGS.max_seq_length,
                is_training=True,
                drop_remainder=True)
            estimator.train(input_fn=train_input_fn,
                            max_steps=actual_train_step)

            end = time.clock()
            tf.logging.info("end training time: %f", end)
            tf.logging.info("training time: %f", end - start)

        if FLAGS.do_eval:
            start = time.clock()
            tf.logging.info("start evaluation time: %f", start)

            tf.logging.info("***** Running evaluation *****")
            tf.logging.info("  Num examples = %d", len(eval_examples))
            tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
            eval_steps = None
            if FLAGS.use_tpu:
                eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
            eval_drop_remainder = True if FLAGS.use_tpu else False
            eval_input_fn = file_based_input_fn_builder(
                input_file=eval_file,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=eval_drop_remainder)
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps)
            output_eval_file = os.path.join(FLAGS.output_dir,
                                            "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                tf.logging.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

            end = time.clock()
            tf.logging.info("end evaluation time: %f", end)
            tf.logging.info("evaluation time: %f", end - start)

        if FLAGS.do_predict:
            start = time.clock()
            tf.logging.info("start predict time: %f", start)
            tf.logging.info("***** Running prediction *****")
            tf.logging.info("  Num examples = %d", len(predict_examples))
            tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
            if FLAGS.use_tpu:
                # Warning: According to tpu_estimator.py Prediction on TPU is an
                # experimental feature and hence not supported here
                raise ValueError("Prediction in TPU not supported")
            predict_drop_remainder = True if FLAGS.use_tpu else False
            predict_input_fn = file_based_input_fn_builder(
                input_file=predict_file,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=predict_drop_remainder)

            result = estimator.predict(input_fn=predict_input_fn)

            _result = []
            for prediction in result:
                _result += [prediction_id for prediction_id in prediction]

            output_predict_file = os.path.join(
                FLAGS.output_dir + "/label_test/",
                "label_test.txt-" + str(actual_train_step))
            Writer(output_predict_file, _result, batch_tokens, batch_labels,
                   id2label)

            end = time.clock()
            tf.logging.info("end predict time: %f", end)
            tf.logging.info("predict time: %f", end - start)
Example #29
def train():
    # Load the datasets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Choose the tag scheme (IOB / IOBES): I = inside, O = outside, B = begin | E = end, S = single
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):  # the map file stores char_to_id, id_to_char, tag_to_id, id_to_tag
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences, FLAGS.id_to_tag_path, FLAGS.tag_to_id_path)
        # with open('maps.txt','w',encoding='utf8') as f1:
        # f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n')
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: build lists of indices that can be fed to the network
    # train_data[0][0]: the characters of one sentence;
    # train_data[0][1]: the id of each character;
    # train_data[0][2]: word-segmentation features: a single-character word gets 0,
    #                   a word of two or more characters gets 1,2,...,2,3 (begin/middle/end);
    # train_data[0][3]: the tag of each character
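    # For illustration only (the ids below are hypothetical), one entry could look like:
    # train_data[0] = [
    #     ["我", "住", "北", "京"],   # the raw characters of the sentence
    #     [34, 56, 102, 87],          # char ids looked up in char_to_id
    #     [0, 0, 1, 3],               # segmentation features: "北京" is a two-character word
    #     [0, 0, 2, 5],               # tag ids looked up in tag_to_id (e.g. O, O, B-LOC, E-LOC)
    # ]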
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)  # split the data into batches of batch_size
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        # tf.device("/cpu:0") pins these ops to the CPU; remove this context manager to let TensorFlow place ops on the default GPU (GPU:0)
        with tf.device("/cpu:0"):
            for i in range(100):
                # Train the model batch by batch; this is where training starts, so you can trace the whole network backwards from here
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    # Log progress every FLAGS.steps_check steps:
                    # iteration = how many epochs have elapsed; also report the step
                    # within the current epoch and the mean NER loss since the last check.
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                        loss = []

                # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                if i % 7 == 0:
                    save_model(sess, model, FLAGS.ckpt_path, logger)
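The commented-out evaluate call above hints at the more common pattern of checkpointing only when the dev F1 improves. The following is a hedged sketch of that loop, not part of the example itself; it reuses the example's evaluate and save_model helpers and assumes evaluate returns True when a new best dev score is reached, as the commented-out "best = evaluate(...)" / "if best:" lines in the example further down suggest.

def train_with_best_checkpointing(sess, model, train_manager, dev_manager,
                                  id_to_tag, logger, ckpt_path, n_epochs=100):
    # Hedged sketch: save a checkpoint only when evaluate() reports a new best
    # dev F1 (assumed to return a boolean).
    for epoch in range(n_epochs):
        for batch in train_manager.iter_batch(shuffle=True):
            model.run_step(sess, True, batch)
        is_new_best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
        if is_new_best:
            save_model(sess, model, ckpt_path, logger)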
Example #30
def runModelInLoop(dropout,char_dim,char_lstm_dim,word_dim,word_lstm_dim):
    #results File
    resultsPath = "/Users/Ehsan/Documents/Ehsan_General/HMQ/HMQ_Projects/DNR2/COLING-2016-Code/i2b2-2010/results/"
    for u_dropout in dropout:
        for v_char_dim in char_dim:
            for w_char_lstm_dim in char_lstm_dim:
                for x_word_dim in word_dim:
                    for y_word_lstm_dim in word_lstm_dim:
                        for dataset in datasets:
                            print "+++++++++++++++"
                            print u_dropout,v_char_dim,w_char_lstm_dim,x_word_dim,y_word_lstm_dim,dataset
                            parameters['dropout'] = u_dropout

                            parameters['char_dim'] = v_char_dim
                            parameters['char_lstm_dim'] = w_char_lstm_dim
                            parameters['word_dim'] = x_word_dim
                            parameters['word_lstm_dim'] = y_word_lstm_dim

                            # If the dataset is i2b2-2010, point opts at its predefined train/dev/test paths

                            if dataset == "i2b2-2010":
                                opts.train = i2b2BasePath+"train.txt"
                                opts.dev = i2b2BasePath+ "dev.txt"
                                opts.test = i2b2BasePath+ "test.txt"
                                resultsFile = resultsPath +"i2b2_2010_Results.txt"



                            # Initialize model
                            model = Model(parameters=parameters, models_path=models_path)
                            print "Model location: %s" % model.model_path

                            # Data parameters
                            lower = parameters['lower']
                            zeros = parameters['zeros']
                            tag_scheme = parameters['tag_scheme']

                            # Load sentences
                            train_sentences = loader.load_sentences(opts.train, lower, zeros)
                            dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
                            test_sentences = loader.load_sentences(opts.test, lower, zeros)

                            # Use selected tagging scheme (IOB / IOBES)
                            update_tag_scheme(train_sentences, tag_scheme)
                            update_tag_scheme(dev_sentences, tag_scheme)
                            update_tag_scheme(test_sentences, tag_scheme)

                            # Create a dictionary / mapping of words
                            # If we use pretrained embeddings, we add them to the dictionary.
                            if parameters['pre_emb']:
                                dico_words_train = word_mapping(train_sentences, lower)[0]
                                dico_words, word_to_id, id_to_word = augment_with_pretrained(
                                    dico_words_train.copy(),
                                    parameters['pre_emb'],
                                    list(itertools.chain.from_iterable(
                                        [[w[0] for w in s] for s in dev_sentences + test_sentences])
                                    ) if not parameters['all_emb'] else None
                                )
                            else:
                                dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
                                dico_words_train = dico_words

                            # Create a dictionary and a mapping for words / POS tags / tags
                            dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
                            dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

                            print "Calling the prepare_dataset :--"
                            # Index data
                            train_data = prepare_dataset(
                                train_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            dev_data = prepare_dataset(
                                dev_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            test_data = prepare_dataset(
                                test_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )

                            print "%i / %i / %i sentences in train / dev / test." % (
                                len(train_data), len(dev_data), len(test_data))

                            # Save the mappings to disk
                            print 'Saving the mappings to disk...'
                            model.save_mappings(id_to_word, id_to_char, id_to_tag)

                            # Build the model
                            f_train, f_eval = model.build(**parameters)

                            # Reload previous model values
                            if opts.reload:
                                print 'Reloading previous model...'
                                model.reload()


                            # Train the network
                            singletons = set([word_to_id[k] for k, v
                                              in dico_words_train.items() if v == 1])
                            n_epochs = 2  # number of epochs over the training set
                            freq_eval = 1000  # evaluate on dev every freq_eval steps
                            best_dev = -np.inf
                            best_test = -np.inf
                            count = 0
                            for epoch in xrange(n_epochs):
                                epoch_costs = []
                                print "Starting epoch %i..." % epoch
                                for i, index in enumerate(np.random.permutation(len(train_data))):
                                    count += 1
                                    input = create_input(train_data[index], parameters, True, singletons)
                                    new_cost = f_train(*input)
                                    epoch_costs.append(new_cost)
                                    # if i % 50 == 0 and i > 0:
                                    #     print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
                                    if count % freq_eval == 0:
                                        dev_score = evaluate(parameters, f_eval, dev_sentences,
                                                             dev_data, id_to_tag, dico_tags)
                                        test_score = evaluate(parameters, f_eval, test_sentences,
                                                              test_data, id_to_tag, dico_tags)
                                        print "Score on dev: %.5f" % dev_score
                                        print "Score on test: %.5f" % test_score
                                        if dev_score > best_dev:
                                            best_dev = dev_score
                                            print "New best score on dev."+str(best_dev)
                                            # print "Saving model to disk..."
                                            # model.save()
                                        if test_score > best_test:
                                            best_test = test_score
                                            print "New best score on test."+str(best_test)
                                        # print "Config values used are : "


                                print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))
                            # Write the best dev and test scores to the file
                            del model


                            with open(resultsFile, 'a') as f:
                                f.write("dropout: " + str(parameters['dropout']) +
                                        " | char_dim: " + str(parameters['char_dim']) +
                                        " | char_lstm_dim: " + str(parameters['char_lstm_dim']) +
                                        " | word_dim: " + str(parameters['word_dim']) +
                                        " | word_lstm_dim: " + str(parameters['word_lstm_dim']) +
                                        " | Best Dev Score: " + str(best_dev) +
                                        " | Best Test Score: " + str(best_test) + "\n")


    return
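The five nested loops in runModelInLoop are simply a hyperparameter grid search. As a sketch (not the author's code), the same sweep can be flattened with itertools.product, which keeps the per-configuration training body at one indentation level:

import itertools

def iter_param_grid(dropout, char_dim, char_lstm_dim, word_dim, word_lstm_dim):
    # Sketch: yield one parameter dict per grid point instead of nesting five loops.
    grid = {
        "dropout": dropout,
        "char_dim": char_dim,
        "char_lstm_dim": char_lstm_dim,
        "word_dim": word_dim,
        "word_lstm_dim": word_lstm_dim,
    }
    for values in itertools.product(*grid.values()):
        yield dict(zip(grid.keys(), values))

# Usage: for params in iter_param_grid([0.5], [25], [25], [100], [100]):
#            parameters.update(params)  # then build, train and evaluate one model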
Example #31
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), 0, len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            #best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            #if best:
            save_model(sess, model, FLAGS.ckpt_path, logger)
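BatchManager itself is not shown in these examples. As a rough, self-contained sketch (the real class may order or pad batches differently), such a helper typically sorts sentences by length, splits them into fixed-size batches, and pads every field in a batch to the longest sentence; iter_batch then yields one padded batch at a time, optionally shuffled.

import math
import random

class MinimalBatchManager(object):
    """Hedged sketch of a BatchManager-style helper; details may differ from the real class."""

    def __init__(self, data, batch_size):
        # data: one list per sentence, e.g. [chars, char_ids, seg_ids, tag_ids]
        data = sorted(data, key=lambda sent: len(sent[0]))
        num_batch = int(math.ceil(len(data) / float(batch_size)))
        self.batch_data = [self._pad(data[i * batch_size:(i + 1) * batch_size])
                           for i in range(num_batch)]
        self.len_data = len(self.batch_data)

    @staticmethod
    def _pad(batch):
        # Pad every field of every sentence to the longest sentence in the batch.
        max_len = max(len(sent[0]) for sent in batch)
        padded = [[] for _ in batch[0]]
        for sent in batch:
            pad = [0] * (max_len - len(sent[0]))
            for field_idx, field in enumerate(sent):
                padded[field_idx].append(field + pad)
        return padded

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch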
Example #32
                           morpho_tag_column_index=parameters['mt_ci'],
                           joint_learning=True)
else:
    id_to_morpho_tag = {}
    morpho_tag_to_id = {}

if opts.overwrite_mappings:
    print 'Saving the mappings to disk...'
    model.save_mappings(id_to_word, id_to_char, id_to_tag, id_to_morpho_tag)

model.reload_mappings()


# Index data
train_buckets, train_stats, train_unique_words, train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, morpho_tag_to_id,
    lower, parameters['mt_d'], parameters['mt_t'], parameters['mt_ci'],
)
dev_buckets, dev_stats, dev_unique_words, dev_data = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id, morpho_tag_to_id,
    lower, parameters['mt_d'], parameters['mt_t'], parameters['mt_ci'],
)
test_buckets, test_stats, test_unique_words, test_data = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id, morpho_tag_to_id,
    lower, parameters['mt_d'], parameters['mt_t'], parameters['mt_ci'],
)

if parameters['test_with_yuret'] or parameters['train_with_yuret']:
    # yuret train and test datasets
    yuret_train_buckets, yuret_train_stats, yuret_train_unique_words, yuret_train_data = prepare_dataset(
        yuret_train_sentences, word_to_id, char_to_id, tag_to_id, morpho_tag_to_id,
        lower, parameters['mt_d'], parameters['mt_t'], parameters['mt_ci'],