Example #1
def load(train_file='example.train',
         dev_file='example.dev',
         test_file='example.test',
         lower=True,
         zeros=True,
         tag_schema='iobes',
         map_file='map.pkl',
         pre_emb=True,
         emb_file='wiki_100.utf8'):

    train_file = get_data_path(train_file)
    dev_file = get_data_path(dev_file)
    test_file = get_data_path(test_file)
    map_file = get_data_path(map_file)
    emb_file = get_data_path(emb_file)

    train_sentences = load_sentences(train_file, lower, zeros)
    dev_sentences = load_sentences(dev_file, lower, zeros)
    test_sentences = load_sentences(test_file, lower, zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, tag_schema)
    update_tag_scheme(dev_sentences, tag_schema)
    update_tag_scheme(test_sentences, tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(map_file):
        # create a character dictionary
        if pre_emb:
            dico_chars_train = char_mapping(train_sentences, lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: build index lists for each sentence
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    return train_data, dev_data, test_data, char_to_id, tag_to_id, id_to_char, id_to_tag
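
The load() example above relies on update_tag_scheme to move the corpora from IOB to IOBES tagging. As a rough illustration of what that conversion involves (a hypothetical stand-alone helper, not the repositories' own update_tag_scheme), an IOB2-to-IOBES converter for one sentence's tag list might look like this:

def iob_to_iobes(tags):
    """Convert one sentence's IOB2 tags to IOBES (illustrative helper)."""
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.startswith('B-'):
            # B- stays B- only if the entity continues; otherwise it is a
            # single-token entity and becomes S-
            if i + 1 < len(tags) and tags[i + 1] == 'I-' + tag[2:]:
                new_tags.append(tag)
            else:
                new_tags.append('S-' + tag[2:])
        elif tag.startswith('I-'):
            # I- stays I- only if the entity continues; otherwise it is the
            # last token of the entity and becomes E-
            if i + 1 < len(tags) and tags[i + 1] == 'I-' + tag[2:]:
                new_tags.append(tag)
            else:
                new_tags.append('E-' + tag[2:])
        else:
            raise ValueError('invalid IOB tag: %r' % tag)
    return new_tags
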
def evaluate_test():
    config = load_config(args.config_file)
    logger = get_logger(args.log_file)

    with open(args.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag, intent_to_id, id_to_intent = pickle.load(
            f)

    test_sentences = load_sentences(args.test_file, args.lower, args.zeros)
    update_tag_scheme(test_sentences, args.tag_schema)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                intent_to_id, args.lower)
    test_manager = BatchManager(test_data, 100)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, args.ckpt_path, load_word2vec,
                             config, id_to_char, logger)

        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
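
evaluate_test() above (and the train() example further below) reads its settings from a module-level args object that is not part of this listing. One plausible way to build it, with flag names taken from the attributes referenced in the code and purely guessed defaults, is an argparse block along these lines:

import argparse

parser = argparse.ArgumentParser(description='NER / intent model (flag names assumed)')
parser.add_argument('--train_file', default='data/example.train')
parser.add_argument('--dev_file', default='data/example.dev')
parser.add_argument('--test_file', default='data/example.test')
parser.add_argument('--map_file', default='maps.pkl')
parser.add_argument('--config_file', default='config.json')
parser.add_argument('--log_file', default='train.log')
parser.add_argument('--ckpt_path', default='ckpt')
parser.add_argument('--emb_file', default='wiki_100.utf8')
parser.add_argument('--tag_schema', default='iobes', choices=['iob', 'iobes'])
parser.add_argument('--batch_size', type=int, default=20)
parser.add_argument('--steps_check', type=int, default=100)
# booleans parsed from strings to avoid the argparse type=bool pitfall
parser.add_argument('--lower', type=lambda s: s.lower() == 'true', default=True)
parser.add_argument('--zeros', type=lambda s: s.lower() == 'true', default=True)
parser.add_argument('--pre_emb', type=lambda s: s.lower() == 'true', default=True)
args = parser.parse_args()
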
Example #3
def predict_sentences_given_model(sentences_string, model):
    """

    :type sentences_string: string
    :type model: MainTaggerModel
    :param model:
        Mappings must be loaded.
    """

    from utils import tokenize_sentences_string
    from utils.loader import load_sentences, prepare_dataset

    tokenized_sentences = tokenize_sentences_string(sentences_string)

    from utils.morph_analyzer_caller import get_morph_analyzes, create_single_word_single_line_format

    # "\n".join([" ".join(x) for x in tokenized_sentences])
    dataset_file_string = ""
    morph_analyzer_output_for_all_sentences = ""
    for tokenized_sentence in tokenized_sentences:
        morph_analyzer_output_for_a_single_sentence = get_morph_analyzes(
            " ".join(tokenized_sentence))
        morph_analyzer_output_for_all_sentences += morph_analyzer_output_for_a_single_sentence + "\n"
        dataset_file_string += create_single_word_single_line_format(
            morph_analyzer_output_for_a_single_sentence,
            conll=True,
            for_prediction=True)

    dataset_file_string = dataset_file_string.decode('iso-8859-9')
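    # NOTE: the decode above assumes the morphological analyzer emits
    # ISO-8859-9 (Latin-5, Turkish) encoded output; calling str.decode this
    # way also implies this example is Python 2 code.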

    # We now have the input sentences in our native format
    train_sentences, _, _ = load_sentences(dataset_file_string.split("\n"),
                                           model.parameters["zeros"])

    char_to_id, id_to_char, id_to_morpho_tag, id_to_tag, id_to_word, morpho_tag_to_id, tag_to_id, word_to_id = \
        extract_mapping_dictionaries_from_model(model)

    _, _, _, sentences_data = prepare_dataset(
        train_sentences,
        word_to_id,
        char_to_id,
        tag_to_id,
        morpho_tag_to_id,
        model.parameters['lower'],
        model.parameters['mt_d'],
        model.parameters['mt_t'],
        model.parameters['mt_ci'],
        morpho_tag_separator=("+" if model.parameters['lang_name'] == "turkish"
                              else "|"))

    f_scores, morph_accuracies, labeled_sentences = \
        predict_tags_given_model_and_input([('tagger_output', sentences_data)],
                                           model,
                                           return_result=True)

    print(labeled_sentences)
    return labeled_sentences, dataset_file_string
Example #4
File: main.py  Project: memoiry/NLU
def train_ner():
    clean(FLAGS)
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # create a character dictionary
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: build index lists for each sentence
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # create log and model directories if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(25):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
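
train_ner() hands the prepared index lists to BatchManager, whose implementation is not part of this listing. A rough, illustrative stand-in (assumed behaviour, not the repositories' code) that sorts sentences by length, zero-pads every field within a batch, and yields optionally shuffled batches could look like this:

import math
import random


class SimpleBatchManager(object):
    """Illustrative stand-in for BatchManager (assumed behaviour)."""

    def __init__(self, data, batch_size):
        self.batch_data = self._sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def _sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / float(batch_size)))
        # sorting by sentence length keeps padding inside a batch small
        sorted_data = sorted(data, key=lambda item: len(item[0]))
        return [self._pad(sorted_data[i * batch_size:(i + 1) * batch_size])
                for i in range(num_batch)]

    @staticmethod
    def _pad(batch):
        # each item is assumed to be a list of per-token fields of equal
        # length, e.g. [chars, char_ids, ..., tag_ids]
        max_len = max(len(item[0]) for item in batch)
        padded = []
        for item in batch:
            padding = [0] * (max_len - len(item[0]))
            padded.append([field + padding for field in item])
        # transpose so each field holds the values for the whole batch
        return [list(field) for field in zip(*padded)]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch
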
def train():
    # load data sets
    train_sentences = load_sentences(args.train_file, args.lower, args.zeros)
    dev_sentences = load_sentences(args.dev_file, args.lower, args.zeros)
    test_sentences = load_sentences(args.test_file, args.lower, args.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # check and normalize the tag scheme of each dataset
    update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    update_tag_scheme(dev_sentences, args.tag_schema)

    # create maps if they do not exist
    # build char_to_id, id_to_char, tag_to_id, id_to_tag dictionaries from the data and save them as a pickle file
    if not os.path.isfile(args.map_file):
        # create a character dictionary
        if args.pre_emb:
            dico_chars_train = char_mapping(train_sentences, args.lower)[0]
            # augment (extend) the character dictionary with the pretrained
            # embedding vocabulary, then return the char/index mappings
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), args.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      args.lower)

        # Create dictionaries and mappings for tags and intents
        tag_to_id, id_to_tag, intent_to_id, id_to_intent = tag_mapping(
            train_sentences)

        with open(args.map_file, "wb") as f:
            pickle.dump([
                char_to_id, id_to_char, tag_to_id, id_to_tag, intent_to_id,
                id_to_intent
            ], f)
    else:
        with open(args.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag, intent_to_id, id_to_intent = pickle.load(
                f)

    # prepare data: extract sentence features and build index lists for each sentence
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 intent_to_id, args.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               intent_to_id, args.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                intent_to_id, args.lower)

    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # wrap the data in batch managers that yield single training batches
    train_manager = BatchManager(train_data, args.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # create log and model directories if they do not exist
    make_path(args)
    if os.path.isfile(args.config_file):
        config = load_config(args.config_file)
    else:
        config = config_model(char_to_id, tag_to_id, intent_to_id)
        save_config(config, args.config_file)
    make_path(args)

    logger = get_logger(args.log_file)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    # number of steps in one full pass over the training set
    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        # model creation is the core of this project
        model = create_model(sess, Model, args.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss_slot = []
        loss_intent = []

        # with tf.device("/gpu:0"):
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss_slot, batch_loss_intent = model.run_step(
                    sess, True, batch)
                loss_slot.append(batch_loss_slot)
                loss_intent.append(batch_loss_intent)

                if step % args.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "INTENT loss:{:>9.6f}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss_intent),
                                    np.mean(loss_slot)))
                    loss_slot = []
                    loss_intent = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, args.ckpt_path, logger)
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #6
def train():
    tf.io.gfile.mkdir(FLAGS.output)
    log_path = os.path.join(FLAGS.output, 'model.log')
    logger = get_logger(log_path)
    # load data sets
    train_sentences = load_sentences(os.path.join(FLAGS.data, "train.txt"),
                                     FLAGS.zeros)
    dev_sentences = load_sentences(os.path.join(FLAGS.data, "dev.txt"),
                                   FLAGS.zeros)
    test_sentences = load_sentences(os.path.join(FLAGS.data, "test.txt"),
                                    FLAGS.zeros)
    # create maps if they do not exist
    map_file = os.path.join(FLAGS.output, 'maps.pkl')
    if not os.path.isfile(map_file):
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(map_file, "wb") as f:
            pickle.dump([tag_to_id, id_to_tag], f)
    else:
        with open(map_file, "rb") as f:
            tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: build index lists for each sentence
    train_data = prepare_dataset(train_sentences, FLAGS.max_seq_len, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, FLAGS.max_seq_len, tag_to_id)
    test_data = prepare_dataset(test_sentences, FLAGS.max_seq_len, tag_to_id)
    logger.info("%i / %i / %i sentences in train / dev / test." %
                (len(train_data), len(dev_data), len(test_data)))
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, FLAGS.batch_size)
    test_manager = BatchManager(test_data, FLAGS.batch_size)
    # load the model config if it exists, otherwise create and save it
    config_file = os.path.join(FLAGS.output, 'config.json')
    if os.path.isfile(config_file):
        config = load_config(config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, config_file)
    print_config(config, logger)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model,
                             os.path.join(FLAGS.output, 'checkpoint'), config,
                             logger)

        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)

                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess,
                           model,
                           os.path.join(FLAGS.output, 'checkpoint'),
                           logger,
                           global_steps=step)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
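
All of the training examples above call evaluate(), which ultimately has to turn predicted IOBES tag sequences back into entity spans for scoring. A minimal, hypothetical helper for that step (independent of the repositories' conlleval-style evaluation code) might be:

def iobes_to_spans(tags):
    """Turn one sentence's IOBES tags into (type, start, end_exclusive) spans."""
    spans = []
    start, etype = None, None
    for i, tag in enumerate(tags):
        if tag.startswith('S-'):
            spans.append((tag[2:], i, i + 1))
            start, etype = None, None
        elif tag.startswith('B-'):
            start, etype = i, tag[2:]
        elif tag.startswith('I-') and etype == tag[2:]:
            continue
        elif tag.startswith('E-') and etype == tag[2:]:
            spans.append((etype, start, i + 1))
            start, etype = None, None
        else:
            # 'O' or an inconsistent tag: drop any half-open span
            start, etype = None, None
    return spans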