Example #1
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)   

    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))                 

    # pad shorter sequences with 0
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
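
The "# pad shorter sequences with 0" comment above refers to what BatchManager does with the prepared data: each batch is padded to the length of its longest sentence using index 0. The real BatchManager in these examples also carries the raw strings and segmentation features; the sketch below is only a minimal illustration that assumes each data item is a (char_ids, tag_ids) pair, and SimpleBatchManager is an illustrative name.

import random

class SimpleBatchManager:
    """Illustrative batcher: fixed-size batches, zero-padded to the batch maximum."""

    def __init__(self, data, batch_size):
        # data is assumed to be a list of (char_ids, tag_ids) pairs
        self.batch_data = [data[i:i + batch_size]
                           for i in range(0, len(data), batch_size)]
        self.num_batch = len(self.batch_data)

    @staticmethod
    def pad(batch):
        max_len = max(len(chars) for chars, _ in batch)
        char_batch, tag_batch = [], []
        for chars, tags in batch:
            padding = [0] * (max_len - len(chars))  # index 0 is reserved for padding
            char_batch.append(chars + padding)
            tag_batch.append(tags + padding)
        return char_batch, tag_batch

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield self.pad(batch)
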
Example #2
def train():
    # load data sets
    train_sentences=load_sentences(FLAGS.train_file,FLAGS.zeros)
    dev_sentences=load_sentences(FLAGS.dev_file,FLAGS.zeros)
    test_sentences=load_sentences(FLAGS.test_file,FLAGS.zeros)

    # apply the chosen tagging scheme (IOB/IOBES)
    train_sentences=update_tag_scheme(train_sentences,FLAGS.tag_schema)
    dev_sentences=update_tag_scheme(dev_sentences,FLAGS.tag_schema)
    test_sentences=update_tag_scheme(test_sentences,FLAGS.tag_schema)

    #create maps if not exist
    if not os.path.exists(FLAGS.map_file):
        if FLAGS.pre_emb:
            char_to_id,_=char_mapping(train_sentences)
            char_to_id,id_to_char=augment_with_pretrained(char_to_id,'wiki_100.utf8')
        else:
            char_to_id, id_to_char=char_mapping(train_sentences)
        tag_to_id, id_to_tag=tag_mapping(train_sentences)
        with open(FLAGS.map_file,'wb') as f:
            cPickle.dump([char_to_id,id_to_char,tag_to_id,id_to_tag],f,cPickle.HIGHEST_PROTOCOL)
    else:
        with open(FLAGS.map_file,'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag=cPickle.load(f)

    # prepare data: get collections of lists containing indices
    train_data=prepare_dataset(train_sentences,char_to_id,tag_to_id,True)
    dev_data=prepare_dataset(dev_sentences,char_to_id,tag_to_id,True)
    test_data=prepare_dataset(test_sentences,char_to_id,tag_to_id,True)
    print "%i %i %i sentences in train / dev / test." % (len(train_data),len(dev_data),len(test_data))

    if not FLAGS.pre_emb:
        pre_emb=None
    else:
        pre_emb=load_word2vec(FLAGS.pre_emb_file,char_to_id,FLAGS.char_dim)
        print "init embedding shape: (%d,%d)" %(pre_emb.shape[0],pre_emb.shape[1])

    train_manager=BatchManager(train_data,FLAGS.batch_size,True)
    dev_manager=BatchManager(dev_data,FLAGS.batch_size,False)
    test_manager=BatchManager(test_data,FLAGS.batch_size,False)

    config=BasicModelConfig(FLAGS,len(char_to_id),len(tag_to_id),4)
    tfConfig = tf.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = FLAGS.memory_usage
    with tf.Session(config=tfConfig) as sess:
        print "Train started!"
        model=BasicModel(config,pre_emb)
        saver=tf.train.Saver()

        # tensorboard
        if not os.path.exists(FLAGS.summaries_dir):
            os.mkdir(FLAGS.summaries_dir)
        merged=tf.summary.merge_all()
        train_writer=tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir,FLAGS.model_name,"train"),sess.graph)
        test_writer=tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir,FLAGS.model_name,"test"),sess.graph)

        # load previous trained model or create a new model
        if not os.path.exists(FLAGS.checkpoints):
            os.mkdir(FLAGS.checkpoints)
        model_name=os.path.join(FLAGS.checkpoints,FLAGS.model_name)
        ckpt=tf.train.get_checkpoint_state(FLAGS.checkpoints)
        if ckpt and ckpt.model_checkpoint_path:
            print "restore from previous traied model: %s" % FLAGS.model_name
            saver.restore(sess,ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        def evaluate(sess,model,manager):
            strings=[]
            predicts=[]
            goldens=[]
            bar = ProgressBar(max_value=manager.num_batch)
            for batch in bar(manager.iter_batch()):
                batch_string,batch_predict,batch_golden=model.evaluate_step(sess,batch)
                strings.extend(batch_string)
                predicts.extend(batch_predict)
                goldens.extend(batch_golden)
            return strings,predicts,goldens

        best_eval_f1=0
        noimpro_num=0
        for i in range(FLAGS.max_epoch):
            #train
            train_loss=[]
            bar = ProgressBar(max_value=train_manager.num_batch)
            for step,batch in bar(enumerate(train_manager.iter_batch())):
                batch.append(merged)
                summary,global_step,batch_loss=model.train_step(sess,batch,FLAGS.dropout_keep)
                #add summary to tensorboard
                train_writer.add_summary(summary,global_step)
                train_loss.append(batch_loss)
            print "Epoch %d Train loss is %.4f" % (i+1,np.mean(train_loss))

            #dev
            strings,predicts,goldens=evaluate(sess,model,dev_manager)
            eval_f1=report_results(strings,predicts,goldens,id_to_char,id_to_tag,'outputs/dev')
            if eval_f1>best_eval_f1:
                best_eval_f1=eval_f1
                noimpro_num=0
                saver.save(sess,model_name)
            else:
                noimpro_num+=1
            print "Epoch %d Best eval f1:%.6f" % (i+1,best_eval_f1)

            #test
            strings,predicts,goldens=evaluate(sess,model,test_manager)
            test_f1=report_results(strings,predicts,goldens,id_to_char,id_to_tag,'outputs/test',True)
            #early_stop
            if noimpro_num>=3:
                print "Early stop! Final F1 scores on test data is :%.6f" % test_f1
                break
            print
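
update_tag_scheme in these examples rewrites each sentence's tags into the requested scheme. For IOBES the usual rule is: a B- or I- tag that is not followed by an I- tag of the same entity type becomes S- or E- respectively. Below is a minimal sketch of that conversion for a single sentence; iob_to_iobes is an illustrative name, not necessarily the helper these repositories actually use.

def iob_to_iobes(tags):
    """Convert one sentence's IOB tags (e.g. B-PER, I-PER, O) to IOBES."""
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
            continue
        prefix, entity = tag.split('-', 1)
        followed = i + 1 < len(tags) and tags[i + 1] == 'I-' + entity
        if prefix == 'B':
            new_tags.append(tag if followed else 'S-' + entity)
        elif prefix == 'I':
            new_tags.append(tag if followed else 'E-' + entity)
        else:
            raise ValueError('Invalid IOB tag: %s' % tag)
    return new_tags

# Example: ['B-PER', 'I-PER', 'O', 'B-LOC'] -> ['B-PER', 'E-PER', 'O', 'S-LOC']
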
Example #3
def do_train(config):
    train, dev, test = load_data(config)  # load the data sets
    word_to_id, id_to_word, tag_to_id, id_to_tag = create_maps(
        train, config)  # create or load the maps

    # record and save the configuration
    config["num_chars"] = len(word_to_id)  # total number of characters
    config["num_tags"] = len(tag_to_id)  # total number of tags
    with open(config["config_file"], "w") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)

    # prepare the data
    train_data = prepare_dataset(train, word_to_id, tag_to_id, config["lower"])
    dev_data = prepare_dataset(dev, word_to_id, tag_to_id, config["lower"])
    test_data = prepare_dataset(test, word_to_id, tag_to_id, config["lower"])

    print("train/dev/test 句子数:{} / {} / {}".format(len(train_data),
                                                   len(dev_data),
                                                   len(test_data)))

    # split into batches
    train_manager = BatchManager(train_data, config["batch_size"])
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    steps_per_epoch = train_manager.len_data  # steps per epoch

    # create the required paths
    make_path(config)

    # logger
    logger = get_logger(config["log_file"])

    # GPU settings
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        # create the model (an existing parameter configuration can be supplied)
        model = Model(config)

        ckpt = tf.train.get_checkpoint_state(
            config["ckpt_path"])  # get the checkpoint state from the model path
        if ckpt and tf.train.checkpoint_exists(
                ckpt.model_checkpoint_path):  # an existing model was found
            logger.info("Restoring existing model...")
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("新建模型...")
            sess.run(tf.global_variables_initializer())  # 不使用预训练的embeddings

            # if pre-trained embeddings are used
            if config["pre_emb"]:
                emb_weights = sess.run(model.char_lookup.read_value())
                emb_weights = load_word2vec(config["emb_file"], id_to_word,
                                            config["char_dim"], emb_weights)
                sess.run(model.char_lookup.assign(emb_weights))
                logger.info("Load pre-trained embedding.")

        logger.info("开始训练...")
        loss = []
        for i in range(config["max_epoch"]):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)

                if step % config["steps_check"] == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch,
                            np.mean(loss)))

                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger,
                            config)
            if best:
                save_model(sess, model, config["ckpt_path"], logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger,
                     config)
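
In the branch above, load_word2vec overwrites the randomly initialized embedding rows with pre-trained vectors wherever a character is covered by the embedding file. Below is a minimal sketch, assuming emb_file is a plain-text file with one token per line followed by word_dim floats; load_word2vec_sketch is an illustrative name, and the real loader may additionally try lowercased or digit-normalized variants of each character.

import numpy as np

def load_word2vec_sketch(emb_file, id_to_word, word_dim, old_weights):
    """Overwrite rows of old_weights with pre-trained vectors where available."""
    new_weights = old_weights.copy()
    pretrained = {}
    with open(emb_file, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) == word_dim + 1:  # token followed by word_dim floats
                pretrained[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    found = 0
    for idx, word in id_to_word.items():
        if word in pretrained:
            new_weights[idx] = pretrained[word]
            found += 1
    print('%d / %d tokens covered by the pre-trained embeddings' % (found, len(id_to_word)))
    return new_weights
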
Example #4
def train():
    # load the data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # choose the tagging scheme (IOB / IOBES); IOBES is used by default
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            # {'S-LOC': 10, 'E-LOC': 3, 'B-ORG': 4, 'S-PER': 11, 'S-ORG': 12, 'O': 0,
            # 'E-ORG': 5, 'I-LOC': 6, 'I-PER': 7, 'I-ORG': 1, 'B-PER': 8, 'B-LOC': 2, 'E-PER': 9}
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # convert to indexed (numeric) data
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # pad shorter sequences with 0
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # GPU settings
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    # average the loss once every 100 steps
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
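
char_mapping and tag_mapping above both reduce to the same pattern: count item frequencies over the training sentences and assign smaller ids to more frequent items. The sketch below shows only that pattern; create_mapping_sketch is an illustrative name, and the real helpers additionally handle lowercasing and reserve special entries such as padding and unknown characters, which are omitted here.

def create_mapping_sketch(items):
    """Map items to ids by descending frequency; ties are broken alphabetically."""
    freq = {}
    for item in items:
        freq[item] = freq.get(item, 0) + 1
    sorted_items = sorted(freq.items(), key=lambda x: (-x[1], x[0]))
    item_to_id = {item: i for i, (item, _) in enumerate(sorted_items)}
    id_to_item = {i: item for item, i in item_to_id.items()}
    return item_to_id, id_to_item

# Characters come from the token text, tags from the token label, e.g.:
#   chars = [w[0] for s in train_sentences for w in s]
#   tags  = [w[-1] for s in train_sentences for w in s]
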
Example #5
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    tf.logging.set_verbosity(
        tf.logging.INFO)  # choose what level of logging you want
    if FLAGS.mode == 'rl_train':
        tf.logging.info('Starting model in %s mode...',
                        FLAGS.mode + '_' + FLAGS.reward_type)
    else:
        tf.logging.info('Starting model in %s mode...', FLAGS.mode)
    # If in decode mode, set batch_size = beam_size
    # Reason: in decode mode, we decode one example at a time.
    # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses.
    if FLAGS.mode == 'beam_search_decode':
        FLAGS.batch_size = FLAGS.beam_size

    train_data, valid_data, test_data = prepare_dataset(FLAGS.data_path)
    print('TrainData Size:', len(train_data))
    print('ValidData Size:', len(valid_data))
    print('TestData Size:', len(test_data))

    print("Building vocabulary ..... ")
    word2id, id2word, max_plot_len, max_ending_len, min_ending_len = creat_vocab(
        train_data, FLAGS.word_vocab_size)  # max_plot_len is needed later for beam-search decoding
    print("Finished building vocabulary!")
    word_vocab_size = len(word2id.keys())

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
    hparam_list = [
        'mode', 'loss_rate_of_sem', 'loss_rate_of_mle', 'word_vocab_size',
        'use_mixed_loss', 'lr', 'train_keep_prob', 'rl_loss_scale_factor',
        'rand_unif_init_mag', 'max_grad_norm', 'hidden_dim', 'emb_dim',
        'batch_size', 'coverage', 'cov_loss_wt', 'pointer_gen'
    ]
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps_dict['max_dec_steps'] = max_ending_len
    hps_dict['min_ending_len'] = min_ending_len
    if FLAGS.word_vocab_size is None:
        hps_dict['word_vocab_size'] = word_vocab_size
    hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    # create minibatches of data
    train_batches = get_batches(len(train_data), FLAGS.batch_size)
    valid_batches = get_batches(len(valid_data), FLAGS.batch_size)

    tf.set_random_seed(111)  # a seed value for randomness

    if hps.mode == 'seq2seq_train':
        train_dir = os.path.join(FLAGS.exp_name, "train_seq2seq")
        if not os.path.exists(train_dir): os.makedirs(train_dir)
        with tf.Graph().as_default():
            initializer = tf.random_uniform_initializer(
                -hps.rand_unif_init_mag, hps.rand_unif_init_mag)
            with tf.variable_scope("Model",
                                   reuse=None,
                                   initializer=initializer):
                m_train = SCST_RLModel(is_training=True, hps=hps)
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                m_valid = SCST_RLModel(is_training=False, hps=hps)
            if FLAGS.convert_to_coverage_model:
                assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True"
                convert_to_coverage_model()
            sv = tf.train.Supervisor(logdir=train_dir,
                                     save_model_secs=FLAGS.save_model_secs)

            sess_context_manager = sv.managed_session(config=util.get_config())
            tf.logging.info("Created session.")
            try:
                run_seq2seq_training(
                    m_train, m_valid, train_data, train_batches, valid_data,
                    valid_batches, word2id, max_ending_len, sv,
                    sess_context_manager
                )  # this is an infinite loop until interrupted
            except KeyboardInterrupt:
                tf.logging.info(
                    "Caught keyboard interrupt on worker. Stopping supervisor..."
                )
                sv.stop()

    elif hps.mode == 'rl_train':
        train_dir = os.path.join(
            FLAGS.exp_name, "train_rl" + '_' + FLAGS.reward_type + 'mu_' +
            str(FLAGS.rl_loss_scale_factor))
        if not os.path.exists(train_dir): os.makedirs(train_dir)
        with tf.Graph().as_default():
            initializer = tf.random_uniform_initializer(
                -hps.rand_unif_init_mag, hps.rand_unif_init_mag)
            with tf.variable_scope("Model",
                                   reuse=None,
                                   initializer=initializer):
                m_train = SCST_RLModel(is_training=True, hps=hps)
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                m_valid = SCST_RLModel(is_training=False, hps=hps)

            # define load_pretrain function for restoring the best seq2seq model from eval_dir
            ckpt_dir = 'eval_seq2seq'
            latest_filename = "checkpoint_best" if ckpt_dir == "eval_seq2seq" else None
            ckpt_dir = os.path.join(FLAGS.exp_name, ckpt_dir)
            ckpt_state = tf.train.get_checkpoint_state(
                ckpt_dir, latest_filename=latest_filename)
            print("loading pre_trained seq2seq model from %s",
                  ckpt_state.model_checkpoint_path)
            saver = tf.train.Saver()

            def load_pretrain(sess):
                return saver.restore(sess, ckpt_state.model_checkpoint_path)

            sv = tf.train.Supervisor(logdir=train_dir,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_model_secs,
                                     init_fn=load_pretrain)
            sess_context_manager = sv.managed_session(config=util.get_config())
            tf.logging.info("Created session.")
            try:
                run_rl_training(m_train, m_valid, train_data, train_batches,
                                valid_data, valid_batches, word2id,
                                max_ending_len, sv, sess_context_manager
                                )  # this is an infinite loop until interrupted
            except KeyboardInterrupt:
                tf.logging.info(
                    "Caught keyboard interrupt on worker. Stopping supervisor..."
                )
                sv.stop()

    elif hps.mode == 'beam_search_decode':
        # This will be the hyperparameters for the decoder model
        # The model is configured with max_dec_steps=1 because we only ever run
        # one step of the decoder at a time (to do beam search). Note that the
        # batcher is initialized with max_dec_steps equal to e.g. 100 because
        # the batches need to contain the full summaries.
        decode_model_hps = hps._replace(max_dec_steps=1)
        test_examples_list = prepare_data_for_beam_seach_decode(
            test_data, FLAGS.batch_size, word2id, max_plot_len, max_ending_len,
            FLAGS.pointer_gen)
        with tf.Graph().as_default():
            initializer = tf.random_uniform_initializer(
                -hps.rand_unif_init_mag, hps.rand_unif_init_mag)
            with tf.variable_scope("Model",
                                   reuse=None,
                                   initializer=initializer):
                model_test = SCST_RLModel(is_training=False,
                                          hps=decode_model_hps)
                run_beam_search_decode(model_test,
                                       test_examples_list,
                                       id2word,
                                       data='test_data',
                                       ckpt_dir=FLAGS.decode_ckpt_dir)
    else:
        raise ValueError(
            "The 'mode' flag must be one of seq2seq_train/rl_train/beam_search_decode"
        )
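
get_batches in this example receives only the dataset length and the batch size, so it presumably yields index slices rather than the data itself. A minimal sketch of one way that could look; get_batches_sketch is an illustrative name, not the project's function.

import numpy as np

def get_batches_sketch(num_examples, batch_size, shuffle=True):
    """Return a list of index arrays, one per minibatch (the last one may be smaller)."""
    indices = np.arange(num_examples)
    if shuffle:
        np.random.shuffle(indices)
    return [indices[start:start + batch_size]
            for start in range(0, num_examples, batch_size)]

# get_batches_sketch(10, 4) -> three index batches of sizes 4, 4 and 2
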