Пример #1
0
def do_save_inference_model(args):
    """Build the prediction graph, restore weights from a checkpoint, and
    export the result as a Paddle inference model.

    Args:
        args: parsed command-line namespace; must provide ``use_cuda``,
            ``num_labels``, ``init_checkpoint`` and ``inference_model_dir``.

    Raises:
        ValueError: if ``args.init_checkpoint`` is not set.
    """
    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()

    test_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()

    # Build the inference-only model inside the test program.
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            infer_loader, probs, feed_target_names = create_model(
                args, num_labels=args.num_labels, is_prediction=True)

    test_prog = test_prog.clone(for_test=True)
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # `assert` is stripped under `python -O`; validate explicitly instead.
    if not args.init_checkpoint:
        raise ValueError(
            "args.init_checkpoint is required to export an inference model")
    utils.init_checkpoint(exe, args.init_checkpoint, test_prog)

    fluid.io.save_inference_model(
        args.inference_model_dir,
        feeded_var_names=feed_target_names,
        target_vars=[probs],
        executor=exe,
        main_program=test_prog,
        model_filename="model.pdmodel",
        params_filename="params.pdparams")

    print("save inference model at %s" % (args.inference_model_dir))
Пример #2
0
def test_inference_model(args):
    """Restore an exported inference model and run prediction over the
    "infer" split of the dataset.

    Args:
        args: parsed command-line namespace; must provide ``use_cuda``,
            ``num_labels``, ``inference_model_dir``, ``data_dir``,
            ``vocab_path``, ``random_seed``, ``max_seq_len`` and
            ``batch_size``.

    Raises:
        ValueError: if ``args.inference_model_dir`` is not set.
    """
    if args.use_cuda:
        dev_count = fluid.core.get_cuda_device_count()
        place = fluid.CUDAPlace(0)
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))
        place = fluid.CPUPlace()

    test_prog = fluid.Program()
    startup_prog = fluid.Program()

    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            infer_pyreader, probs, feed_target_names = create_model(
                args,
                pyreader_name='infer_reader',
                num_labels=args.num_labels,
                is_prediction=True)

    test_prog = test_prog.clone(for_test=True)
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    processor = reader.SentaProcessor(data_dir=args.data_dir,
        vocab_path=args.vocab_path,
        random_seed=args.random_seed,
        max_seq_len=args.max_seq_len)

    # `assert` is stripped under `python -O`; validate explicitly instead.
    if not args.inference_model_dir:
        raise ValueError("args.inference_model_dir is required")
    # Loading the model also restores the trained parameters into the
    # executor's scope; the returned program itself is not run here.
    infer_program, feed_names, fetch_targets = fluid.io.load_inference_model(
        dirname=args.inference_model_dir,
        executor=exe,
        model_filename="model.pdmodel",
        params_filename="params.pdparams")

    # Integer division: the original `/` yields a float batch size under
    # Python 3, which the data generator does not expect.
    infer_data_generator = processor.data_generator(
        batch_size=args.batch_size // dev_count,
        phase="infer",
        epoch=1,
        shuffle=False)

    infer_pyreader.set_sample_list_generator(infer_data_generator)
    inference(exe, test_prog, infer_pyreader,
        [probs.name], "infer")
Пример #3
0
        def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
            """Estimator `model_fn` that builds the BERT classifier.

            Returns an ``EstimatorSpec`` whose predictions are the class
            probabilities produced by ``create_model``.
            """
            from tensorflow.python.estimator.model_fn import EstimatorSpec

            tf.logging.info("*** Features ***")
            for name in sorted(features.keys()):
                tf.logging.info("  name = %s, shape = %s" %
                                (name, features[name].shape))

            input_ids = features["input_ids"]
            input_mask = features["input_mask"]
            segment_ids = features["segment_ids"]
            label_ids = features["label_ids"]

            is_training = (mode == tf.estimator.ModeKeys.TRAIN)

            (total_loss, per_example_loss, logits,
             probabilities) = create_model(bert_config, is_training, input_ids,
                                           input_mask, segment_ids, label_ids,
                                           num_labels, use_one_hot_embeddings)

            tvars = tf.trainable_variables()
            initialized_variable_names = {}

            # Map checkpoint variables onto the freshly built graph.
            if init_checkpoint:
                (assignment_map, initialized_variable_names) \
                    = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

            # Debug leftovers removed: banner prints and an ad-hoc tf.Session
            # that ran global_variables_initializer and dumped variable
            # values inside model_fn. The Estimator manages its own sessions;
            # we only log which variables were restored from the checkpoint.
            tf.logging.info("**** Trainable Variables ****")
            for var in tvars:
                init_string = ""
                if var.name in initialized_variable_names:
                    init_string = ", *INIT_FROM_CKPT*"
                tf.logging.info("  name = %s, shape = %s%s", var.name,
                                var.shape, init_string)

            return EstimatorSpec(mode=mode, predictions=probabilities)
Пример #4
0
def test_inference_model(args, texts):
    """Restore an exported inference model and print a prediction line
    (argmax label plus three class probabilities) for each query.

    Args:
        args: parsed command-line namespace; must provide ``use_cuda``,
            ``num_labels``, ``vocab_path`` and ``inference_model_dir``.
        texts: iterable of raw query strings to classify.

    Raises:
        ValueError: if ``args.inference_model_dir`` is not set.
    """
    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()

    test_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()

    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            infer_pyreader, probs, feed_target_names = create_model(
                args,
                pyreader_name='infer_reader',
                num_labels=args.num_labels,
                is_prediction=True)

    test_prog = test_prog.clone(for_test=True)
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # `assert` is stripped under `python -O`; validate explicitly instead.
    if not args.inference_model_dir:
        raise ValueError("args.inference_model_dir is required")
    infer_program, feed_names, fetch_targets = fluid.io.load_inference_model(
        dirname=args.inference_model_dir,
        executor=exe,
        model_filename="model.pdmodel",
        params_filename="params.pdparams")

    # Convert each query to word ids; the LoD level records query lengths.
    data = [utils.query2ids(args.vocab_path, query) for query in texts]
    data_shape = [[len(w) for w in data]]
    pred = exe.run(
        infer_program,
        feed={feed_names[0]: fluid.create_lod_tensor(data, data_shape, place)},
        fetch_list=fetch_targets,
        return_numpy=True)
    # Distinct loop name: the original reused `probs`, shadowing the graph
    # output tensor of the same name above.
    for row in pred[0]:
        print("%d\t%f\t%f\t%f" %
              (np.argmax(row), row[0], row[1], row[2]))
Пример #5
0
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_training = mode == tf.estimator.ModeKeys.TRAIN

        model_outputs = create_model(bert_config, is_training, input_ids,
                                     input_mask, segment_ids, label_ids,
                                     num_labels, use_one_hot_embeddings)
        total_loss, per_example_loss, logits, probabilities = model_outputs

        # Always emit a PREDICT spec whose predictions are the class
        # probabilities, regardless of the incoming mode.
        return tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.PREDICT, predictions=probabilities)
def main(data_name,
         out_file,
         model_path,
         step=1,
         config_file="./bert_config_1.json",
         vocab_file="./vocab/vocab_1kmer.txt"):
    """Run BERT prediction over a FASTA file and write per-sequence scores.

    Args:
        data_name: path to the input FASTA file.
        out_file: output path; each sequence line is annotated with the
            predicted positive-class probability.
        model_path: checkpoint used to initialize the model weights.
        step: stride passed to ``fasta2record`` when building the TFRecord.
        config_file: BERT config JSON path.
        vocab_file: vocabulary file path.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    batch_size = 32
    use_tpu = False
    seq_length = 128
    # vocab_file = "./vocab/vocab_2kmer.txt"
    init_checkpoint = model_path
    bert_config = modeling.BertConfig.from_json_file(config_file)
    learning_rate = 2e-5
    num_train_steps = 100
    num_warmup_steps = 10

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.75
    # Convert the FASTA input to a TFRecord file and remember the sample
    # count so we know how many batches to run below.
    samples_num = fasta2record(data_name,
                               "predict.tf_record",
                               vocab_file,
                               step=step)
    batch_num = math.ceil(samples_num / batch_size)
    input_file = "predict.tf_record"
    # NOTE(review): `tokenizer` is created but never used in this function.
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    input_ids = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    input_mask = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    segment_ids = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    label_ids = tf.placeholder(dtype=tf.int32, shape=(None, ))
    is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
    num_labels = 2
    use_one_hot_embeddings = False
    # NOTE(review): is_training=True during prediction presumably keeps
    # dropout active inside create_model — confirm this is intended.
    is_training = True
    (total_loss, per_example_loss, logits,
     probabilities) = create_model(bert_config, is_training, input_ids,
                                   input_mask, segment_ids, label_ids,
                                   num_labels, use_one_hot_embeddings)
    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None

    # Map checkpoint variables onto the freshly built graph.
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(
             tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    # NOTE(review): train_op is built but never run here (prediction only);
    # create_optimizer may still add optimizer variables to the graph, so
    # removing it could change graph construction — left untouched.
    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps, num_warmup_steps,
                                             use_tpu)
    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
        "is_real_example": tf.FixedLenFeature([], tf.int64),
    }
    drop_remainder = False

    def _decode_record(record, name_to_features):
        # Parse one serialized example and downcast int64 features to int32.
        example = tf.parse_single_example(record, name_to_features)
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        return example

    def input_fn(params):
        # Batched dataset over the prediction TFRecord file (no shuffling).
        batch_size = params["batch_size"]
        d = tf.data.TFRecordDataset(input_file)
        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
            ))
        return d

    predict_data = input_fn({"batch_size": batch_size})
    iterator = predict_data.make_one_shot_iterator().get_next()
    all_prob = []
    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        for _ in range(batch_num):
            examples = sess.run(iterator)
            # Collect the positive-class probability (column 1) per sample.
            prob = \
                sess.run(probabilities,
                         feed_dict={input_ids: examples["input_ids"],
                                    input_mask: examples["input_mask"],
                                    segment_ids: examples["segment_ids"],
                                    label_ids: examples["label_ids"]})
            all_prob.extend(prob[:, 1].tolist())
    # print(all_prob)
    # print(len(all_prob))
    # Re-read the FASTA file and append each sequence's score to its line;
    # header lines (starting with ">") are copied through unchanged.
    with open(data_name) as f:
        lines = f.readlines()
    with open(out_file, "w") as f:
        index = 0
        for line in lines:
            if line[0] == ">":
                f.write(line)
            else:
                f.write(line.strip() + " " + str(all_prob[index]) + "\n")
                index += 1
Пример #7
0
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator.

        Builds the model, optionally initializes weights from
        ``init_checkpoint``, and returns a ``TPUEstimatorSpec`` for TRAIN
        or PREDICT mode.

        Raises:
            ValueError: if ``mode`` is neither TRAIN nor PREDICT.
        """
        global initialized_variable_names
        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        print(features)

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        (total_loss, per_example_loss,
         logits) = create_model(bert_config, is_training, input_ids,
                                input_mask, segment_ids, label_ids, num_labels,
                                use_one_hot_embeddings)

        tvars = tf.trainable_variables()

        scaffold_fn = None
        if init_checkpoint:
            # NOTE(review): `get_assigment_map_from_checkpoint` (sic) is
            # presumably the spelling this project's modeling module
            # exposes — confirm before renaming.
            (assignment_map, initialized_variable_names
             ) = modeling.get_assigment_map_from_checkpoint(
                 tvars, init_checkpoint)
            if use_tpu:
                # On TPU, checkpoint restoration must happen inside the
                # scaffold so it runs on the TPU host.
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                     num_train_steps,
                                                     num_warmup_steps, use_tpu)

            output_spec = tpu.TPUEstimatorSpec(mode=mode,
                                               loss=total_loss,
                                               train_op=train_op,
                                               scaffold_fn=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.PREDICT:
            output_spec = tpu.TPUEstimatorSpec(mode=mode,
                                               predictions=logits,
                                               loss=total_loss,
                                               scaffold_fn=scaffold_fn)
        else:
            # Fixed message: the original claimed "TRAIN and EVAL", but the
            # branches above handle TRAIN and PREDICT.
            raise ValueError("Only TRAIN and PREDICT modes are supported: %s" %
                             (mode))

        return output_spec
def main():
    """Train the BERT classifier on the data set named by FLAGS.data_name.

    Optionally evaluates on the dev split after every epoch
    (FLAGS.do_eval) and saves the final model (FLAGS.do_save_model).
    """
    # The following are the input parameters.
    # When changing the dictionary, please modify the value of vocab_size in the file bert_config.json
    do_eval = FLAGS.do_eval
    do_save_model = FLAGS.do_save_model
    data_name = FLAGS.data_name
    # Record the number of samples in each data set
    train_dict = {
        "AMPScan": 2132,
        "BiLSTM": 4174,
        "iAMP": 1758,
        "MAMPs": 5234,
        "fold": 2928,
        "all_data": 8978,
    }
    test_dict = {
        "AMPScan": 1424,
        "BiLSTM": 1156,
        "iAMP": 1839,
        "MAMPs": 1666,
        "fold": 2119,
        "all_data": 8978,
    }
    tf.logging.set_verbosity(tf.logging.INFO)
    train_example_num = train_dict[data_name]
    test_example_num = test_dict[data_name]
    batch_size = FLAGS.batch_size  # If the GPU memory is not enough, you can consider reducing it
    train_batch_num = math.ceil(train_example_num / batch_size)
    test_batch_num = math.ceil(test_example_num / batch_size)
    num_train_epochs = FLAGS.num_train_epochs
    warmup_proportion = FLAGS.warmup_proportion
    learning_rate = FLAGS.learning_rate
    use_tpu = FLAGS.using_tpu
    seq_length = FLAGS.seq_length
    data_root = FLAGS.data_root
    vocab_file = FLAGS.vocab_file
    init_checkpoint = FLAGS.init_checkpoint
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.75  # Prevent directly occupying all GPU
    # Enter the training set, this file is generated using ljy_tsv2record
    input_file = data_root + "/train.tf_record"
    # NOTE(review): `tokenizer` is created but never used in this function.
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    num_train_steps = int(train_example_num / batch_size * num_train_epochs)
    num_warmup_steps = int(num_train_steps * warmup_proportion)
    input_ids = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    input_mask = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    segment_ids = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    label_ids = tf.placeholder(
        dtype=tf.int32,
        shape=(None, ))  # Leave four placeholders for entering data and labels
    is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
    is_training = True
    num_labels = 2
    use_one_hot_embeddings = False
    (total_loss, per_example_loss, logits,
     probabilities) = create_model(bert_config, is_training, input_ids,
                                   input_mask, segment_ids, label_ids,
                                   num_labels, use_one_hot_embeddings)
    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None

    # Map checkpoint variables onto the freshly built graph before training.
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(
             tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps, num_warmup_steps,
                                             use_tpu)
    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
        "is_real_example": tf.FixedLenFeature([], tf.int64),
    }
    drop_remainder = False

    def _decode_record(record, name_to_features):
        # Parse one serialized example and downcast int64 features to int32.
        example = tf.parse_single_example(record, name_to_features)
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        return example

    def input_fn(params):
        # Batched dataset over `input_file`; note `input_file` is read at
        # call time, so reassigning it before calling input_fn switches the
        # source file (used below for the dev split).
        batch_size = params["batch_size"]
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)
        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
            ))
        return d

    train_data = input_fn({"batch_size": batch_size})
    # Generate the training set data iterator, the iterator will output data in the loop
    iterator = train_data.make_one_shot_iterator().get_next()
    if do_eval:
        input_file = data_root + "/dev.tf_record"
        dev_data = input_fn({"batch_size": batch_size})
        dev_iterator = dev_data.make_one_shot_iterator().get_next()
    val_accs = []
    sps = []
    sns = []
    if do_save_model:
        saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        # One outer iteration per epoch; loss reported is from the last batch.
        for step in range(num_train_epochs):
            start_time = time.time()
            for _ in range(train_batch_num):
                examples = sess.run(
                    iterator)  # Run iterator to generate samples
                # print(examples)
                _, loss = \
                    sess.run([train_op, total_loss],
                             feed_dict={input_ids: examples["input_ids"],
                                        input_mask: examples["input_mask"],
                                        segment_ids: examples["segment_ids"],
                                        label_ids: examples["label_ids"]})
            print("step:", step, " loss:", round(loss, 4), end=" ")
            all_prob = []
            all_labels = []
            all_pre_labels = []
            if not do_eval:
                # No evaluation requested: just report the ETA and move on.
                end_time = time.time()
                eta_time = (end_time - start_time) * \
                    (num_train_epochs - step - 1)
                print(" eta time:", eta_time, "s")
                continue
            # Evaluation pass over the dev split.
            for _ in range(test_batch_num):
                examples = sess.run(dev_iterator)
                loss, prob = \
                    sess.run([total_loss, probabilities],
                             feed_dict={input_ids: examples["input_ids"],
                                        input_mask: examples["input_mask"],
                                        segment_ids: examples["segment_ids"],
                                        label_ids: examples["label_ids"]})
                all_prob.extend(prob[:, 1].tolist())
                all_labels.extend(examples["label_ids"].tolist())
                pre_labels = np.argmax(prob, axis=-1).tolist()
                all_pre_labels.extend(pre_labels)
            # Epoch metrics: accuracy, auROC, MCC, sensitivity, specificity.
            acc = accuracy_score(all_labels, all_pre_labels)
            val_accs.append(acc)
            auc = roc_auc_score(all_labels, all_prob)
            mcc = matthews_corrcoef(all_labels, all_pre_labels)
            c_mat = confusion_matrix(all_labels, all_pre_labels)
            sn = c_mat[1, 1] / np.sum(c_mat[1, :])
            sp = c_mat[0, 0] / np.sum(c_mat[0, :])
            sps.append(sp)
            sns.append(sn)
            end_time = time.time()
            eta_time = (end_time - start_time) * (num_train_epochs - step - 1)
            print("SE:", sn, " SP:", sp, " ACC:", acc, " MCC:", mcc, " auROC:",
                  auc, " eta time:", eta_time, "s")

        if do_save_model:
            save_path = saver.save(sess, FLAGS.save_path)
Пример #9
0
# Build the prediction graph once at module load and restore the trained
# weights so the request handler below can reuse them.
global graph
graph = tf.get_default_graph()
with graph.as_default():
    print("going to restore checkpoint")
    is_training = False
    use_one_hot_embeddings = False
    batch_size = 1
    num_labels = len(label_list)
    input_ids_p = tf.placeholder(tf.int32, [batch_size, None],
                                 name="input_ids")
    input_mask_p = tf.placeholder(tf.int32, [batch_size, None],
                                  name="input_mask")
    label_ids_p = tf.placeholder(tf.int32, [batch_size], name="label_ids")
    # NOTE(review): segment_ids is rank-1 here while the other inputs are
    # rank-2 — confirm this matches create_model's expectations.
    segment_ids_p = tf.placeholder(tf.int32, [None], name="segment_ids")
    total_loss, per_example_loss, logits, probabilities = create_model(
        bert_config, is_training, input_ids_p, input_mask_p, segment_ids_p,
        label_ids_p, num_labels, use_one_hot_embeddings)
    saver = tf.train.Saver()
    saver.restore(
        sess,
        tf.train.latest_checkpoint(
            os.path.join(input_model_checkpoint, 'model_output')))
    graph_def = tf.get_default_graph().as_graph_def()

    # 1
    # variables + saved_model.pb
    # NOTE(review): this listing appears truncated/corrupted from here on —
    # the simple_save(...) call below is never closed and code from a
    # different snippet follows, so this fragment is not valid Python.
    # Recover the original source before running.
    tf.saved_model.simple_save(sess,
                               export_model_dir,
                               inputs={
                                   "input_ids": input_ids_p,
                                   "input_mask": input_mask_p,
    #sess.run(tf.global_variables_initializer())  #max_seq_length
    input_ids_p = tf.placeholder(tf.int32, [1, config.max_seq_length],
                                 name="input_ids")
    input_mask_p = tf.placeholder(tf.int32, [1, config.max_seq_length],
                                  name="input_mask")

    bert_config = modeling.BertConfig.from_json_file(
        os.path.join(bert_dir, 'bert_config.json'))
    # #def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
    #                  labels, num_labels, use_one_hot_embeddings):

    (total_loss, per_example_loss, logits,
     probabilities) = create_model(bert_config=bert_config,
                                   is_training=False,
                                   input_ids=input_ids_p,
                                   input_mask=input_mask_p,
                                   segment_ids=None,
                                   labels=None,
                                   num_labels=len(labels),
                                   use_one_hot_embeddings=False)

    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(model_dir))

tokenizer = tokenization.FullTokenizer(vocab_file=os.path.join(
    bert_dir, 'vocab.txt'),
                                       do_lower_case=True)


@app.route('/class_predict_service', methods=['GET', 'POST'])
def class_predict_service():