# Example #1 (score: 0)
def predict(args):
    """Decode the dataset at ``args.data_path`` with a trained SRL tagger
    and write the predicted label sequences to a file.

    :param args: parsed command-line arguments. Fields used: ``data_path``,
        ``model_dir``, ``model_name``, ``vocab_path``, ``model_params``,
        ``decoding_params``, ``device_list``, ``emb_path``, ``viterbi``,
        ``verbose`` and ``output_name``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Base hyper-parameters; model and decoding defaults are merged in
    # below, then overridden by the command-line parameter strings.
    params = tf.contrib.training.HParams(
        data_path=args.data_path,
        model_dir=args.model_dir,
        model_name=args.model_name,
        vocab_path=args.vocab_path,
        model_params=args.model_params,
        device_list=args.device_list or [0],
        allow_growth=True
    )

    mparams = get_model_params(args.model_name)
    params = merge_params(params, mparams)
    params.parse(args.model_params)
    dparams = decoding_params()
    params = merge_params(params, dparams)
    params.parse(args.decoding_params)

    vocabulary = get_vocabulary(params.vocab_path)
    params.add_hparam("vocabulary", vocabulary)

    if args.emb_path:
        # BUGFIX: was ``args.emb_path.find("glove") > 0``, which silently
        # skipped the GloVe loader for paths that *start* with "glove"
        # (str.find returns 0 there, failing the ``> 0`` test).
        if "glove" in args.emb_path:
            emb = load_glove_embedding(args.emb_path, None)
        else:
            emb = np.loadtxt(args.emb_path).astype("float32")
    else:
        emb = None

    params.add_hparam("embedding", emb)

    config = tf.contrib.learn.RunConfig(
        model_dir=params.model_dir,
        session_config=session_config(params),
    )

    # create estimator
    estimator = tf.contrib.learn.Estimator(
        model_fn=srl_model,
        model_dir=params.model_dir,
        config=config,
        params=params
    )

    decodes = []
    # Inputs are sorted (presumably by length, for efficient batching);
    # sorted_keys is used at the end to restore the original order.
    sorted_inputs, sorted_keys, num_batches, input_fn = get_sorted_input_fn(
        params.data_path,
        params.vocabulary["inputs"],
        params.decode_batch_size * len(params.device_list),
        params
    )

    # Inverse vocabularies: id -> token, for inputs and target labels.
    ivocab = {"inputs": {}, "targets": {}}
    labels = []

    for k, idx in vocabulary["inputs"].items():
        ivocab["inputs"][idx] = k

    for k, idx in vocabulary["targets"].items():
        ivocab["targets"][idx] = k

    # Label list ordered by id; feeds the CRF transition matrix.
    for idx in range(len(ivocab["targets"])):
        labels.append(ivocab["targets"][idx])

    tparams = get_transition_params(labels)

    for i in range(num_batches):
        # ``input_fn`` is an iterator of batch-feeding callables; the
        # Python-2 style ``.next`` attribute is kept intact on purpose.
        result_iter = estimator.predict(input_fn=input_fn.next,
                                        as_iterable=True)

        for result in result_iter:
            inputs = result["inputs"]
            outputs = result["outputs"]
            dist = result["distribution"]
            input_text = []
            output_text = []

            index = 0

            if args.viterbi:
                # Measure the unpadded length (id 0 is padding), truncate
                # the distribution and replace greedy outputs with the
                # CRF Viterbi path.
                seq_len = 0
                while index < len(inputs) and inputs[index] != 0:
                    seq_len += 1
                    index += 1
                dist = dist[:seq_len, :]
                outputs, _ = tf.contrib.crf.viterbi_decode(dist, tparams)

            index = 0

            # Map ids back to tokens, stopping at the first padding id.
            while index < len(inputs) and inputs[index] != 0:
                input_text.append(ivocab["inputs"][inputs[index]])
                output_text.append(ivocab["targets"][outputs[index]])
                index += 1

            # decode to plain text
            input_text = " ".join(input_text)
            output_text = " ".join(output_text)

            if args.verbose:
                sys.stdout.write("INPUT: %s\n" % input_text)
                sys.stdout.write("OUTPUT: %s\n" % output_text)

            decodes.append(output_text)

    # Undo the length-sorting: reverse, then re-index through sorted_keys
    # to recover the original file order.
    sorted_inputs.reverse()
    decodes.reverse()

    outputs = []

    for index in range(len(sorted_inputs)):
        outputs.append(decodes[sorted_keys[index]])

    if not args.output_name:
        base_filename = os.path.basename(params.data_path)
        decode_filename = base_filename + "." + params.model_name + ".decodes"
    else:
        decode_filename = args.output_name

    outfile = tf.gfile.Open(decode_filename, "w")

    for output in outputs:
        outfile.write("%s\n" % output)

    outfile.close()
# Example #2 (score: 0)
def ensemble(args):
    """Average the predicted label distributions of several checkpoints of
    the same model architecture and write the decoded output to a file.

    :param args: parsed command-line arguments. Fields used: ``data_path``,
        ``model_name``, ``vocab_path`` (one vocabulary per checkpoint plus
        one extra), ``checkpoints``, ``model_params``, ``decoding_params``,
        ``device_list``, ``emb_path``, ``viterbi`` and ``output_name``.
    :raises ValueError: if the number of vocabularies does not equal the
        number of checkpoints plus one.
    """
    if len(args.vocab_path) != len(args.checkpoints) + 1:
        raise ValueError("Unmatched vocabulary number and checkpoint number")

    # override parameters
    params = tf.contrib.training.HParams(
        data_path=args.data_path,
        model_name=args.model_name,
        vocab_path=args.vocab_path,
        model_params=args.model_params,
        device_list=args.device_list or [0],
        allow_growth=True
    )

    mparams = get_model_params(args.model_name)
    params = merge_params(params, mparams)
    params.parse(args.model_params)
    dparams = decoding_params()
    params = merge_params(params, dparams)
    params.parse(args.decoding_params)

    if args.emb_path:
        # BUGFIX: was ``args.emb_path.find("glove") > 0``, which silently
        # skipped the GloVe loader for paths that *start* with "glove"
        # (str.find returns 0 there, failing the ``> 0`` test).
        if "glove" in args.emb_path:
            emb = load_glove_embedding(args.emb_path, None)
        else:
            emb = np.loadtxt(args.emb_path).astype("float32")
    else:
        emb = None

    vocabularies = get_ensemble_vocabulary(params.vocab_path)

    model_var_lists = []
    model_params_list = []

    # One shallow copy of the hyper-parameters per ensembled model.
    for i in range(len(args.checkpoints)):
        cparams = copy.copy(params)
        cparams.add_hparam("embedding", emb)
        cparams.add_hparam("vocabulary", vocabularies[i])
        model_params_list.append(cparams)

    # load checkpoints: keep only tagger variables, drop running-average
    # loss bookkeeping.
    for checkpoint in args.checkpoints:
        var_list = tf.train.list_variables(checkpoint)
        values = {}
        reader = tf.train.load_checkpoint(checkpoint)

        for (name, shape) in var_list:
            if not name.startswith("tagger"):
                continue

            if name.find("losses_avg") >= 0:
                continue

            tensor = reader.get_tensor(name)
            values[name] = tensor

        model_var_lists.append(values)

    # build graph: one shared set of placeholders feeds every sub-model.
    inputs = tf.placeholder(tf.int32, [None, None], "inputs")
    preds = tf.placeholder(tf.int32, [None, None], "preds")
    embedding = tf.placeholder(tf.float32, [None, None, None], "embedding")
    mask = tf.placeholder(tf.float32, [None, None], "mask")

    features = {"inputs": inputs, "preds": preds}

    if emb is not None:
        features["embedding"] = embedding
        features["mask"] = mask

    predictions = []

    # Each model lives in its own variable scope so checkpoint variables
    # can be restored per-model below.
    for i in range(len(args.checkpoints)):
        with tf.variable_scope("tagger_%d" % i):
            model_fn = get_tagger_model(params.model_name,
                                        tf.contrib.learn.ModeKeys.INFER)
            outputs, probs = model_fn(features, model_params_list[i])
            predictions.append(probs)

    # Inverse target vocabulary (id -> label); all models are assumed to
    # share the target label set of vocabularies[0].
    labels = []
    ivocab = {}

    for k, idx in vocabularies[0]["targets"].items():
        ivocab[idx] = k

    for idx in range(len(ivocab)):
        labels.append(ivocab[idx])

    tparams = get_transition_params(labels)

    # create session
    with tf.Session(config=session_config(params)) as sess:
        tf.global_variables_initializer().run()

        # restore variables
        all_var_list = tf.trainable_variables()

        for i in range(len(args.checkpoints)):
            uninit_var_list = []

            for v in all_var_list:
                if v.name.startswith("tagger_%d" % i):
                    uninit_var_list.append(v)

            set_variables(uninit_var_list, model_var_lists[i], "tagger_%d" % i)

        # create input_fn (one per model, since vocabularies may differ)
        all_sorted_inputs = []
        all_sorted_keys = []
        all_input_fns = []

        for i in range(len(args.checkpoints)):
            sorted_inputs, sorted_keys, num_batches, fn = get_sorted_input_fn(
                params.data_path,
                model_params_list[i].vocabulary["inputs"],
                params.decode_batch_size * len(params.device_list),
                model_params_list[i]
            )
            all_sorted_inputs.append(sorted_inputs)
            all_sorted_keys.append(sorted_keys)
            all_input_fns.append(fn)

        # Per-model list of per-batch probability arrays.
        decodes = []

        for i, input_fn in enumerate(all_input_fns):
            outputs = []
            for features in input_fn:
                feed_dict = {
                    inputs: features["inputs"],
                    preds: features["preds"]
                }

                if args.emb_path:
                    feed_dict[embedding] = features["embedding"]
                    feed_dict[mask] = features["mask"]

                output = sess.run(predictions[i], feed_dict=feed_dict)

                outputs.append(output)

            decodes.append(outputs)

        # ensemble: transpose to per-batch tuples and average the models'
        # probability distributions.
        batched_probs = list(zip(*decodes))
        probs = []

        for item in batched_probs:
            outputs = sum(item) / float(len(item))
            # [batch, max_len, num_label]
            probs.append(outputs)

        count = 0
        # BUGFIX: the original appended decoded text into the very list it
        # had rebuilt from batch tuples (``decodes``), then reversed and
        # indexed that mixed list; it only worked because the texts landed
        # at the front after reverse(). Texts now get their own list.
        text_decodes = []

        for item in probs:
            for dist in item:
                # Sequence length from the raw input line; the first field
                # is stripped (presumably a predicate marker — confirm
                # against the data format).
                inputs = all_sorted_inputs[0][count]
                seq_len = len(inputs.strip().split()[1:])
                output_text = []

                if args.viterbi:
                    dist = dist[:seq_len, :]
                    outputs, _ = tf.contrib.crf.viterbi_decode(dist,
                                                               tparams)
                else:
                    dist = dist[:seq_len, :]
                    outputs = np.argmax(dist, axis=1)

                index = 0

                while index < seq_len:
                    output_text.append(ivocab[outputs[index]])
                    index += 1

                # decode to plain text
                output_text = " ".join(output_text)
                text_decodes.append(output_text)
                count += 1

        # NOTE(review): ``sorted_inputs``/``sorted_keys`` are leaked from
        # the last iteration of the per-checkpoint loop above (i.e. the
        # last model's ordering). All models read the same data_path, so
        # the orderings presumably coincide — verify if vocabularies can
        # change the sort.
        sorted_inputs.reverse()
        text_decodes.reverse()

        outputs = []

        for index in range(len(sorted_inputs)):
            outputs.append(text_decodes[sorted_keys[index]])

        if not args.output_name:
            base_filename = os.path.basename(params.data_path)
            model_name = params.model_name
            decode_filename = base_filename + "." + model_name + ".decodes"
        else:
            decode_filename = args.output_name

        outfile = tf.gfile.Open(decode_filename, "w")

        for output in outputs:
            outfile.write("%s\n" % output)

        outfile.close()
# Example #3 (score: 0)
def train(args):
    """Train an SRL tagger with ``tf.contrib.learn``, optionally running a
    periodic evaluation and an external validation script.

    :param args: parsed command-line arguments; forwarded to
        ``get_params`` and also read directly for ``emb_path``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    params = get_params(args)
    vocabulary = get_vocabulary(params.vocab_path)
    params.add_hparam("vocabulary", vocabulary)

    if args.emb_path:
        # BUGFIX: was ``args.emb_path.find("glove") > 0``, which silently
        # skipped the GloVe loader for paths that *start* with "glove"
        # (str.find returns 0 there, failing the ``> 0`` test).
        if "glove" in args.emb_path:
            emb = load_glove_embedding(args.emb_path,
                                       params.vocabulary["inputs"])
        else:
            emb = np.loadtxt(args.emb_path).astype("float32")
    else:
        emb = None

    params.add_hparam("embedding", emb)

    config = tf.contrib.learn.RunConfig(
        model_dir=params.model_dir,
        session_config=session_config(params),
        keep_checkpoint_max=params.keep_checkpoint_max,
        save_checkpoints_secs=300
    )

    # model_fn: (features, labels, mode, params, conifg) => EstimatorSpec
    # input_fn:  () => (features, labels)

    # create estimator
    estimator = tf.contrib.learn.Estimator(
        model_fn=srl_model,
        model_dir=params.model_dir,
        config=config,
        params=params
    )

    # create input_fn (files are matched by glob pattern on data_path)
    train_input_fn = get_input_fn(
        params.data_path + "*train*",
        tf.contrib.learn.ModeKeys.TRAIN,
        params
    )

    # Only evaluate when a dev set actually exists next to the train set.
    if tf.gfile.Glob(params.data_path + "*dev*"):
        eval_input_fn = get_input_fn(
            params.data_path + "*dev*", tf.contrib.learn.ModeKeys.EVAL, params
        )
    else:
        eval_input_fn = None

    # create experiment
    experiment = tf.contrib.learn.Experiment(
        estimator=estimator,
        eval_metrics=create_tagger_evaluation_metrics(),
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=params.train_steps,
        eval_steps=params.eval_steps,
        min_eval_frequency=params.min_eval_frequency
    )

    # Optional external validation runs in a daemon subprocess so it
    # cannot outlive training.
    if params.script:
        process = multiprocessing.Process(target=validate, args=[params])
        process.daemon = True
        process.start()
    else:
        process = None

    # start training; always tear down the validation process on exit.
    try:
        if eval_input_fn:
            experiment.train_and_evaluate()
        else:
            experiment.train()
    finally:
        if process is not None:
            process.terminate()