def predict(args):
    tf.logging.set_verbosity(tf.logging.INFO)

    params = tf.contrib.training.HParams(
        data_path=args.data_path,
        model_dir=args.model_dir,
        model_name=args.model_name,
        vocab_path=args.vocab_path,
        model_params=args.model_params,
        device_list=args.device_list or [0],
        allow_growth=True
    )

    mparams = get_model_params(args.model_name)
    params = merge_params(params, mparams)
    params.parse(args.model_params)

    dparams = decoding_params()
    params = merge_params(params, dparams)
    params.parse(args.decoding_params)

    vocabulary = get_vocabulary(params.vocab_path)
    params.add_hparam("vocabulary", vocabulary)

    if args.emb_path:
        if args.emb_path.find("glove") > 0:
            emb = load_glove_embedding(args.emb_path, None)
        else:
            emb = np.loadtxt(args.emb_path).astype("float32")
    else:
        emb = None

    params.add_hparam("embedding", emb)

    config = tf.contrib.learn.RunConfig(
        model_dir=params.model_dir,
        session_config=session_config(params),
    )

    # create estimator
    estimator = tf.contrib.learn.Estimator(
        model_fn=srl_model,
        model_dir=params.model_dir,
        config=config,
        params=params
    )

    decodes = []
    sorted_inputs, sorted_keys, num_batches, input_fn = get_sorted_input_fn(
        params.data_path,
        params.vocabulary["inputs"],
        params.decode_batch_size * len(params.device_list),
        params
    )

    ivocab = {"inputs": {}, "targets": {}}
    labels = []

    for k, idx in vocabulary["inputs"].items():
        ivocab["inputs"][idx] = k

    for k, idx in vocabulary["targets"].items():
        ivocab["targets"][idx] = k

    for idx in range(len(ivocab["targets"])):
        labels.append(ivocab["targets"][idx])

    tparams = get_transition_params(labels)

    for i in range(num_batches):
        result_iter = estimator.predict(input_fn=input_fn.next,
                                        as_iterable=True)

        for result in result_iter:
            inputs = result["inputs"]
            outputs = result["outputs"]
            dist = result["distribution"]
            input_text = []
            output_text = []
            index = 0

            if args.viterbi:
                seq_len = 0

                while index < len(inputs) and inputs[index] != 0:
                    seq_len += 1
                    index += 1

                dist = dist[:seq_len, :]
                outputs, _ = tf.contrib.crf.viterbi_decode(dist, tparams)
                index = 0

            while index < len(inputs) and inputs[index] != 0:
                input_text.append(ivocab["inputs"][inputs[index]])
                output_text.append(ivocab["targets"][outputs[index]])
                index += 1

            # decode to plain text
            input_text = " ".join(input_text)
            output_text = " ".join(output_text)

            if args.verbose:
                sys.stdout.write("INPUT: %s\n" % input_text)
                sys.stdout.write("OUTPUT: %s\n" % output_text)

            decodes.append(output_text)

    sorted_inputs.reverse()
    decodes.reverse()
    outputs = []

    for index in range(len(sorted_inputs)):
        outputs.append(decodes[sorted_keys[index]])

    if not args.output_name:
        base_filename = os.path.basename(params.data_path)
        decode_filename = base_filename + "." + params.model_name + ".decodes"
    else:
        decode_filename = args.output_name

    outfile = tf.gfile.Open(decode_filename, "w")

    for output in outputs:
        outfile.write("%s\n" % output)

    outfile.close()
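
# The viterbi branch in predict() relies on tf.contrib.crf.viterbi_decode,
# which runs on plain numpy arrays outside of any session. The helper below is
# a minimal, self-contained sketch of that call on toy data; the scores and
# transition matrix are illustrative only and not taken from this model.
def _viterbi_decode_example():
    """Sketch of the CRF decoding step used in predict() and ensemble()."""
    # unary scores for a 3-step sequence over 2 labels: [seq_len, num_labels]
    score = np.array([[2.0, 0.5],
                      [0.3, 1.5],
                      [1.0, 0.2]], dtype="float32")
    # pairwise transition scores between labels: [num_labels, num_labels]
    transition_params = np.array([[0.5, -0.5],
                                  [-0.5, 0.5]], dtype="float32")
    best_path, best_score = tf.contrib.crf.viterbi_decode(score,
                                                          transition_params)
    # best_path is a list of label indices, e.g. [0, 1, 0]
    return best_path, best_score
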
def ensemble(args):
    if len(args.vocab_path) != len(args.checkpoints) + 1:
        raise ValueError("Unmatched vocabulary number and checkpoint number")

    # override parameters
    params = tf.contrib.training.HParams(
        data_path=args.data_path,
        model_name=args.model_name,
        vocab_path=args.vocab_path,
        model_params=args.model_params,
        device_list=args.device_list or [0],
        allow_growth=True
    )

    mparams = get_model_params(args.model_name)
    params = merge_params(params, mparams)
    params.parse(args.model_params)

    dparams = decoding_params()
    params = merge_params(params, dparams)
    params.parse(args.decoding_params)

    if args.emb_path:
        if args.emb_path.find("glove") > 0:
            emb = load_glove_embedding(args.emb_path, None)
        else:
            emb = np.loadtxt(args.emb_path).astype("float32")
    else:
        emb = None

    vocabularies = get_ensemble_vocabulary(params.vocab_path)

    model_var_lists = []
    model_params_list = []

    for i in range(len(args.checkpoints)):
        cparams = copy.copy(params)
        cparams.add_hparam("embedding", emb)
        cparams.add_hparam("vocabulary", vocabularies[i])
        model_params_list.append(cparams)

    # load checkpoints
    for checkpoint in args.checkpoints:
        var_list = tf.train.list_variables(checkpoint)
        values = {}
        reader = tf.train.load_checkpoint(checkpoint)

        for (name, shape) in var_list:
            if not name.startswith("tagger"):
                continue

            if name.find("losses_avg") >= 0:
                continue

            tensor = reader.get_tensor(name)
            values[name] = tensor

        model_var_lists.append(values)

    # build graph
    inputs = tf.placeholder(tf.int32, [None, None], "inputs")
    preds = tf.placeholder(tf.int32, [None, None], "preds")
    embedding = tf.placeholder(tf.float32, [None, None, None], "embedding")
    mask = tf.placeholder(tf.float32, [None, None], "mask")
    features = {"inputs": inputs, "preds": preds}

    if emb is not None:
        features["embedding"] = embedding
        features["mask"] = mask

    predictions = []

    for i in range(len(args.checkpoints)):
        with tf.variable_scope("tagger_%d" % i):
            model_fn = get_tagger_model(params.model_name,
                                        tf.contrib.learn.ModeKeys.INFER)
            outputs, probs = model_fn(features, model_params_list[i])
            predictions.append(probs)

    labels = []
    ivocab = {}

    for k, idx in vocabularies[0]["targets"].items():
        ivocab[idx] = k

    for idx in range(len(ivocab)):
        labels.append(ivocab[idx])

    tparams = get_transition_params(labels)

    # create session
    with tf.Session(config=session_config(params)) as sess:
        tf.global_variables_initializer().run()

        # restore variables
        all_var_list = tf.trainable_variables()

        for i in range(len(args.checkpoints)):
            uninit_var_list = []

            for v in all_var_list:
                if v.name.startswith("tagger_%d" % i):
                    uninit_var_list.append(v)

            set_variables(uninit_var_list, model_var_lists[i],
                          "tagger_%d" % i)

        # create input_fn
        all_sorted_inputs = []
        all_sorted_keys = []
        all_input_fns = []

        for i in range(len(args.checkpoints)):
            sorted_inputs, sorted_keys, num_batches, fn = get_sorted_input_fn(
                params.data_path,
                model_params_list[i].vocabulary["inputs"],
                params.decode_batch_size * len(params.device_list),
                model_params_list[i]
            )
            all_sorted_inputs.append(sorted_inputs)
            all_sorted_keys.append(sorted_keys)
            all_input_fns.append(fn)

        decodes = []

        for i, input_fn in enumerate(all_input_fns):
            outputs = []

            for features in input_fn:
                feed_dict = {
                    inputs: features["inputs"],
                    preds: features["preds"]
                }

                if args.emb_path:
                    feed_dict[embedding] = features["embedding"]
                    feed_dict[mask] = features["mask"]

                output = sess.run(predictions[i], feed_dict=feed_dict)
                outputs.append(output)

            decodes.append(outputs)

        # ensemble
        decodes = list(zip(*decodes))
        probs = []

        for item in decodes:
            # averaged distribution: [batch, max_len, num_label]
            outputs = sum(item) / float(len(item))
            probs.append(outputs)

        count = 0

        for item in probs:
            for dist in item:
                inputs = all_sorted_inputs[0][count]
                seq_len = len(inputs.strip().split()[1:])
                output_text = []

                if args.viterbi:
                    dist = dist[:seq_len, :]
                    outputs, _ = tf.contrib.crf.viterbi_decode(dist, tparams)
                else:
                    dist = dist[:seq_len, :]
                    outputs = np.argmax(dist, axis=1)

                index = 0

                while index < seq_len:
                    output_text.append(ivocab[outputs[index]])
                    index += 1

                # decode to plain text
                output_text = " ".join(output_text)
                decodes.append(output_text)
                count += 1

        sorted_inputs.reverse()
        decodes.reverse()
        outputs = []

        for index in range(len(sorted_inputs)):
            outputs.append(decodes[sorted_keys[index]])

        if not args.output_name:
            base_filename = os.path.basename(params.data_path)
            model_name = params.model_name
            decode_filename = base_filename + "." + model_name + ".decodes"
        else:
            decode_filename = args.output_name

        outfile = tf.gfile.Open(decode_filename, "w")

        for output in outputs:
            outfile.write("%s\n" % output)

        outfile.close()
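
# The averaging step in ensemble() simply sums the per-model label
# distributions for a batch and divides by the number of models before
# decoding. The helper below is an illustrative sketch of that step on toy
# numpy arrays (two hypothetical models, one sentence, two time steps, three
# labels); it is not part of the original pipeline.
def _ensemble_average_example():
    """Sketch of the probability averaging and greedy decoding in ensemble()."""
    model_a = np.array([[[0.7, 0.2, 0.1],
                         [0.1, 0.8, 0.1]]], dtype="float32")
    model_b = np.array([[[0.5, 0.3, 0.2],
                         [0.2, 0.6, 0.2]]], dtype="float32")
    item = (model_a, model_b)
    # average over models: [batch, max_len, num_label]
    averaged = sum(item) / float(len(item))
    # greedy decode for the first sentence; the viterbi branch would instead
    # pass these averaged scores to tf.contrib.crf.viterbi_decode
    labels = np.argmax(averaged[0], axis=1)
    return labels
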
def train(args):
    tf.logging.set_verbosity(tf.logging.INFO)

    params = get_params(args)
    vocabulary = get_vocabulary(params.vocab_path)
    params.add_hparam("vocabulary", vocabulary)

    if args.emb_path:
        if args.emb_path.find("glove") > 0:
            emb = load_glove_embedding(args.emb_path,
                                       params.vocabulary["inputs"])
        else:
            emb = np.loadtxt(args.emb_path).astype("float32")
    else:
        emb = None

    params.add_hparam("embedding", emb)

    config = tf.contrib.learn.RunConfig(
        model_dir=params.model_dir,
        session_config=session_config(params),
        keep_checkpoint_max=params.keep_checkpoint_max,
        save_checkpoints_secs=300
    )

    # model_fn: (features, labels, mode, params, config) => EstimatorSpec
    # input_fn: () => (features, labels)

    # create estimator
    estimator = tf.contrib.learn.Estimator(
        model_fn=srl_model,
        model_dir=params.model_dir,
        config=config,
        params=params
    )

    # create input_fn
    train_input_fn = get_input_fn(
        params.data_path + "*train*",
        tf.contrib.learn.ModeKeys.TRAIN,
        params
    )

    if tf.gfile.Glob(params.data_path + "*dev*"):
        eval_input_fn = get_input_fn(
            params.data_path + "*dev*",
            tf.contrib.learn.ModeKeys.EVAL,
            params
        )
    else:
        eval_input_fn = None

    # create experiment
    experiment = tf.contrib.learn.Experiment(
        estimator=estimator,
        eval_metrics=create_tagger_evaluation_metrics(),
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=params.train_steps,
        eval_steps=params.eval_steps,
        min_eval_frequency=params.min_eval_frequency
    )

    if params.script:
        process = multiprocessing.Process(target=validate, args=[params])
        process.daemon = True
        process.start()
    else:
        process = None

    # start training
    try:
        if eval_input_fn:
            experiment.train_and_evaluate()
        else:
            experiment.train()
    finally:
        if process is not None:
            process.terminate()
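
# A hypothetical entry point wiring the three functions above to a command
# line. The flag names mirror the attributes read from `args` in train(),
# predict() and ensemble(); the project's real argument parser may differ, so
# treat this purely as an illustrative sketch.
def _example_main():
    import argparse

    parser = argparse.ArgumentParser(description="SRL tagger (sketch)")
    parser.add_argument("action", choices=["train", "predict", "ensemble"])
    parser.add_argument("--data_path", type=str, default="")
    parser.add_argument("--model_dir", type=str, default="train")
    parser.add_argument("--model_name", type=str, default="")
    # ensemble() expects one vocabulary per checkpoint plus one extra entry
    parser.add_argument("--vocab_path", type=str, nargs="+", default=None)
    parser.add_argument("--model_params", type=str, default="")
    parser.add_argument("--decoding_params", type=str, default="")
    parser.add_argument("--device_list", type=int, nargs="+", default=None)
    parser.add_argument("--emb_path", type=str, default=None)
    parser.add_argument("--checkpoints", type=str, nargs="+", default=None)
    parser.add_argument("--output_name", type=str, default=None)
    parser.add_argument("--viterbi", action="store_true")
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    if args.action == "train":
        train(args)
    elif args.action == "predict":
        predict(args)
    else:
        ensemble(args)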