import os
import time

import numpy as np
import tensorflow as tf

# Project-local dependencies; the exact import paths depend on the repository
# layout (e.g. `from config import Config`, `from data_utility import DataUtility`,
# `from ptb_model import PTBModel`).


class InputEngineSparse(object):

    def __init__(self, model_path, config_name):
        vocab_file_in_words = os.path.join(model_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(model_path, "vocab_in_letters")
        vocab_file_out = os.path.join(model_path, "vocab_out")
        config_file = os.path.join(model_path, config_name)
        config = Config()
        config.get_config(config_file)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            max_sentence_length=config.num_steps)
        self.sparsity = config.sparsity

        # Tensor names inside the imported (frozen) graph.
        prefix = "import/"
        self.top_k_name = prefix + "Online/Model/top_k:0"
        self.state_in_name = prefix + "Online/Model/state:0"
        self.input_name = prefix + "Online/Model/batched_input_word_ids:0"
        self.top_k_prediction_name = prefix + "Online/Model/top_k_prediction:1"
        self.output_name = prefix + "Online/Model/probabilities:0"
        self.state_out_name = prefix + "Online/Model/state_out:0"

        saved_model_path = os.path.join(
            model_path, 'sparse_graph-finetune-' + config_name + '.pb')
        with open(saved_model_path, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        """Feed a sentence (str) word by word and return the top-k next-word
        predictions with their probabilities."""
        sentence_ids, word_letters = self._data_utility.sentence2ids(sentence)
        if not sentence_ids:
            return []
        probabilities, top_k_predictions, state_out = None, None, None
        for i in range(len(sentence_ids)):
            feed_values = {
                self.input_name: [[sentence_ids[i]]],
                self.top_k_name: k
            }
            if i > 0:
                # Thread the RNN state from the previous step into this one.
                feed_values[self.state_in_name] = state_out
            # probabilities is an ndarray of shape (batch_size * time_step) x vocab_size.
            # For inference batch_size = num_steps = 1, so probabilities.shape = 1 x vocab_size.
            probabilities, top_k_predictions, state_out = self._sess.run(
                [self.output_name, self.top_k_prediction_name, self.state_out_name],
                feed_dict=feed_values)
        probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
        words_out = self._data_utility.ids2outwords(top_k_predictions[0])
        return [{'word': word, 'probability': float(probability)}
                if word != '<unk>' else
                {'word': '<' + word_letters + '>', 'probability': float(probability)}
                for word, probability in zip(words_out, probability_topk)]

    def predict_data(self, sentence):
        sentence = sentence.rstrip()
        inputs, words_num, letters_num = self._data_utility.data2ids_line(sentence)
        if inputs is None:
            return None
        words_out = []
        state_out = None
        for i in range(len(inputs)):
            feed_values = {self.input_name: [[inputs[i]]], self.top_k_name: 3}
            if i > 0:
                feed_values[self.state_in_name] = state_out
            probabilities, top_k_predictions, state_out = self._sess.run(
                [self.output_name, self.top_k_prediction_name, self.state_out_name],
                feed_dict=feed_values)
            words = self._data_utility.ids2outwords(top_k_predictions[0])
            words_out.append(words)
        out_str = str(words_out[words_num - 1:words_num + letters_num]
                      if words_num > 0
                      else [['', '', '']] + words_out[0:letters_num])
        return out_str

    def predict_file(self, test_file_in, test_file_out):
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        for sentence in testfilein:
            sentence = sentence.rstrip()
            out_str = self.predict_data(sentence)
            if out_str:
                print(sentence + " |#| " + out_str)
                testfileout.write(sentence + " |#| " + out_str + "\n")
            else:
                print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()

    def predict_data_probability(self, sentence):
        sentence = sentence.rstrip()
        inputs, words_num, letters_num = self._data_utility.data2ids_line(sentence)
        if inputs is None:
            return None
        words_out = []
        probability_out = []
        state_out = None
        for i in range(len(inputs)):
            feed_values = {self.input_name: [[inputs[i]]], self.top_k_name: 3}
            if i > 0:
                feed_values[self.state_in_name] = state_out
            probabilities, top_k_predictions, state_out = self._sess.run(
                [self.output_name, self.top_k_prediction_name, self.state_out_name],
                feed_dict=feed_values)
            top3 = top_k_predictions[0]
            probability_top3 = [probabilities[0][id] for id in top3]
            words = self._data_utility.ids2outwords(top3)
            words_out.append(words)
            probability_out.append(probability_top3)
        out_str = ''
        if words_num > 0:
            words_out_use = words_out[words_num - 1:words_num + letters_num]
            probability_out_use = probability_out[words_num - 1:words_num + letters_num]
            for words, probs in zip(words_out_use, probability_out_use):
                out_str_line = ''
                for word, probability in zip(words, probs):
                    out_str_line = out_str_line + " | " + word + ' # ' + '{:.8f}'.format(probability)
                out_str_line = out_str_line[3:-1]
                out_str = out_str + " || " + out_str_line
            out_str = out_str[4:-1]
        else:
            words_out_use = words_out[0:letters_num]
            probability_out_use = probability_out[0:letters_num]
            for words, probs in zip(words_out_use, probability_out_use):
                out_str_line = ''
                for word, probability in zip(words, probs):
                    out_str_line = out_str_line + " | " + word + ' # ' + '{:.8f}'.format(probability)
                out_str_line = out_str_line[3:-1]
                out_str = out_str + " || " + out_str_line
        return out_str

    def predict_file_probability(self, test_file_in, test_file_out):
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        for sentence in testfilein:
            sentence = sentence.rstrip()
            out_str = self.predict_data_probability(sentence)
            if out_str:
                print(sentence + " |#| " + out_str)
                testfileout.write(sentence + " |#| " + out_str + "\n")
            else:
                print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()
class InputEngineRnn:
    """Word + letter (key-code) RNN engine driven by a frozen graph, with an
    optional phrase-prediction head."""

    def __init__(self, graph_file, vocab_path, config_name, use_phrase=False):
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")
        self.use_phrase = use_phrase
        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            vocab_file_phrase=vocab_file_phrase)
        print("in words vocabulary size = %d\nout words vocabulary size = %d\n"
              "in letters vocabulary size = %d\nphrase vocabulary size = %d" %
              (self._config.vocab_size_in, self._config.vocab_size_out,
               self._config.vocab_size_letter, self._config.vocab_size_phrase))

        # Tensor names inside the imported (frozen) graph.
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction:1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities:0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction:1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities:0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase:0"
        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"
        self.max_test_line = 10000

        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(sentence)
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        words_out = list()
        probability_topk = list()
        phrase_logits = None
        probability_p_topk, phrase_p_top_k = None, None

        # Phase I: read the context words.
        if len(inputs) > 0:
            for i in range(len(inputs)):
                feed_values = {self.lm_input_name: [[inputs[i]]]}
                if i > 0:
                    # Use the previous step's final state as this step's initial state.
                    feed_values[self.lm_state_in_name] = lm_state_out
                if self.use_phrase:
                    lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                        [self.lm_state_out_name, self.phrase_p_name,
                         self.phrase_p_probability, self.phrase_logits],
                        feed_dict=feed_values)
                    phrase_p_top_k = [id for id in phrase_p_top_k[0]]
                    probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]
                else:
                    lm_state_out = self._sess.run([self.lm_state_out_name],
                                                  feed_dict=feed_values)[0]

        # Phase II: feed the letters of the current word one by one and predict.
        for i in range(len(inputs_key)):
            feed_values = {self.kc_input_name: [[inputs_key[i]]],
                           self.kc_top_k_name: k}
            if i == 0 and len(inputs) > 0:
                # Before any letter has been fed, seed the letter model with the
                # language model's final state.
                feed_values[self.kc_lm_state_in_name] = lm_state_out
            else:
                # Afterwards, carry the letter model's own state forward.
                feed_values[self.kc_state_in_name] = kc_state_out
            probabilities, top_k_predictions, kc_state_out = self._sess.run(
                [self.kc_output_name, self.kc_top_k_prediction_name,
                 self.kc_state_out_name],
                feed_dict=feed_values)
            probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
            words_out = self._data_utility.ids2outwords(top_k_predictions[0])
            # Phrase prediction: after the first letter, try to promote the best
            # phrase that starts with the current top word.
            if self.use_phrase:
                if i == 0 and len(inputs) > 0:
                    top_word = words_out[0]
                    top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)
                    if top_phrase[0] is not None:
                        is_phrase_p, phrase_p = self.calculate_phrase_p(
                            top_phrase, probability_p_topk, phrase_p_top_k)
                        words_out, probability_topk = self.final_words_out(
                            words_out, top_phrase, phrase_p, probability_topk)
        return [{'word': word, 'probability': float(probability)}
                if word != '<unk>' else
                {'word': '<' + word_letters + '>', 'probability': float(probability)}
                for word, probability in zip(words_out, probability_topk)
                ] if len(words_out) > 0 else []

    def predict_data(self, sentence, k):
        sentence = sentence.rstrip()
        res = self._data_utility.data2ids_line(sentence)
        if res is None:
            return None
        words_line, letters_line, words_ids, letters_ids, words_num, letters_num = res
        out_str_list = []
        probability_topk_list = []
        phrase_logits = None
        probability_p_topk, phrase_p_top_k = None, None
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        for i in range(len(words_ids)):
            words_out = []
            probs_out = []
            # Phase I: read the context word.
            feed_values = {self.lm_input_name: [[words_ids[i]]]}
            if i > 0:
                # Use the previous step's final state as this step's initial state.
                feed_values[self.lm_state_in_name] = lm_state_out
            if self.use_phrase:
                lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                    [self.lm_state_out_name, self.phrase_p_name,
                     self.phrase_p_probability, self.phrase_logits],
                    feed_dict=feed_values)
                phrase_p_top_k = [id for id in phrase_p_top_k[0]]
                probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]
            else:
                lm_state_out = self._sess.run([self.lm_state_out_name],
                                              feed_dict=feed_values)[0]
            if i == len(letters_ids):
                break
            # Phase II: feed the letters of this word one by one and predict.
            for j in range(len(letters_ids[i])):
                feed_values = {self.kc_input_name: [[letters_ids[i][j]]],
                               self.kc_top_k_name: k,
                               self.key_length: [1]}
                if j == 0 and len(words_ids) > 0:
                    feed_values[self.kc_lm_state_in_name] = lm_state_out
                else:
                    feed_values[self.kc_state_in_name] = kc_state_out
                probabilities, top_k_predictions, kc_state_out = self._sess.run(
                    [self.kc_output_name, self.kc_top_k_prediction_name,
                     self.kc_state_out_name],
                    feed_dict=feed_values)
                probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
                words = self._data_utility.ids2outwords(top_k_predictions[0])
                # Phrase prediction.
                if self.use_phrase:
                    if j == 0 and i > 0:
                        top_word = words[0]
                        top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)
                        if top_phrase[0] is not None:
                            is_phrase_p, phrase_p = self.calculate_phrase_p(
                                top_phrase, probability_p_topk, phrase_p_top_k)
                            words, probability_topk = self.final_words_out(
                                words, top_phrase, phrase_p, probability_topk)
                words_out.append(words)
                probs_out.append(probability_topk)
            out_str = words_out if i > 0 else [['', '', '']] + words_out[1:]
            out_str_list.append(out_str)
            probability_topk_list.append(probs_out)
        return words_line, letters_line, out_str_list, probability_topk_list

    def calculate_phrase_p(self, top_phrase, probability_p_topk, phrase_p_top_k):
        # Scale the phrase score by the probability that a phrase should be
        # emitted at all (class 1 of the phrase_p head).
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        phrase_p = is_phrase_p * top_phrase[1]
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        # Replace the first candidate whose probability the phrase score beats.
        for i in range(len(probability_topk)):
            if phrase_p >= probability_topk[i]:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        string = ""
        for (word, prob) in zip(out_string, out_prob):
            prob = str(prob) if word != "" else "0.0"
            string = string + word + ":" + prob + "|"
        string = string[:-1]
        return string

    def predict_file(self, test_file_in, test_file_out, k):
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        line_count = 0
        for sentence in testfilein:
            line_count += 1
            if line_count > self.max_test_line:
                break
            sentence = sentence.rstrip()
            result = self.predict_data(sentence, k)
            if result is not None:
                words_line, letters_line, out_words_list, out_prob_list = result
                for i in range(len(out_words_list)):
                    line = ("\t".join(words_line[:i]) + "|#|"
                            + " ".join(letters_line[i]) + "|#|"
                            + "\t".join(words_line[i:]) + "|#|"
                            + '\t'.join([self.result_print(out_words, out_prob)
                                         for (out_words, out_prob)
                                         in zip(out_words_list[i], out_prob_list[i])])
                            + "\n")
                    print(line)
                    testfileout.write(line)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()
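# A minimal usage sketch for this frozen-graph word+letter engine
# (hypothetical file names; graph.pb is the exported inference graph):
#
#   engine = InputEngineRnn("graph.pb", "/path/to/vocab", "config_lstm",
#                           use_phrase=True)
#   print(engine.predict("how are y", k=3))   # context words + typed letters
#   engine.predict_file("test_in.txt", "test_out.txt", k=3)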
class InputEngineRnn:
    """Word-level RNN engine that restores a PTBModel from a training
    checkpoint instead of loading a frozen graph."""

    def __init__(self, model_path, model_name, config_name, full_vocab_path=None):
        vocab_file_in_words = os.path.join(model_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(model_path, "vocab_in_letters")
        vocab_file_out = os.path.join(model_path, "vocab_out")
        model_file = os.path.join(model_path, model_name)
        config_file = os.path.join(model_path, config_name)
        self._config = Config()
        self._config.get_config(config_file)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            max_sentence_length=self._config.num_steps,
            full_vocab_file_in_words=full_vocab_path)
        self._config.batch_size = 1
        self._config.num_steps = 1
        with tf.Graph().as_default():
            with tf.variable_scope("Model"):
                self._language_model_test = PTBModel(
                    is_training=False, config=self._config, bucket=1)
            gpu_config = tf.ConfigProto()
            gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
            self._sess = tf.Session(config=gpu_config)
            with self._sess.as_default():
                # Do not restore the sparse weights from the pretraining phase.
                restore_variables = dict()
                for v in tf.trainable_variables():
                    if v.name.startswith("Model/Softmax/softmax_sp_trainable_weights") \
                            or v.name.startswith("Model/Embedding/embedding_sp_trainable_weights"):
                        continue
                    print("restore:", v.name)
                    restore_variables[v.name] = v
                saver = tf.train.Saver(restore_variables)
                saver.restore(self._sess, model_file)
        self._fetches = {
            "topk": self._language_model_test._top_k_prediction,
            "probability": self._language_model_test._probabilities,
            "final_state": self._language_model_test.final_state
        }

    def predict(self, sentence, k):
        state = self._sess.run(self._language_model_test.initial_state)
        inputs, word_letters = self._data_utility.sentence2ids(sentence)
        if not inputs:
            return []
        for i in range(len(inputs)):
            vals = self._sess.run(
                self._fetches,
                feed_dict={
                    self._language_model_test.initial_state: state,
                    self._language_model_test.input_data: [[inputs[i]]],
                    self._language_model_test.target_data: [[0]],
                    self._language_model_test.output_masks: [[0.0]],
                    self._language_model_test.top_k: k
                })
            state = vals["final_state"]
        topk = vals["topk"][0]
        probability = vals["probability"][0]
        probability_topk = [probability[id] for id in topk]
        words_out = self._data_utility.ids2outwords(topk)
        return [{'word': word, 'probability': float(probability)}
                if word != '<unk>' else
                {'word': '<' + word_letters + '>', 'probability': float(probability)}
                for word, probability in zip(words_out, probability_topk)]

    def predict_data(self, sentence):
        sentence = sentence.rstrip()
        state = self._sess.run(self._language_model_test.initial_state)
        inputs, words_num, letters_num = self._data_utility.data2ids_line(sentence)
        if inputs is None:
            return None
        words_out = []
        for i in range(len(inputs)):
            vals = self._sess.run(
                self._fetches,
                feed_dict={
                    self._language_model_test.initial_state: state,
                    self._language_model_test.input_data: [[inputs[i]]],
                    self._language_model_test.target_data: [[0]],
                    self._language_model_test.output_masks: [[0.0]],
                    self._language_model_test.top_k: 3
                })
            state = vals["final_state"]
            top3 = vals["topk"][0]
            words = self._data_utility.ids2outwords(top3)
            words_out.append(words)
        out_str = str(words_out[words_num - 1:words_num + letters_num]
                      if words_num > 0
                      else [['', '', '']] + words_out[0:letters_num])
        return out_str

    def predict_file(self, test_file_in, test_file_out):
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        for sentence in testfilein:
            sentence = sentence.rstrip()
            out_str = self.predict_data(sentence)
            if out_str:
                print(sentence + " |#| " + out_str)
                testfileout.write(sentence + " |#| " + out_str + "\n")
            else:
                print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()

    def predict_data_probability(self, sentence):
        sentence = sentence.rstrip()
        state = self._sess.run(self._language_model_test.initial_state)
        inputs, words_num, letters_num = self._data_utility.data2ids_line(sentence)
        if inputs is None:
            return None
        words_out = []
        probability_out = []
        for i in range(len(inputs)):
            vals = self._sess.run(
                self._fetches,
                feed_dict={
                    self._language_model_test.initial_state: state,
                    self._language_model_test.input_data: [[inputs[i]]],
                    self._language_model_test.target_data: [[0]],
                    self._language_model_test.output_masks: [[0.0]],
                    self._language_model_test.top_k: 3
                })
            state = vals["final_state"]
            top3 = vals["topk"][0]
            probability = vals["probability"][0]
            probability_top3 = [probability[id] for id in top3]
            words = self._data_utility.ids2outwords(top3)
            words_out.append(words)
            probability_out.append(probability_top3)
        out_str = ''
        if words_num > 0:
            words_out_use = words_out[words_num - 1:words_num + letters_num]
            probability_out_use = probability_out[words_num - 1:words_num + letters_num]
            for words, probs in zip(words_out_use, probability_out_use):
                out_str_line = ''
                for word, probability in zip(words, probs):
                    out_str_line = out_str_line + " | " + word + ' # ' + '{:.8f}'.format(probability)
                out_str_line = out_str_line[3:-1]
                out_str = out_str + " || " + out_str_line
            out_str = out_str[4:-1]
        else:
            words_out_use = words_out[0:letters_num]
            probability_out_use = probability_out[0:letters_num]
            for words, probs in zip(words_out_use, probability_out_use):
                out_str_line = ''
                for word, probability in zip(words, probs):
                    out_str_line = out_str_line + " | " + word + ' # ' + '{:.8f}'.format(probability)
                out_str_line = out_str_line[3:-1]
                out_str = out_str + " || " + out_str_line
        return out_str

    def predict_file_probability(self, test_file_in, test_file_out):
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        for sentence in testfilein:
            sentence = sentence.rstrip()
            out_str = self.predict_data_probability(sentence)
            if out_str:
                print(sentence + " |#| " + out_str)
                testfileout.write(sentence + " |#| " + out_str + "\n")
            else:
                print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()

    def save_model(self, out_path):
        tf.train.write_graph(self._sess.graph_def, out_path, "graph_rnn.pb", False)
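# A minimal usage sketch for the checkpoint-based engine (hypothetical names;
# model_name is the checkpoint prefix passed to Saver.restore). save_model()
# re-exports the restored graph as a .pb for the frozen-graph engines:
#
#   engine = InputEngineRnn("/path/to/model_dir", "model.ckpt", "config_lstm")
#   print(engine.predict("how are", k=3))
#   engine.save_model("/path/to/export_dir")   # writes graph_rnn.pb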
class InputEngineRnn:
    """Frozen-graph engine: predict() uses the word, letter, and phrase heads;
    predict_data() uses only the word and letter models."""

    def __init__(self, graph_file, vocab_path, config_name):
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")
        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            vocab_file_phrase=vocab_file_phrase)
        print("in words vocabulary size = %d\nout words vocabulary size = %d\n"
              "in letters vocabulary size = %d\nphrase vocabulary size = %d" %
              (self._config.vocab_size_in, self._config.vocab_size_out,
               self._config.vocab_size_letter, self._config.vocab_size_phrase))

        # Tensor names inside the imported (frozen) graph.
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"
        # self.lm_top_k_name = prefix + "Online/WordModel/top_k:0"
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction:1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities:0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction:1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities:0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase:0"
        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"

        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(sentence)
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        words_out = list()
        probability_topk = list()
        phrase_logits = None
        probability_p_topk, phrase_p_top_k = None, None
        # Phase I: read the context words and the phrase head's outputs.
        if len(inputs) > 0:
            for i in range(len(inputs)):
                feed_values = {self.lm_input_name: [[inputs[i]]]}
                if i > 0:
                    feed_values[self.lm_state_in_name] = lm_state_out
                lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                    [self.lm_state_out_name, self.phrase_p_name,
                     self.phrase_p_probability, self.phrase_logits],
                    feed_dict=feed_values)
                phrase_p_top_k = [id for id in phrase_p_top_k[0]]
                probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]
        # Phase II: feed the letters of the current word one by one.
        for i in range(len(inputs_key)):
            feed_values = {self.kc_input_name: [[inputs_key[i]]],
                           self.kc_top_k_name: k}
            if i == 0 and len(inputs) > 0:
                feed_values[self.kc_lm_state_in_name] = lm_state_out
            else:
                feed_values[self.kc_state_in_name] = kc_state_out
            probabilities, top_k_predictions, kc_state_out = self._sess.run(
                [self.kc_output_name, self.kc_top_k_prediction_name,
                 self.kc_state_out_name],
                feed_dict=feed_values)
            probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
            words_out = self._data_utility.ids2outwords(top_k_predictions[0])
            if i == 0 and len(inputs) > 0:
                # Try to promote the best phrase starting with the top word.
                top_word = words_out[0]
                top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)
                if top_phrase[0] is not None:
                    is_phrase_p, phrase_p = self.calculate_phrase_p(
                        top_phrase, probability_p_topk, phrase_p_top_k)
                    words_out, probability_topk = self.final_words_out(
                        words_out, top_phrase, phrase_p, probability_topk)
        return [{'word': word, 'probability': float(probability)}
                if word != '<unk>' else
                {'word': '<' + word_letters + '>', 'probability': float(probability)}
                for word, probability in zip(words_out, probability_topk)
                ] if len(words_out) > 0 else []

    def calculate_phrase_p(self, top_phrase, probability_p_topk, phrase_p_top_k):
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        phrase_p = is_phrase_p * top_phrase[1]
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        for i in range(len(probability_topk)):
            if phrase_p >= probability_topk[i]:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        string = ""
        for (word, prob) in zip(out_string, out_prob):
            prob = str(prob) if word != "" else "0.0"
            string = string + word + ":" + prob + "|"
        string = string[:-1]
        return string

    def predict_data(self, sentence, k):
        sentence = sentence.rstrip()
        # Context word ids, key-code ids of the word being typed, the number of
        # context words, and the number of letters in the word being typed.
        inputs, inputs_key, words_num, letters_num = self._data_utility.data2ids_line(sentence)
        words_out = []
        lm_state = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        if len(inputs) > 0:
            for i in range(len(inputs)):
                feed_values = {self.lm_input_name: [[inputs[i]]]}
                if i > 0:
                    feed_values[self.lm_state_in_name] = lm_state
                result = self._sess.run([self.lm_state_out_name], feed_dict=feed_values)
                lm_state = result[0]
        for i in range(len(inputs_key)):
            feed_values = {self.kc_input_name: [[inputs_key[i]]],
                           self.kc_top_k_name: k}
            if i > 0 or len(inputs) == 0:
                feed_values[self.kc_state_in_name] = kc_state
            else:
                feed_values[self.kc_lm_state_in_name] = lm_state
            # probabilities is an ndarray of shape (batch_size * time_step) x vocab_size;
            # for inference batch_size = num_steps = 1, so its shape is 1 x vocab_size.
            probabilities, top_k_predictions, kc_state = self._sess.run(
                [self.kc_output_name, self.kc_top_k_prediction_name,
                 self.kc_state_out_name],
                feed_dict=feed_values)
            probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
            words = self._data_utility.ids2outwords(top_k_predictions[0])
            words_out.append(words)
        out_str = str(words_out if words_num > 0
                      else [['', '', '']] + words_out[1:])
        return out_str

    def predict_file(self, test_file_in, test_file_out, k):
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        topk = k
        for sentence in testfilein:
            sentence = sentence.rstrip()
            sentence_in = sentence.lower()
            out_str = self.predict_data(sentence_in, topk)
            if out_str:
                print(sentence + " | " + out_str)
                testfileout.write(sentence + " | " + out_str + "\n")
            else:
                print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()
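# A minimal usage sketch for this phrase-enabled frozen-graph engine
# (hypothetical file names):
#
#   engine = InputEngineRnn("graph.pb", "/path/to/vocab", "config_lstm")
#   print(engine.predict("how are y", k=3))   # may promote a phrase candidate
#   engine.predict_file("test_in.txt", "test_out.txt", k=3)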
class InputEngineRnn:
    """Frozen-graph engine that predicts from the word-level language model
    only; the letter-model and phrase paths are disabled in this variant."""

    def __init__(self, graph_file, vocab_path, config_name):
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")
        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            vocab_file_phrase=vocab_file_phrase)
        print("in words vocabulary size = %d\nout words vocabulary size = %d\n"
              "in letters vocabulary size = %d\nphrase vocabulary size = %d" %
              (self._config.vocab_size_in, self._config.vocab_size_out,
               self._config.vocab_size_letter, self._config.vocab_size_phrase))

        # Tensor names inside the imported (frozen) graph.
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"
        self.lm_output_top_k_name = prefix + "Online/WordModel/top_k_prediction:1"
        self.lm_output_top_k_probability = prefix + "Online/WordModel/probabilities:0"
        self.lm_top_k_name = prefix + "Online/WordModel/top_k:0"
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction:1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities:0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction:1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities:0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase:0"
        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"

        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        # word_letters is the last, partially typed word of the sentence.
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(sentence)
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        words_out = list()
        lm_probability_topk = list()
        if len(inputs) > 0:
            # Loop over every word of the input sentence.
            for i in range(len(inputs)):
                # The extra level of list nesting supplies the batch dimension,
                # even though batch_size is 1.
                feed_values = {self.lm_input_name: [[inputs[i]]],
                               self.lm_top_k_name: k}
                if i > 0:
                    feed_values[self.lm_state_in_name] = lm_state_out
                lm_state_out, lm_prob, lm_top_k = self._sess.run(
                    [self.lm_state_out_name, self.lm_output_top_k_probability,
                     self.lm_output_top_k_name],
                    feed_dict=feed_values)
                lm_top_k = [id for id in lm_top_k[0]]
                lm_probability_topk = [lm_prob[0][id] for id in lm_top_k]
                words_out = self._data_utility.ids2outwords(lm_top_k)
        return [{'word': word, 'probability': float(probability)}
                if word != '<unk>' else
                {'word': '<' + word_letters + '>', 'probability': float(probability)}
                for word, probability in zip(words_out, lm_probability_topk)
                ] if len(words_out) > 0 else []

    def predict_data(self, sentence, k):
        sentence = sentence.rstrip()
        # Split an input line into words, letters, word ids, letter ids, the
        # word count, and the per-word letter counts.
        words_line, letters_line, words_ids, letters_ids, words_num, letters_num = \
            self._data_utility.data2ids_line(sentence)
        out_str_list = []
        probability_topk_list = []
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        for i in range(len(words_ids)):
            words_out = []
            probs_out = []
            feed_values = {self.lm_input_name: [[words_ids[i]]],
                           self.lm_top_k_name: k}
            if i > 0:
                feed_values[self.lm_state_in_name] = lm_state_out
            lm_state_out, lm_prob, lm_top_k = self._sess.run(
                [self.lm_state_out_name, self.lm_output_top_k_probability,
                 self.lm_output_top_k_name],
                feed_dict=feed_values)
            lm_top_k = [id for id in lm_top_k[0]]
            lm_probability_topk = [lm_prob[0][id] for id in lm_top_k]
            words = self._data_utility.ids2outwords(lm_top_k)
            if i == len(letters_ids):
                break
            # The letter model is disabled in this variant, so the word-level
            # prediction is repeated for every letter position of this word.
            for j in range(len(letters_ids[i])):
                words_out.append(words)
                probs_out.append(lm_probability_topk)
            out_str = words_out if i > 0 else [['', '', '']] + words_out[1:]
            out_str_list.append(out_str)
            probability_topk_list.append(probs_out)
        return words_line, letters_line, out_str_list, probability_topk_list

    def calculate_phrase_p(self, top_phrase, probability_p_topk, phrase_p_top_k):
        # Scales the phrase weight down: the phrase score is multiplied by the
        # probability that a phrase should be emitted at all.
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        phrase_p = is_phrase_p * top_phrase[1]
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        for i in range(len(probability_topk)):
            if phrase_p >= probability_topk[i]:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        string = ""
        for (word, prob) in zip(out_string, out_prob):
            prob = str(prob) if word != "" else "0.0"
            string = string + word + ":" + prob + "|"
        string = string[:-1]
        return string

    def predict_file(self, test_file_in, test_file_out, k):
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        line_count = 0
        for sentence in testfilein:
            print(line_count)  # progress indicator
            line_count += 1
            sentence = sentence.rstrip()
            result = self.predict_data(sentence, k)
            if result is not None:
                words_line, letters_line, out_words_list, out_prob_list = result
                for i in range(len(out_words_list)):
                    line = ("\t".join(words_line[:i]) + "|#|" + letters_line[i]
                            + "|#|" + "\t".join(words_line[i:]) + "|#|"
                            + '\t'.join([self.result_print(out_words, out_prob)
                                         for (out_words, out_prob)
                                         in zip(out_words_list[i], out_prob_list[i])])
                            + "\n")
                    print(line)
                    testfileout.write(line)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()
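# A minimal usage sketch for this word-model-only variant (hypothetical file
# names):
#
#   engine = InputEngineRnn("graph.pb", "/path/to/vocab", "config_lstm")
#   print(engine.predict("how are you", k=3))
#   engine.predict_file("test_in.txt", "test_out.txt", k=3)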