def check_vocab(vocab_file, out_dir, check_special_token=True, sos=None,
                eos=None, unk=None):
  """Check that vocab_file exists and starts with the special tokens.

  If the vocab does not begin with [unk, sos, eos], a corrected copy with
  those tokens prepended is written to out_dir and used instead.

  Args:
    vocab_file: path to the vocabulary file (one token per line).
    out_dir: directory a corrected vocab file may be written into.
    check_special_token: if True, verify/prepend the special tokens.
    sos: start-of-sentence token; falls back to SOS when falsy.
    eos: end-of-sentence token; falls back to EOS when falsy.
    unk: unknown token; falls back to UNK when falsy.

  Returns:
    A (vocab_size, vocab_file) tuple; vocab_file may point to the newly
    written, corrected file under out_dir.

  Raises:
    ValueError: if vocab_file does not exist, or (when check_special_token
      is set) the vocab has fewer than 3 entries.
  """
  # Guard clause instead of a trailing else so the happy path reads flat.
  if not tf.gfile.Exists(vocab_file):
    raise ValueError("vocab_file '%s' does not exist." % vocab_file)

  utils.print_out("# Vocab file %s exists" % vocab_file)
  # The size returned by load_vocab was dead: it is recomputed from the
  # (possibly extended) vocab just before returning.
  vocab, _ = load_vocab(vocab_file)
  if check_special_token:
    # Verify the vocab starts with unk, sos, eos.
    # If not, prepend those tokens & generate a new vocab file.
    if not unk:
      unk = UNK
    if not sos:
      sos = SOS
    if not eos:
      eos = EOS
    # Raise instead of assert: asserts are stripped under python -O.
    if len(vocab) < 3:
      raise ValueError(
          "vocab_file '%s' must contain at least 3 tokens." % vocab_file)
    if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos:
      utils.print_out("The first 3 vocab words [%s, %s, %s]"
                      " are not [%s, %s, %s]" %
                      (vocab[0], vocab[1], vocab[2], unk, sos, eos))
      vocab = [unk, sos, eos] + vocab
      new_vocab_file = os.path.join(out_dir, os.path.basename(vocab_file))
      with codecs.getwriter("utf-8")(
          tf.gfile.GFile(new_vocab_file, "wb")) as f:
        for word in vocab:
          f.write("%s\n" % word)
      vocab_file = new_vocab_file

  vocab_size = len(vocab)
  return vocab_size, vocab_file
def load_embed_txt(embed_file):
  """Load embed_file into a python dictionary.

  Note: the embed_file should be a Glove/word2vec formatted txt file. Here is
  an example assuming embed_size=5:

  the -0.071549 0.093459 0.023738 -0.090339 0.056123
  to 0.57346 0.5417 -0.23477 -0.3624 0.4037
  and 0.20327 0.47348 0.050877 0.002103 0.060547

  For word2vec format, the first line will be: <num_words> <emb_size>.

  Args:
    embed_file: file path to the embedding file.

  Returns:
    a dictionary that maps word to vector, and the size of embedding
    dimensions.
  """
  emb_dict = {}
  emb_size = None
  with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f:
    for line_num, line in enumerate(f):
      tokens = line.rstrip().split(" ")
      # A two-token first line is the word2vec header: <num_words> <emb_size>.
      if line_num == 0 and len(tokens) == 2:
        emb_size = int(tokens[1])
        continue
      word = tokens[0]
      vec = [float(t) for t in tokens[1:]]
      emb_dict[word] = vec
      if not emb_size:
        # First data row fixes the embedding size when no header was given.
        emb_size = len(vec)
      elif emb_size != len(vec):
        # Drop rows whose dimensionality disagrees with the established size.
        utils.print_out(
            "Ignoring %s since embeding size is inconsistent." % word)
        del emb_dict[word]
  return emb_dict, emb_size
def decode_and_evaluate(name,
                        model,
                        sess,
                        trans_file,
                        ref_file,
                        metrics,
                        subword_option,
                        beam_width,
                        tgt_eos,
                        num_translations_per_input=1,
                        decode=True,
                        infer_mode="greedy",
                        index_pair=None):
  """Decode a test set and compute a score according to the evaluation task.

  Args:
    name: label used in metric logging (e.g. "dev" or "test").
    model: model whose decode(sess) yields (nmt_outputs, _) until the dataset
      raises tf.errors.OutOfRangeError.
    sess: TensorFlow session to run decoding in.
    trans_file: path the decoded translations are written to (utf-8).
    ref_file: reference file; metrics are computed only if it exists.
    metrics: iterable of metric names for evaluation_utils.evaluate.
    subword_option: subword handling forwarded to translation/evaluation.
    beam_width: beam width; caps num_translations_per_input for beam search.
    tgt_eos: target end-of-sentence token forwarded to get_translation.
    num_translations_per_input: hypotheses kept per input (forced to 1 for
      greedy decoding).
    decode: if False, skip decoding and only evaluate an existing trans_file.
    infer_mode: "greedy", "beam_search", etc.; controls output handling.
    index_pair: optional list of indices controlling the order translations
      are written in; empty/None writes them in decode order.

  Returns:
    (evaluation_scores, end_time, num_sentences); end_time/num_sentences are
    None when decode is False.
  """
  # None sentinel instead of a mutable [] default (shared-default pitfall);
  # callers passing [] explicitly see identical behavior.
  if index_pair is None:
    index_pair = []

  # Decode
  end_time = None
  num_sentences = None
  if decode:
    utils.print_out(" decoding to output %s" % trans_file)

    start_time = time.time()
    num_sentences = 0
    with codecs.getwriter("utf-8")(
        tf.gfile.GFile(trans_file, mode="wb")) as trans_f:
      trans_f.write("")  # Write empty string to ensure file is created.

      if infer_mode == "greedy":
        num_translations_per_input = 1
      elif infer_mode == "beam_search":
        num_translations_per_input = min(num_translations_per_input,
                                         beam_width)
      translation = []
      while True:
        try:
          nmt_outputs, _ = model.decode(sess)
          if infer_mode != "beam_search":
            # Add a singleton beam axis so indexing matches beam search.
            nmt_outputs = np.expand_dims(nmt_outputs, 0)

          batch_size = nmt_outputs.shape[1]
          num_sentences += batch_size

          for sent_id in range(batch_size):
            for beam_id in range(num_translations_per_input):
              translation.append(
                  get_translation(nmt_outputs[beam_id],
                                  sent_id,
                                  tgt_eos=tgt_eos,
                                  subword_option=subword_option))
        except tf.errors.OutOfRangeError:
          end_time = time.time()
          utils.print_time(
              " done, num sentences %d, num translations per input %d" %
              (num_sentences, num_translations_per_input), start_time)
          break

      # Was "len(index_pair) is 0": identity comparison against an int
      # literal is implementation-defined (SyntaxWarning on CPython 3.8+).
      if not index_pair:
        for sentence in translation:
          trans_f.write((sentence + b"\n").decode("utf-8"))
      else:
        # NOTE(review): the double indirection translation[index_pair[i]]
        # while iterating the *values* of index_pair looks suspicious —
        # translation[i] may have been intended. Preserved as-is; confirm
        # against callers before changing.
        for i in index_pair:
          trans_f.write(
              (translation[index_pair[i]] + b"\n").decode("utf-8"))

  # Evaluation
  evaluation_scores = {}
  if ref_file and tf.gfile.Exists(trans_file):
    for metric in metrics:
      score = evaluation_utils.evaluate(
          ref_file,
          trans_file,
          metric,
          subword_option=subword_option)
      evaluation_scores[metric] = score
      utils.print_out(" %s %s: %.1f" % (metric, name, score))

  return evaluation_scores, end_time, num_sentences