import tensorflow as tf

# `tokenizer` and `Subtokenizer` are the Transformer subtokenizer utilities
# (e.g. the official model's utils/tokenizer.py); the exact import path
# depends on the surrounding project layout.


def __init__(self, inputs_file, reference_file, vocab_file):
  with tf.io.gfile.GFile(inputs_file) as f:
    records = f.read().split("\n")
    inputs = [record.strip() for record in records]
    if not inputs[-1]:
      inputs.pop()

  # Reference translations, kept for later scoring (e.g. BLEU).
  self.ref_lines = tokenizer.native_to_unicode(
      tf.io.gfile.GFile(reference_file).read()).strip().splitlines()

  subtokenizer = Subtokenizer(vocab_file)
  self.batch = []
  token_lens = []
  for i, line in enumerate(inputs):
    enc = subtokenizer.encode(line, add_eos=True)
    token_lens.append((i, len(enc)))

  # Sort input indices by decreasing subtoken count.
  sorted_by_token_input_lens = sorted(
      token_lens, key=lambda x: x[1], reverse=True)

  sorted_inputs = [None] * len(sorted_by_token_input_lens)
  sorted_keys = [0] * len(sorted_by_token_input_lens)
  lines = []
  for i, (index, _) in enumerate(sorted_by_token_input_lens):
    sorted_inputs[i] = inputs[index]
    sorted_keys[index] = i
    enc = subtokenizer.encode(sorted_inputs[i], add_eos=True)
    lines.append([enc])

  # Re-emit the encoded lines in the original file order so that
  # self.batch stays aligned with self.ref_lines.
  for i in sorted_keys:
    self.batch.append(lines[i])
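# Hypothetical usage sketch: the class name `TranslationDataset` below is an
# assumption (the source only shows the __init__ body). Because self.batch is
# rebuilt in the original file order, it stays aligned index-for-index with
# self.ref_lines, so model outputs can be compared against references directly.
#
#   dataset = TranslationDataset(inputs_file="newstest.en",
#                                reference_file="newstest.de",
#                                vocab_file="vocab.ende.32768")
#   for encoded_batch, reference in zip(dataset.batch, dataset.ref_lines):
#     # encoded_batch == [token_ids], a single-example batch for the model;
#     # `reference` is the matching reference translation.
#     ...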
def input_generator_ws():
  """Read lines from FLAGS.file and sort them by decreasing word count.

  Inputs are read from FLAGS.file and encoded with the vocabulary in
  FLAGS.vocab_file.

  Returns:
    Sorted list of encoded inputs, and a list mapping each element's
    original index -> sorted index.
  """
  with tf.io.gfile.GFile(FLAGS.file) as f:
    records = f.read().split("\n")
    inputs = [record.strip() for record in records]
    if not inputs[-1]:
      inputs.pop()

  batch = []
  subtokenizer = Subtokenizer(FLAGS.vocab_file)

  # Sort input indices by decreasing whitespace-separated word count.
  input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
  sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)

  sorted_inputs = [None] * len(sorted_input_lens)
  sorted_keys = [0] * len(sorted_input_lens)
  for i, (index, _) in enumerate(sorted_input_lens):
    sorted_inputs[i] = inputs[index]
    sorted_keys[index] = i
    enc = subtokenizer.encode(sorted_inputs[i], add_eos=True)
    batch.append(enc)
  return batch, sorted_keys
def input_generator_ts(file_path, vocab_file):
  """Read lines from the file and sort them by decreasing subtoken count.

  Args:
    file_path: String path of the file to read inputs from.
    vocab_file: String path of the vocab file.

  Returns:
    Sorted list of encoded inputs, and a list mapping each element's
    original index -> sorted index.
  """
  with tf.io.gfile.GFile(file_path) as f:
    records = f.read().split("\n")
    inputs = [record.strip() for record in records]
    if not inputs[-1]:
      inputs.pop()

  subtokenizer = Subtokenizer(vocab_file)
  batch = []
  token_lens = []
  for i, line in enumerate(inputs):
    enc = subtokenizer.encode(line, add_eos=True)
    token_lens.append((i, len(enc)))

  # Sort input indices by decreasing subtoken count.
  sorted_by_token_input_lens = sorted(
      token_lens, key=lambda x: x[1], reverse=True)

  sorted_inputs = [None] * len(sorted_by_token_input_lens)
  sorted_keys = [0] * len(sorted_by_token_input_lens)
  for i, (index, _) in enumerate(sorted_by_token_input_lens):
    sorted_inputs[i] = inputs[index]
    sorted_keys[index] = i
    enc = subtokenizer.encode(sorted_inputs[i], add_eos=True)
    batch.append(enc)
  return batch, sorted_keys
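# A minimal, self-contained sketch (plain Python, no TF required) of how the
# returned sorted_keys list is meant to be used: after running inference on
# the length-sorted batch, outputs are looked up through sorted_keys to
# restore the original file order. The helper name and toy data below are
# illustrative assumptions, not taken from the source.
def _restore_original_order(outputs, sorted_keys):
  # outputs[j] corresponds to the j-th entry of the sorted batch;
  # sorted_keys[i] is the sorted position of original line i.
  return [outputs[sorted_keys[i]] for i in range(len(sorted_keys))]

# Example: three "translations" produced in sorted (longest-first) order.
# With sorted_keys == [2, 0, 1], original line 0 sits at sorted position 2,
# line 1 at position 0, and line 2 at position 1.
assert _restore_original_order(["b", "c", "a"], [2, 0, 1]) == ["a", "b", "c"]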