def _build_subtoken_vocab(all_data):
    print('build subtoken vocab')

    def data_generator():
        for task_data in all_data:
            train_data, test_data = task_data
            for d in train_data + test_data:
                yield ' '.join(d.sentence)
        for d in fudan.load_unlabeled_data():
            yield d

    def summary(vocab):
        lens = [len(vocab.encode(sentence)) for sentence in data_generator()]
        length = sorted(lens)
        length = np.asarray(length)

        max_len = np.max(length)
        avg_len = np.mean(length)
        med_len = np.median(length)
        print('max_len: {}, avg_len: {}, med_len: {}'.format(
            max_len, avg_len, med_len))

    encoder = SubwordTextEncoder()
    vocab_size = 2 ** 10 * FLAGS.vocab_size
    vocab = encoder.build_from_generator(data_generator(), vocab_size, 200,
                                         reserved_tokens=RESERVED_TOKENS)

    vocab_file = get_vocab_file()
    base = os.path.dirname(vocab_file)
    tf.gfile.MakeDirs(base)
    vocab.store_to_file(vocab_file)

    summary(vocab)
    return vocab
def main(args):
    subword_encoder = SubwordTextEncoder(args.vocab_file)
    record_iterator = tf.python_io.tf_record_iterator(path=args.path)
    record_basename = os.path.basename(args.path)
    with tf.python_io.TFRecordWriter(
            os.path.join(args.outdir, record_basename)) as writer:
        for string_record in record_iterator:
            example = tf.train.Example()
            example.ParseFromString(string_record)

            inputs = dict(example.features.feature)['inputs']
            inputs = inputs.int64_list.value
            inputs_as_int64list = inputs
            inputs = subword_encoder.decode(inputs)

            targets = dict(example.features.feature)['targets']
            targets = targets.int64_list.value
            targets_as_int64_list = targets
            targets = subword_encoder.decode(targets)

            weights = get_weights(inputs, targets, args.weight)
            example_proto = serialize_example(
                inputs_as_int64list, targets_as_int64_list, weights)
            writer.write(example_proto)
def gen(path_zh, path_ru):
    random.seed(hash(path_ru) % 1000)
    tokenizer = SubwordTextEncoder(
        '../data/vocab.translate_zhru_full.47000.subwords')
    with open(path_zh, 'r') as fzh, open(path_ru, 'r') as fru:
        sample = {"inputs": [], "targets": []}
        for line_zh, line_ru in zip(fzh, fru):
            ids_zh = tokenizer.encode(line_zh.rstrip() + ' ')
            ids_ru = tokenizer.encode(line_ru.rstrip() + ' ')
            # Keep packing sentence pairs into the current sample while both
            # sides stay under MAX_LEN (packing continues with probability 0.5);
            # otherwise truncate, append EOS and emit the sample.
            if 0 == len(sample["inputs"]) or (
                    len(sample["inputs"]) + 1 + len(ids_zh) <= MAX_LEN and
                    len(sample["targets"]) + 1 + len(ids_ru) <= MAX_LEN and
                    random.random() < 0.5):
                sample["inputs"].extend(ids_zh)
                sample["targets"].extend(ids_ru)
            else:
                sample["inputs"] = sample["inputs"][:MAX_LEN - 1] + [EOS_ID]
                sample["targets"] = sample["targets"][:MAX_LEN - 1] + [EOS_ID]
                yield sample.copy()
                sample["inputs"] = ids_zh
                sample["targets"] = ids_ru
        # Flush the last partially filled sample.
        if sample["inputs"]:
            sample["inputs"] = sample["inputs"][:MAX_LEN - 1] + [EOS_ID]
            sample["targets"] = sample["targets"][:MAX_LEN - 1] + [EOS_ID]
            yield sample.copy()
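# Hedged usage sketch (not from the original source): a small sanity check
# that consumes gen() as defined above and reports how many packed samples a
# file pair yields and how long the longest one is. The function name and
# the printed format are illustrative; MAX_LEN is assumed to be defined at
# module level as in the generator itself.
def summarize_packed_samples(path_zh, path_ru):
    n_samples = 0
    longest_inputs = 0
    for sample in gen(path_zh, path_ru):
        n_samples += 1
        longest_inputs = max(longest_inputs, len(sample["inputs"]))
    print('packed samples: %d, longest inputs: %d ids (cap %d)'
          % (n_samples, longest_inputs, MAX_LEN))
    return n_samples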
def LoadorCreateVocabulary(vocab_file, dataset, vocab_size):
    try:
        subtokenizer = SubwordTextEncoder(vocab_file)
        print('Loaded existing vocabulary')
    except:
        print('Building vocabulary')
        subtokenizer = SubwordTextEncoder.build_from_generator(
            dataset, vocab_size)
        subtokenizer.store_to_file(vocab_file)
    print('Vocab File path: ', vocab_file)
    return subtokenizer
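# Hedged usage sketch (not part of the original snippet): builds or loads a
# subword vocabulary from an in-memory corpus. The file name
# 'example.subwords', the sentences and the 2**15 target size are
# illustrative assumptions; build_from_generator, encode and decode are the
# standard SubwordTextEncoder methods already used above.
sentences = ['a small example corpus', 'another example sentence']
subtokenizer = LoadorCreateVocabulary('example.subwords', iter(sentences), 2**15)
ids = subtokenizer.encode('another example')
print(ids, '->', subtokenizer.decode(ids))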
def build(self, token_counts, vocab_filepath):
    target_size = self._approx_vocab_size

    # Searching the minimum max_size
    max_size = self.__INITIAL_MAX_SIZE
    while True:
        max_size, success = self._run_max_size_attempt(max_size, token_counts)
        if success:
            break

    min_size = 1 if max_size == self.__INITIAL_MAX_SIZE else int(max_size / 2)

    # Generating Vocabulary file
    tf.logging.info("Generating vocab file: %s (min = %d, max = %d)"
                    % (vocab_filepath, min_size, max_size))
    encoder = SubwordTextEncoder.build_to_target_size(
        target_size, token_counts, min_size, max_size,
        reserved_tokens=self._reserved_tokens)

    if vocab_filepath is not None:
        encoder.store_to_file(vocab_filepath)

    return ModernMTSubwordTextEncoder(vocab_filepath)
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
    """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
    vocab_filepath = os.path.join(tmp_dir, vocab_filename)
    if os.path.exists(vocab_filepath):
        vocab = SubwordTextEncoder(vocab_filepath)
        return vocab

    tokenizer = Tokenizer()
    for source in _DATA_FILE_URLS:
        url = source[0]
        filename = os.path.basename(url)
        read_type = "r:gz" if "tgz" in filename else "r"

        compressed_file = maybe_download(tmp_dir, filename, url)
        with tarfile.open(compressed_file, read_type) as corpus_tar:
            corpus_tar.extractall(tmp_dir)

        for lang_file in source[1]:
            tf.logging.info("Reading file: %s" % lang_file)
            filepath = os.path.join(tmp_dir, lang_file)

            # For some datasets a second extraction is necessary.
            if ".gz" in lang_file:
                new_filepath = os.path.join(tmp_dir, lang_file[:-3])
                if os.path.exists(new_filepath):
                    tf.logging.info(
                        "Subdirectory %s already exists, skipping unpacking"
                        % filepath)
                else:
                    tf.logging.info("Unpacking subdirectory %s" % filepath)
                    gunzip_file(filepath, new_filepath)
                filepath = new_filepath

            # Use Tokenizer to count the word occurrences.
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                file_byte_budget = 3.5e5 if "en" in filepath else 7e5
                for line in source_file:
                    if file_byte_budget <= 0:
                        break
                    line = line.strip()
                    file_byte_budget -= len(line)
                    _ = tokenizer.encode(line)

    vocab = SubwordTextEncoder.build_to_target_size(
        vocab_size, tokenizer.token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
    """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
    vocab_filepath = os.path.join(tmp_dir, vocab_filename)
    if os.path.exists(vocab_filepath):
        vocab = SubwordTextEncoder(vocab_filepath)
        return vocab

    tokenizer = Tokenizer()
    for source in _DATA_FILE_URLS:
        for lang_file in source[1]:
            tf.logging.info("Reading file: %s" % lang_file)
            filepath = os.path.join(tmp_dir, lang_file)

            # Use Tokenizer to count the word occurrences.
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                for line in source_file:
                    line = line.strip()
                    _ = tokenizer.encode(line)

    vocab = SubwordTextEncoder.build_to_target_size(
        vocab_size, tokenizer.token_counts, vocab_filepath, 1, 1e3)
    return vocab
def _build_from_token_counts(args):
    token_counts, max_size, iterations, vocab_filepath, reserved_tokens = args

    encoder = SubwordTextEncoder()
    encoder.build_from_token_counts(token_counts, max_size,
                                    num_iterations=iterations)

    if vocab_filepath is not None:
        encoder.store_to_file(vocab_filepath)

    return max_size, encoder.vocab_size
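# Hedged sketch (an assumption, not from the original source): the single
# tuple argument suggests _build_from_token_counts is meant to be mapped over
# a process pool, e.g. to probe several max_size thresholds at once. The
# helper below and its name are illustrative; it passes None for
# vocab_filepath and reserved_tokens so nothing is written to disk.
from multiprocessing import Pool

def probe_vocab_sizes(token_counts, candidate_max_sizes, iterations=4):
    jobs = [(token_counts, max_size, iterations, None, None)
            for max_size in candidate_max_sizes]
    with Pool(processes=len(jobs)) as pool:
        for max_size, vocab_size in pool.map(_build_from_token_counts, jobs):
            print('max_size=%d -> vocab_size=%d' % (max_size, vocab_size))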
def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
    """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
    vocab_filepath = os.path.join(tmp_dir, vocab_filename)
    if os.path.exists(vocab_filepath):
        vocab = SubwordTextEncoder(vocab_filepath)
        return vocab

    tokenizer = Tokenizer()
    for source in _DATA_FILE_URLS:
        url = source[0]
        filename = os.path.basename(url)
        read_type = "r:gz" if "tgz" in filename else "r"

        compressed_file = maybe_download(tmp_dir, filename, url)
        with tarfile.open(compressed_file, read_type) as corpus_tar:
            corpus_tar.extractall(tmp_dir)

        for lang_file in source[1]:
            tf.logging.info("Reading file: %s" % lang_file)
            filepath = os.path.join(tmp_dir, lang_file)

            # For some datasets a second extraction is necessary.
            if ".gz" in lang_file:
                tf.logging.info("Unpacking subdirectory %s" % filepath)
                new_filepath = os.path.join(tmp_dir, lang_file[:-3])
                gunzip_file(filepath, new_filepath)
                filepath = new_filepath

            # Use Tokenizer to count the word occurrences.
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                file_byte_budget = 3.5e5 if "en" in filepath else 7e5
                for line in source_file:
                    if file_byte_budget <= 0:
                        break
                    line = line.strip()
                    file_byte_budget -= len(line)
                    _ = tokenizer.encode(line)

    vocab = SubwordTextEncoder.build_to_target_size(
        vocab_size, tokenizer.token_counts, vocab_filepath, 1, 1e3)
    return vocab
def __init__(self, word_embed, all_data, adv, is_train):
    # input data
    # self.all_data = all_data
    self.is_train = is_train
    self.adv = adv

    # embedding initialization
    if word_embed is not None:
        self.word_dim = word_embed.shape[1]
        self.vocab_size = word_embed.shape[0]
        w_trainable = True if self.word_dim == 50 else False
        shape = None
    else:
        encoder = SubwordTextEncoder(get_vocab_file())
        self.word_dim = FLAGS.hidden_size
        self.vocab_size = encoder.vocab_size
        word_embed = tf.random_normal_initializer(0.0, self.word_dim**-0.5)
        w_trainable = True
        shape = [self.vocab_size, self.word_dim]

    self.word_embed = tf.get_variable('word_embed',
                                      initializer=word_embed,
                                      shape=shape,
                                      dtype=tf.float32,
                                      trainable=w_trainable)

    with tf.variable_scope("shared"):
        self.shared_conv = _get_model()
        self.shared_linear = tf.keras.layers.Dense(
            TASK_NUM, activation=None, name='leaner_shared')

    self.tensors = []
    self.pred = {}
    self.separate_acc = {}
    self.metric_tensors = []
    self.data = {}
    self.alignments = {}

    for task_name, data in all_data:
        with tf.variable_scope(task_name):
            self.build_task_graph(data, task_name)
def decode(s, array=False):
    encoder = SubwordTextEncoder(get_vocab_file())
    if array:
        return encoder.decode_list(s)
    return encoder.decode(s)
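# Hedged usage sketch: get_vocab_file() comes from the surrounding project and
# the sample string is illustrative; encode, decode and decode_list are the
# standard SubwordTextEncoder methods.
encoder = SubwordTextEncoder(get_vocab_file())
ids = encoder.encode('a short example sentence')
print(decode(ids))              # reconstructed string
print(decode(ids, array=True))  # list of subtoken strings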
class Vocab:
    def __init__(self, model_config, vocab_path=None, lower=False):
        self.model_config = model_config
        self.vocab_path = vocab_path

        if 'bert_token' in self.model_config.bert_mode:
            self.i2w = [w.strip() for w in open(self.vocab_path)]
            self.w2i = dict(zip(self.i2w, range(len(self.i2w))))
            self.bert_tokenizer = WordpieceTokenizer(
                vocab=self.i2w, unk_token=constant.SYMBOL_UNK)
            print('Populate BERT word piece vocab with size %s'
                  % self.vocab_size())
        elif self.model_config.subword_vocab_size <= 0:
            self.init_vocab()
            if vocab_path is not None:
                self.populate_vocab()
        else:
            if vocab_path is not None:
                self.populate_subword_vocab()

    def populate_subword_vocab(self):
        self.subword = SubwordTextEncoder(self.vocab_path)
        print('Subword Vocab Populated with size %d for path %s.'
              % (len(self.subword._all_subtoken_strings), self.vocab_path))

    def init_vocab(self):
        self.w2i = {}
        self.i2w = []
        self.w2i[constant.SYMBOL_GO] = 0
        self.i2w.append(constant.SYMBOL_GO)
        self.w2i[constant.SYMBOL_PAD] = 1
        self.i2w.append(constant.SYMBOL_PAD)
        self.w2i[constant.SYMBOL_UNK] = 2
        self.i2w.append(constant.SYMBOL_UNK)
        self.w2i[constant.SYMBOL_START] = 3
        self.i2w.append(constant.SYMBOL_START)
        self.w2i[constant.SYMBOL_END] = 4
        self.i2w.append(constant.SYMBOL_END)

        # Fill the remaining reserved slots with placeholder tokens.
        unk_id = 0
        for voc_id in range(len(self.i2w), constant.REVERED_VOCAB_SIZE):
            self.w2i['#unk%s#' % unk_id] = voc_id
            self.i2w.append('#unk%s#' % unk_id)
            unk_id += 1

    def populate_vocab(self, mincount=-1, topcount=50000):
        mincount = max(mincount, self.model_config.min_count)
        topcount = min(topcount, self.model_config.top_count)
        lid = 0
        for line in open(self.vocab_path):
            items = line.strip().split('\t')
            w = items[0]
            if len(items) > 1:
                cnt = int(items[1])
            else:
                # Accept all words when no count is given.
                cnt = 99999
            if cnt >= mincount:
                self.w2i[w] = len(self.i2w)
                self.i2w.append(w)
                lid += 1
                if lid >= topcount:
                    break
        print('Vocab Populated with size %d including %d reserved vocab for path %s.'
              % (len(self.i2w), constant.REVERED_VOCAB_SIZE, self.vocab_path))

    def encode(self, w):
        if 'bert_token' in self.model_config.bert_mode:
            return [self.w2i[piece] for piece in self.bert_tokenizer.tokenize(w)]
        elif self.model_config.subword_vocab_size <= 0:
            if w in self.w2i:
                return self.w2i[w]
            else:
                return self.w2i[constant.SYMBOL_UNK]
        else:
            return self.subword.encode(w)

    def contain(self, w):
        return w in self.w2i

    def describe(self, i):
        if 'bert_token' in self.model_config.bert_mode:
            return bert_utils.merge_tokens([self.i2w[ie] for ie in i])
        elif self.model_config.subword_vocab_size <= 0:
            if i < len(self.i2w):
                return self.i2w[i]
        else:
            # Note: in the subword case, i should be a list of ids.
            return self.subword.decode(i)

    def vocab_size(self):
        if (self.model_config.subword_vocab_size <= 0
                or 'bert_token' in self.model_config.bert_mode):
            return len(self.i2w)
        else:
            return len(self.subword._all_subtoken_strings)

    @staticmethod
    def process_word(word, model_config):
        if word:
            if model_config.lower_case:
                word = word.lower()
            word = data_parse(word)
            return word
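# Hedged usage sketch for the Vocab class above. The model_config object is
# faked with SimpleNamespace, and the attribute values (bert_mode,
# subword_vocab_size, etc.) and the vocab path are illustrative assumptions;
# they simply steer Vocab into its subword branch.
from types import SimpleNamespace

config = SimpleNamespace(bert_mode='', subword_vocab_size=30000,
                         min_count=0, top_count=50000, lower_case=True)
vocab = Vocab(config, vocab_path='vocab.subwords')  # loads a subword vocab file
ids = vocab.encode('an example sentence')
print(vocab.vocab_size(), vocab.describe(ids))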