# Module-level imports assumed by the Corpus variants below; Vocab,
# create_ordered_tfrecords, get_bin_sizes, and _preprocess are defined
# elsewhere in the same module.
import json
import multiprocessing as mp
import os
from functools import partial
from glob import glob

import numpy as np


class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        self.vocab.count_file(os.path.join(path, "train.txt"))
        self.vocab.build_vocab()

        self.train = self.vocab.encode_file(
            os.path.join(path, "train.txt"), ordered=True)
        self.valid = self.vocab.encode_file(
            os.path.join(path, "valid.txt"), ordered=True)
        self.test = self.vocab.encode_file(
            os.path.join(path, "test.txt"), ordered=True)

        vocab_len = len(self.vocab)
        self.cutoffs = [0, int(vocab_len * 0.1), int(vocab_len * 0.2),
                        int(vocab_len * 0.4)] + [vocab_len]
        # self.cutoffs = []

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len,
                             num_core_per_host, **kwargs):
        file_names = []
        record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
            split, bsz, tgt_len)
        record_info_path = os.path.join(save_dir, record_name)

        data = getattr(self, split)
        file_name, num_batch = create_ordered_tfrecords(
            save_dir, split, data, bsz, tgt_len)
        file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {
                "filenames": file_names,
                "num_batch": num_batch,
            }
            json.dump(record_info, fp)
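For orientation, a minimal driver sketch for the variant above. The data
directory, dataset name, and batch parameters are placeholders, not values
from the original code.

# Hypothetical driver; path, dataset name, and sizes are placeholders.
corpus = Corpus("/path/to/data", "my_dataset")  # extra args go to Vocab
for split in ("train", "valid", "test"):
    corpus.convert_to_tfrecords(split, save_dir="/path/to/records",
                                bsz=32, tgt_len=128, num_core_per_host=1)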
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        # Update the Counter inside the vocab object (tallies how many
        # times each distinct token appears).
        self.vocab.count_file(os.path.join(path, "train.txt"))
        # Same as above, but over the validation set.
        self.vocab.count_file(os.path.join(path, "valid.txt"))
        # Build idx2sym and sym2idx: map tokens to indices and indices
        # back to tokens.
        self.vocab.build_vocab()

        self.train = self.vocab.encode_file(
            os.path.join(path, "train.txt"), ordered=True)
        self.valid = self.vocab.encode_file(
            os.path.join(path, "valid.txt"), ordered=True)
        # self.cutoffs = []
        # Completely redundant: I suspected the cutoffs were unnecessary
        # from the first day of reading this code, and after losing a whole
        # day to them I can now confirm that, without a TPU, all
        # cutoff-related code is dead weight.

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len, **kwargs):
        file_names = []
        record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
            split, bsz, tgt_len)
        record_info_path = os.path.join(save_dir, record_name)

        bin_sizes = None
        file_name, num_batch = create_ordered_tfrecords(
            save_dir, split, getattr(self, split), bsz, tgt_len)
        file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {
                "filenames": file_names,
                "bin_sizes": bin_sizes,
                "num_batch": num_batch,
            }
            json.dump(record_info, fp)
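On the consumer side, the record-info file written above can be read back
with the same naming scheme. A small sketch; the helper name is ours, not
from the original module.

import json
import os

def load_record_info(save_dir, split, bsz, tgt_len):
    # Hypothetical helper mirroring the naming scheme used above; returns
    # a dict with "filenames", "bin_sizes" (None here), and "num_batch".
    record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
        split, bsz, tgt_len)
    with open(os.path.join(save_dir, record_name)) as fp:
        return json.load(fp)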
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        if self.dataset == "generic_dataset":
            encode_kwargs = dict(
                add_eos=kwargs.pop('add_eos', False),
                add_double_eos=kwargs.pop('add_double_eos', False),
                ordered=True,
                verbose=True,
            )
            if kwargs.get('vocab_file') is not None:
                kwargs['vocab_file'] = os.path.join(path, kwargs['vocab_file'])
            print(self.dataset, 'vocab params', kwargs)
        self.vocab = Vocab(*args, **kwargs)

        if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "generic_dataset" and not self.vocab.vocab_file:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "wt103":
            self.vocab.count_file(os.path.join(path, "train.txt"))
        elif self.dataset == "lm1b":
            train_path_pattern = os.path.join(
                path, "1-billion-word-language-modeling-benchmark-r13output",
                "training-monolingual.tokenized.shuffled", "news.en-*")
            train_paths = glob(train_path_pattern)
            # the vocab will load from file when build_vocab() is called
            # for train_path in sorted(train_paths):
            #     self.vocab.count_file(train_path, verbose=True)

        self.vocab.build_vocab()

        if self.dataset in ["ptb", "wt2", "wt103"]:
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), ordered=True)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), ordered=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, "test.txt"), ordered=True)
        elif self.dataset == "generic_dataset":
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), **encode_kwargs)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), **encode_kwargs)
            self.test = self.vocab.encode_file(
                os.path.join(path, "test.txt"), **encode_kwargs)
        elif self.dataset in ["enwik8", "text8"]:
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), ordered=True, add_eos=False)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
            self.test = self.vocab.encode_file(
                os.path.join(path, "test.txt"), ordered=True, add_eos=False)
        elif self.dataset == "lm1b":
            self.train = train_paths
            valid_path = os.path.join(path, "valid.txt")
            test_path = valid_path
            self.valid = self.vocab.encode_file(
                valid_path, ordered=True, add_double_eos=True)
            self.test = self.vocab.encode_file(
                test_path, ordered=True, add_double_eos=True)

        if self.dataset == "wt103":
            self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
        elif self.dataset == "generic_dataset":
            with open(os.path.join(path, "cutoffs.json")) as f:
                self.cutoffs = json.load(f)
        elif self.dataset == "lm1b":
            self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
        else:
            self.cutoffs = []

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len,
                             num_core_per_host, **kwargs):
        FLAGS = kwargs.get('FLAGS')

        file_names = []
        use_tpu = FLAGS.use_tpu and not (split == "test" and
                                         num_core_per_host == 1)

        if use_tpu:
            record_name = "record_info-{}.bsz-{}.tlen-{}.core-{}.json".format(
                split, bsz, tgt_len, num_core_per_host)
        else:
            record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
                split, bsz, tgt_len)

        record_info_path = os.path.join(save_dir, record_name)

        if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8",
                            "generic_dataset"]:
            data = getattr(self, split)

            bin_sizes = get_bin_sizes(
                data, bsz // num_core_per_host, tgt_len, self.cutoffs)
            file_name, num_batch = create_ordered_tfrecords(
                save_dir, split, data, bsz, tgt_len, num_core_per_host,
                self.cutoffs, bin_sizes,
                num_passes=FLAGS.num_passes
                if split == 'train' and use_tpu else 1,
                use_tpu=use_tpu)
            file_names.append(file_name)
        elif self.dataset == "lm1b":
            bin_sizes = get_bin_sizes(
                self.valid, bsz // num_core_per_host, tgt_len, self.cutoffs)
            if split == "train":
                np.random.seed(123456)
                num_batch = 0

                if FLAGS.num_procs > 1:
                    _preprocess_wrapper = partial(
                        _preprocess, train=self.train, vocab=self.vocab,
                        save_dir=save_dir, cutoffs=self.cutoffs,
                        bin_sizes=bin_sizes, bsz=bsz, tgt_len=tgt_len,
                        num_core_per_host=num_core_per_host,
                        use_tpu=use_tpu, num_shuffle=FLAGS.num_shuffle)

                    pool = mp.Pool(processes=FLAGS.num_procs)
                    results = pool.map(_preprocess_wrapper,
                                       range(len(self.train)))
                    for res in results:
                        file_names.extend(res[0])
                        num_batch += res[1]
                else:
                    for shard, path in enumerate(self.train):
                        data_shard = self.vocab.encode_file(
                            path, ordered=False, add_double_eos=True)

                        num_shuffle = FLAGS.num_shuffle
                        for shuffle in range(num_shuffle):
                            print("Processing shard {} shuffle {}".format(
                                shard, shuffle))
                            basename = "train-{:03d}-{:02d}".format(
                                shard, shuffle)
                            np.random.shuffle(data_shard)
                            file_name, num_batch_ = create_ordered_tfrecords(
                                save_dir, basename,
                                np.concatenate(data_shard), bsz, tgt_len,
                                num_core_per_host, self.cutoffs, bin_sizes,
                                use_tpu=use_tpu)
                            file_names.append(file_name)
                            num_batch += num_batch_
            else:
                file_name, num_batch = create_ordered_tfrecords(
                    save_dir, split, getattr(self, split), bsz, tgt_len,
                    num_core_per_host, self.cutoffs, bin_sizes,
                    use_tpu=use_tpu)
                file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {
                "filenames": file_names,
                "bin_sizes": bin_sizes,
                "num_batch": num_batch,
            }
            json.dump(record_info, fp)
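The generic_dataset branch above reads adaptive-softmax cutoffs from a
cutoffs.json file in the data directory. A sketch of producing such a file;
the boundary values are hypothetical, and the list must start at 0 and end
at the vocabulary size, mirroring the hard-coded wt103 and lm1b lists.

import json

# Hypothetical cutoff boundaries; 267735 stands in for len(vocab).
cutoffs = [0, 20000, 40000, 200000, 267735]
with open("cutoffs.json", "w") as f:
    json.dump(cutoffs, f)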
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        if self.vocab.vocab_file is None:
            self.vocab.count_file(os.path.join(path, "train.txt"))
        self.vocab.build_vocab()

        self.train = self.vocab.encode_file(
            os.path.join(path, "train.txt"), add_eos=True, ordered=True)
        self.valid = self.vocab.encode_file(
            os.path.join(path, "valid.txt"), add_eos=True, ordered=True)
        self.test = self.vocab.encode_file(
            os.path.join(path, "test.txt"), add_eos=True, ordered=True)

        self.cutoffs = []

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len,
                             num_core_per_host, **kwargs):
        FLAGS = kwargs.get('FLAGS')

        file_names = []
        use_tpu = FLAGS.use_tpu and not (split == "test" and
                                         num_core_per_host == 1)

        if use_tpu:
            record_name = "record_info-{}.bsz-{}.tlen-{}.core-{}.json".format(
                split, bsz, tgt_len, num_core_per_host)
        else:
            record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
                split, bsz, tgt_len)

        record_info_path = os.path.join(save_dir, record_name)

        data = getattr(self, split)
        bin_sizes = get_bin_sizes(
            data, bsz // num_core_per_host, tgt_len, self.cutoffs)
        file_name, num_batch = create_ordered_tfrecords(
            save_dir, split, data, bsz, tgt_len, num_core_per_host,
            self.cutoffs, bin_sizes,
            num_passes=FLAGS.num_passes if split == 'train' and use_tpu else 1,
            use_tpu=use_tpu)
        file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {
                "filenames": file_names,
                "bin_sizes": bin_sizes,
                "num_batch": num_batch,
            }
            json.dump(record_info, fp)
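The use_tpu gate above also switches the record-info filename. A quick
standalone illustration of the two naming schemes; SimpleNamespace stands
in for the real absl FLAGS object.

from types import SimpleNamespace

FLAGS = SimpleNamespace(use_tpu=True)  # stand-in for the real flags
split, bsz, tgt_len, num_core_per_host = "train", 32, 128, 8

use_tpu = FLAGS.use_tpu and not (split == "test" and num_core_per_host == 1)
if use_tpu:
    record_name = "record_info-{}.bsz-{}.tlen-{}.core-{}.json".format(
        split, bsz, tgt_len, num_core_per_host)
else:
    record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
        split, bsz, tgt_len)
print(record_name)  # record_info-train.bsz-32.tlen-128.core-8.json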
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        train_path = os.path.join(path, "train.txt")
        valid_path = os.path.join(path, "valid.txt")
        # test_path = os.path.join(path, "test.txt")

        # self.vocab.count_file(train_path)
        # self.vocab.count_file(valid_path)
        # self.vocab.count_file(test_path)
        self.vocab.build_vocab(add_bytes=True)

        # The training split is kept as a path and encoded shard by shard
        # in file_sharder; only the validation split is encoded up front.
        self.train = train_path
        self.valid = self.vocab.encode_file(
            os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
        # self.test = self.vocab.encode_file(
        #     os.path.join(path, "test.txt"), ordered=True, add_eos=False)

        self.cutoffs = []

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len,
                             num_core_per_host, **kwargs):
        FLAGS = kwargs.get('FLAGS')

        file_names = []
        use_tpu = FLAGS.use_tpu and not (split == "test" and
                                         num_core_per_host == 1)

        record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
            split, bsz, tgt_len)
        record_info_path = os.path.join(save_dir, record_name)

        # Pretty sure this is a TPU-only thing.
        bin_sizes = []

        if split == "train":
            np.random.seed(123456)
            num_batch = 0

            for shard, shard_c in self.file_sharder(self.train,
                                                    FLAGS.train_shard_size):
                print("Processing shard {}".format(shard_c))
                basename = "train-{:03d}".format(shard_c)
                file_name, num_batch_ = create_ordered_tfrecords(
                    save_dir, basename, shard, bsz, tgt_len,
                    num_core_per_host, self.cutoffs, bin_sizes,
                    use_tpu=use_tpu)
                file_names.append(file_name)
                num_batch += num_batch_
        else:
            file_name, num_batch = create_ordered_tfrecords(
                save_dir, split, getattr(self, split), bsz, tgt_len,
                num_core_per_host, self.cutoffs, bin_sizes,
                use_tpu=use_tpu)
            file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {
                "filenames": file_names,
                "bin_sizes": bin_sizes,
                "num_batch": num_batch,
            }
            json.dump(record_info, fp)

    def file_sharder(self, file_name, shard_size):
        """Shard a file into manageable sizes."""
        cur_shard_size = 0
        cur_shard = []
        count = 0
        with open(file_name, 'r') as f:
            for line in f:
                toks = self.vocab.tokenize(line)
                cur_shard.append(self.vocab.convert_to_nparray(toks))
                cur_shard_size += len(toks)
                if cur_shard_size >= shard_size:
                    cur_shard = np.concatenate(cur_shard)
                    print("Compiled shard of size {}".format(cur_shard_size))
                    yield cur_shard, count
                    cur_shard = []
                    cur_shard_size = 0
                    count += 1
        # Only emit the trailing partial shard if it holds at least ~50M
        # tokens; smaller remainders are dropped.
        if cur_shard_size >= 50000000:
            cur_shard = np.concatenate(cur_shard)
            yield cur_shard, count
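A usage sketch for file_sharder above; the constructor arguments and shard
size are placeholders. Note that a trailing partial shard is dropped unless
it clears the hard-coded 50M-token floor.

# Hypothetical usage; constructor args and shard size are placeholders.
corpus = Corpus("/path/to/data", "my_dataset")
total_tokens = 0
for shard, shard_idx in corpus.file_sharder(corpus.train,
                                            shard_size=100_000_000):
    # Each shard is a 1-D numpy array of token ids.
    total_tokens += len(shard)
    print("shard", shard_idx, "tokens", len(shard))
print("total tokens kept:", total_tokens)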
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        if self.dataset in ["ptb", "wt2", "enwik8", "text8", "sb2", "sb92"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset in ["wt103", "wt103small"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
        elif self.dataset == "lm1b":
            train_path_pattern = os.path.join(
                path, "1-billion-word-language-modeling-benchmark-r13output",
                "training-monolingual.tokenized.shuffled", "news.en-*")
            train_paths = glob(train_path_pattern)
            # the vocab will load from file when build_vocab() is called
            # for train_path in sorted(train_paths):
            #     self.vocab.count_file(train_path, verbose=True)

        self.vocab.build_vocab()

        if self.dataset in ["ptb", "sb2", "sb92"]:
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), ordered=True)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), ordered=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, "test.txt"), ordered=True)
        elif self.dataset in ["wt2", "wt103", "wt103small"]:
            self.train, self.train_boundary = self.vocab.encode_file(
                os.path.join(path, "train.txt"), ordered=True,
                ret_doc_boundary=True, pattern=r"\=[^=]+\=")
            self.valid, self.valid_boundary = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), ordered=True,
                ret_doc_boundary=True, pattern=r"\=[^=]+\=")
            self.test, self.test_boundary = self.vocab.encode_file(
                os.path.join(path, "test.txt"), ordered=True,
                ret_doc_boundary=True, pattern=r"\=[^=]+\=")
        elif self.dataset in ["enwik8", "text8"]:
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), ordered=True, add_eos=False)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
            self.test = self.vocab.encode_file(
                os.path.join(path, "test.txt"), ordered=True, add_eos=False)
        elif self.dataset == "lm1b":
            self.train = train_paths
            valid_path = os.path.join(path, "valid.txt")
            test_path = valid_path
            self.valid = self.vocab.encode_file(
                valid_path, ordered=True, add_double_eos=True)
            self.test = self.vocab.encode_file(
                test_path, ordered=True, add_double_eos=True)

        if self.dataset == "sb92":
            self.cutoffs = [0, 10000, 20000] + [len(self.vocab)]
        elif self.dataset == "wt103small":
            self.cutoffs = [0, 20000, 40000] + [len(self.vocab)]
        elif self.dataset == "wt103":
            self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
        elif self.dataset == "lm1b":
            self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
        else:
            self.cutoffs = []

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len, **kwargs):
        FLAGS = kwargs.get('FLAGS')

        file_names = []
        record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
            split, bsz, tgt_len)
        record_info_path = os.path.join(save_dir, record_name)

        if self.dataset in ["ptb", "enwik8", "text8", "sb2", "sb92"]:
            data = getattr(self, split)
            file_name, num_batch = create_ordered_tfrecords(
                save_dir, split, data, bsz, tgt_len,
                num_passes=FLAGS.num_passes)
            file_names.append(file_name)
        if self.dataset in ["wt2", "wt103", "wt103small"]:
            data = getattr(self, split)
            boundary = getattr(self, split + "_boundary")
            file_name, num_batch = create_ordered_tfrecords(
                save_dir, split, data, bsz, tgt_len,
                num_passes=FLAGS.num_passes, boundary=boundary)
            file_names.append(file_name)
        elif self.dataset == "lm1b":
            if split == "train":
                np.random.seed(123456)
                num_batch = 0

                if FLAGS.num_procs > 1:
                    _preprocess_wrapper = partial(
                        _preprocess, train=self.train, vocab=self.vocab,
                        save_dir=save_dir, bsz=bsz, tgt_len=tgt_len,
                        num_shuffle=FLAGS.num_shuffle)

                    pool = mp.Pool(processes=FLAGS.num_procs)
                    results = pool.map(_preprocess_wrapper,
                                       range(len(self.train)))
                    for res in results:
                        file_names.extend(res[0])
                        num_batch += res[1]
                else:
                    for shard, path in enumerate(self.train):
                        data_shard = self.vocab.encode_file(
                            path, ordered=False, add_double_eos=True)

                        num_shuffle = FLAGS.num_shuffle
                        for shuffle in range(num_shuffle):
                            print("Processing shard {} shuffle {}".format(
                                shard, shuffle))
                            basename = "train-{:03d}-{:02d}".format(
                                shard, shuffle)
                            np.random.shuffle(data_shard)
                            file_name, num_batch_ = create_ordered_tfrecords(
                                save_dir, basename,
                                np.concatenate(data_shard), bsz, tgt_len)
                            file_names.append(file_name)
                            num_batch += num_batch_
            else:
                file_name, num_batch = create_ordered_tfrecords(
                    save_dir, split, getattr(self, split), bsz, tgt_len)
                file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {"filenames": file_names, "num_batch": num_batch}
            if self.dataset in ["wt2", "wt103", "wt103small"]:
                record_info["boundary"] = True
            else:
                record_info["boundary"] = False
            json.dump(record_info, fp)
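_preprocess itself is not shown in this section. Below is a sketch
consistent with the keyword arguments bound via partial above; the body
simply mirrors the single-process branch, so treat it as a reconstruction
under those assumptions, not the original implementation.

import numpy as np

def _preprocess(shard, train, vocab, save_dir, bsz, tgt_len, num_shuffle):
    # Reconstruction: pool.map supplies the shard index; the remaining
    # arguments are bound by functools.partial in convert_to_tfrecords.
    file_names, num_batch = [], 0
    data_shard = vocab.encode_file(train[shard], ordered=False,
                                   add_double_eos=True)
    for shuffle in range(num_shuffle):
        basename = "train-{:03d}-{:02d}".format(shard, shuffle)
        np.random.shuffle(data_shard)
        file_name, num_batch_ = create_ordered_tfrecords(
            save_dir, basename, np.concatenate(data_shard), bsz, tgt_len)
        file_names.append(file_name)
        num_batch += num_batch_
    return file_names, num_batch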