def main(args):
    args.rank = 0
    args.make_vocab_size_divisible_by = 128
    args.tensor_model_parallel_size = 1
    args.model_parallel_size = 1
    args.merge_file = None

    # collect the per-shard dataset prefixes (strip the trailing ".bin")
    pin = Path(args.input)
    data_path_prefix = [str(each)[:-4] for each in pin.glob("*.bin")]

    pout = Path(args.output)
    pout.mkdir(parents=True, exist_ok=True)
    output_bin_files = pout / f"{args.output_prefix}.bin"
    output_idx_files = pout / f"{args.output_prefix}.idx"

    # remove stale output files from a previous run, if any
    try:
        os.remove(output_bin_files)
        os.remove(output_idx_files)
    except OSError:
        pass

    tokenizer = build_tokenizer(args)
    builders = indexed_dataset.make_builder(output_bin_files,
                                            impl='mmap',
                                            vocab_size=tokenizer.vocab_size)
    # append every shard into the merged dataset, then write the final index
    for each in data_path_prefix:
        builders.merge_file_(each)
    builders.finalize(output_idx_files)
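# A minimal sketch of the argument parsing that could drive main(args) above.
# The flag names (--input, --output, --output-prefix) and the tokenizer options
# expected by build_tokenizer (--tokenizer-type, --vocab-file) are assumptions
# for illustration, not the project's actual CLI.
def _get_merge_args():
    import argparse
    parser = argparse.ArgumentParser(description="Merge per-shard .bin/.idx datasets")
    parser.add_argument("--input", required=True,
                        help="directory containing the shard .bin/.idx pairs")
    parser.add_argument("--output", required=True,
                        help="directory to write the merged dataset into")
    parser.add_argument("--output-prefix", required=True,
                        help="file name prefix for the merged .bin/.idx pair")
    parser.add_argument("--tokenizer-type", default="GPT2BPETokenizer")  # assumed default
    parser.add_argument("--vocab-file", default=None)
    return parser.parse_args()
    # usage (assumed): main(_get_merge_args())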
def main():
    args = get_args()
    startup_start = time.time()

    print("Opening", args.input)
    fin = open(args.input, 'r', encoding='utf-8')

    if nltk_available and args.split_sentences:
        nltk.download("punkt", quiet=True)

    encoder = Encoder(args)
    tokenizer = build_tokenizer(args)
    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
    encoded_docs = pool.imap(encoder.encode, fin, 25)
    #encoded_docs = map(encoder.encode, fin)

    level = "document"
    if args.split_sentences:
        level = "sentence"

    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")
    output_bin_files = {}
    output_idx_files = {}
    builders = {}
    for key in args.json_keys:
        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
                                                      key, level)
        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
                                                      key, level)
        builders[key] = indexed_dataset.make_builder(output_bin_files[key],
                                                     impl=args.dataset_impl,
                                                     vocab_size=tokenizer.vocab_size)

    startup_end = time.time()
    proc_start = time.time()
    total_bytes_processed = 0
    print("Time to startup:", startup_end - startup_start)

    for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
        total_bytes_processed += bytes_processed
        for key, sentences in doc.items():
            if len(sentences) == 0:
                continue
            for sentence in sentences:
                builders[key].add_item(torch.IntTensor(sentence))
            builders[key].end_document()
        if i % args.log_interval == 0:
            current = time.time()
            elapsed = current - proc_start
            mbs = total_bytes_processed / elapsed / 1024 / 1024
            print(f"Processed {i} documents",
                  f"({i/elapsed} docs/s, {mbs} MB/s).",
                  file=sys.stderr)

    for key in args.json_keys:
        builders[key].finalize(output_idx_files[key])
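# A minimal sketch of the Encoder the pools above rely on, reconstructed from how
# main() consumes its output: encode() must return a ({json_key: [token-id lists]},
# bytes_processed) tuple for each input JSONL line. This is an illustration only;
# the project's real class may handle sentence splitting and eod tokens differently.
import json

class Encoder(object):
    def __init__(self, args):
        self.args = args

    def initializer(self):
        # build the tokenizer once per worker process and cache it on the class
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, json_line):
        data = json.loads(json_line)
        ids = {}
        for key in self.args.json_keys:
            text = data[key]
            doc_ids = Encoder.tokenizer.tokenize(text)
            # one list of token ids per "sentence"; here the whole document is one sentence
            ids[key] = [doc_ids] if len(doc_ids) > 0 else []
        return ids, len(json_line)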
def main():
    args = get_args()
    startup_start = time.time()

    print("Opening", args.input)
    fin = _multi_lmd(args.input.split(","))

    if nltk_available and args.split_sentences:
        nltk.download("punkt", quiet=True)

    encoder = Encoder(args)
    tokenizer = build_tokenizer(args)
    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
    encoded_docs = pool.imap(encoder.encode, fin, 25)

    level = "document"
    if args.split_sentences:
        level = "sentence"

    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")
    output_bin_files = {}
    output_idx_files = {}
    builders = {}
    for key in args.json_keys:
        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
                                                      key, level)
        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
                                                      key, level)
        builders[key] = indexed_dataset.make_builder(output_bin_files[key],
                                                     impl=args.dataset_impl,
                                                     vocab_size=tokenizer.vocab_size)

    startup_end = time.time()
    proc_start = time.time()
    total_bytes_processed = 0
    print("Time to startup:", startup_end - startup_start)

    pbar = tqdm.tqdm()
    for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
        total_bytes_processed += bytes_processed
        for key, sentences in doc.items():
            for sentence in sentences:
                builders[key].add_item(torch.IntTensor(sentence))
            builders[key].end_document()
        if i % args.log_interval == 0:
            current = time.time()
            elapsed = current - proc_start
            mbs = total_bytes_processed / elapsed / 1024 / 1024
            pbar.set_description(
                f"Processed {i}{'' if args.num_docs is None else '/' + str(args.num_docs)} documents ({i / elapsed} docs/s, {mbs} MB/s)."
            )
            if i != 0:
                pbar.update(args.log_interval)

    for key in args.json_keys:
        builders[key].finalize(output_idx_files[key])
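# _multi_lmd is not defined in this excerpt. A plausible sketch, assuming it chains
# raw documents from several lm_dataformat archives (the name and exact behaviour
# are an assumption based on how main() uses it; lm_dataformat's Reader.stream_data()
# yields one text document at a time):
import lm_dataformat as lmd

def _multi_lmd(paths):
    """Yield raw documents from each lm_dataformat archive in turn."""
    for path in paths:
        yield from lmd.Reader(path).stream_data()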
def main():
    args = get_args()
    encoder = Encoder(args)
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")

    # build a semaphore object to stop `yield_from_files` from getting ahead of
    # encoder.encode and hence building up memory
    semaphore = Semaphore(10000 + args.workers)

    # use multiprocessing to iterate over input documents
    fin = yield_from_files(args.input.split(","), semaphore)

    if args.workers > 1:
        pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
        encoded_docs = pool.imap(encoder.encode, fin, chunksize=25)
    else:
        encoder.initializer()
        encoded_docs = (encoder.encode(doc) for doc in fin)

    # make a dataset builder for each key in args.json_keys
    # each key will output to a different file beginning with args.output_prefix
    output_bin_files = {}
    output_idx_files = {}
    builders = {}
    for key in args.json_keys:
        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
                                                      key, "document")
        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
                                                      key, "document")
        builders[key] = indexed_dataset.make_builder(output_bin_files[key],
                                                     impl=args.dataset_impl,
                                                     vocab_size=tokenizer.vocab_size)

    # actually do tokenization
    proc_start = time.time()
    total_bytes_processed = 0
    pbar = tqdm.tqdm()
    for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
        total_bytes_processed += bytes_processed

        # release semaphore so `yield_from_files` can add another file to the buffer
        semaphore.release()

        # add each tokenized document / sentence
        for key, sentences in doc.items():
            for sentence in sentences:
                builders[key].add_item(torch.IntTensor(sentence))
            # separate with eos token
            builders[key].end_document()

        # log progress
        if i % args.log_interval == 0:
            current = time.time()
            elapsed = current - proc_start
            mbs = total_bytes_processed / elapsed / 1024 / 1024
            pbar.set_description(
                f"Processed {i}{'' if args.num_docs is None else '/' + str(args.num_docs)} documents ({i / elapsed} docs/s, {mbs} MB/s)."
            )
            if i != 0:
                pbar.update(args.log_interval)

    # save output file
    for key in args.json_keys:
        builders[key].finalize(output_idx_files[key])
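# A minimal sketch of yield_from_files, consistent with how main() uses it: the
# reader acquires one semaphore permit per document it hands out, and main()
# releases one permit per document it consumes, so the reader blocks once it is
# roughly 10000 documents ahead of the encoders. Reading via lm_dataformat (lmd)
# and skipping empty documents are assumptions for illustration.
def yield_from_files(fnames, semaphore):
    """Yield non-empty documents from a list of archives, bounded by `semaphore`."""
    def yielder(fname, semaphore):
        for doc in filter(lambda x: x, lmd.Reader(fname).stream_data()):
            semaphore.acquire()
            yield doc

    for fname in fnames:
        semaphore.acquire()
        yield from yielder(fname, semaphore)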