def get_enc_dec_tokenizers(
    encoder_tokenizer_name=None,
    encoder_tokenizer_model=None,
    encoder_bpe_dropout=0.0,
    encoder_model_name=None,
    decoder_tokenizer_name=None,
    decoder_tokenizer_model=None,
    decoder_bpe_dropout=0.0,
    decoder_model_name=None,
):
    """Build matching encoder and decoder tokenizers via get_nmt_tokenizer."""
    # if encoder_tokenizer_name != 'yttm' or decoder_tokenizer_name != 'yttm':
    #     raise NotImplementedError(f"Currently we only support yttm tokenizer.")
    encoder_tokenizer = get_nmt_tokenizer(
        library=encoder_tokenizer_name,
        model_name=encoder_model_name,
        tokenizer_model=encoder_tokenizer_model,
        bpe_dropout=encoder_bpe_dropout,
    )
    decoder_tokenizer = get_nmt_tokenizer(
        library=decoder_tokenizer_name,
        model_name=decoder_model_name,
        tokenizer_model=decoder_tokenizer_model,
        bpe_dropout=decoder_bpe_dropout,
    )
    return encoder_tokenizer, decoder_tokenizer
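# Hedged usage sketch for get_enc_dec_tokenizers above: the 'yttm' library choice
# and the two *.model paths are illustrative assumptions, not files shipped with
# the code. get_nmt_tokenizer lives in NeMo's tokenizer utilities.
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

encoder_tokenizer, decoder_tokenizer = get_enc_dec_tokenizers(
    encoder_tokenizer_name='yttm',
    encoder_tokenizer_model='/path/to/encoder_bpe.model',  # hypothetical path
    decoder_tokenizer_name='yttm',
    decoder_tokenizer_model='/path/to/decoder_bpe.model',  # hypothetical path
)
print(encoder_tokenizer.vocab_size, decoder_tokenizer.vocab_size)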
def test_init_prompt_learning_dataset(self):
    tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer')
    task_templates = get_task_templates()
    dataset_path = create_temp_dataset()

    # Set up virtual token placeholders
    total_virtual_tokens = 10
    pseudo_tokens = get_pseudo_tokens(total_virtual_tokens)
    tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens})

    dataset = get_prompt_tuning_dataset(
        dataset_path, tokenizer, VirtualPromptSource.PROMPT_TABLE, task_templates, pseudo_tokens,
    )
    dataset = get_prompt_tuning_dataset(
        dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens,
    )

    print(type(dataset))
    assert isinstance(dataset, Dataset)
    os.remove(dataset_path)
def initializer(self):
    # Use Encoder class as a container for global data
    Encoder.tokenizer = get_nmt_tokenizer(
        library=self.args.tokenizer_library,
        model_name=self.args.tokenizer_type,
        tokenizer_model=self.args.tokenizer_model,
        vocab_file=self.args.vocab_file,
        merges_file=self.args.merge_file,
        delimiter=self.args.delimiter,
    )
    if self.args.split_sentences:
        if not nltk_available:
            print("NLTK is not available to split sentences.")
            exit()
        splitter = nltk.load("tokenizers/punkt/english.pickle")
        if self.args.keep_newlines:
            # this prevents punkt from eating newlines after sentences
            Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
                train_text=splitter._params, lang_vars=CustomLanguageVars()
            )
        else:
            Encoder.splitter = splitter
    else:
        Encoder.splitter = IdentitySplitter()
def main():
    args = get_args()
    startup_start = time.time()

    print("Opening", args.input)
    fin = open(args.input, 'r', encoding='utf-8')

    if nltk_available and args.split_sentences:
        nltk.download("punkt", quiet=True)

    encoder = Encoder(args)
    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        model_name=args.tokenizer_type,
        tokenizer_model=args.tokenizer_model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
    )
    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
    encoded_docs = pool.imap(encoder.encode, fin, 25)
    # encoded_docs = map(encoder.encode, fin)

    level = "document"
    if args.split_sentences:
        level = "sentence"

    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")
    output_bin_files = {}
    output_idx_files = {}
    builders = {}
    for key in args.json_keys:
        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, key, level)
        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, key, level)
        builders[key] = indexed_dataset.make_builder(
            output_bin_files[key], impl=args.dataset_impl, vocab_size=tokenizer.vocab_size
        )

    startup_end = time.time()
    proc_start = time.time()
    total_bytes_processed = 0
    print("Time to startup:", startup_end - startup_start)

    for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
        total_bytes_processed += bytes_processed
        for key, sentences in doc.items():
            if len(sentences) == 0:
                continue
            for sentence in sentences:
                builders[key].add_item(torch.IntTensor(sentence))
            builders[key].end_document()
        if i % args.log_interval == 0:
            current = time.time()
            elapsed = current - proc_start
            mbs = total_bytes_processed / elapsed / 1024 / 1024
            print(f"Processed {i} documents",
                  f"({i/elapsed} docs/s, {mbs} MB/s).",
                  file=sys.stderr)

    for key in args.json_keys:
        builders[key].finalize(output_idx_files[key])
def test_prompt_learning_dataset_collate_fn_prompt_encoder(self):
    tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer')
    task_templates = get_task_templates()
    dataset_path = create_temp_dataset()

    # Set up virtual token placeholders
    total_virtual_tokens = 10
    pseudo_tokens = get_pseudo_tokens(total_virtual_tokens)
    tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens})

    dataset = get_prompt_tuning_dataset(
        dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens,
    )
    batch = [dataset[i] for i in range(8)]
    batch = dataset.collate_fn(batch)

    assert len(batch) == 6

    _, _, _, _, _, taskname_ids = batch
    assert list(taskname_ids[0].numpy()) == tokenizer.text_to_ids("task name A")

    os.remove(dataset_path)
def setup_enc_dec_tokenizers(
    self,
    encoder_tokenizer_library=None,
    encoder_tokenizer_model=None,
    encoder_bpe_dropout=0.0,
    encoder_model_name=None,
    encoder_r2l=False,
    encoder_tokenizer_vocab_file=None,
    decoder_tokenizer_library=None,
    decoder_tokenizer_model=None,
    decoder_bpe_dropout=0.0,
    decoder_model_name=None,
    decoder_r2l=False,
):
    supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron', 'byte-level']
    if (
        encoder_tokenizer_library not in supported_tokenizers
        or decoder_tokenizer_library not in supported_tokenizers
    ):
        raise NotImplementedError(f"Currently we only support tokenizers in {supported_tokenizers}.")

    self.encoder_tokenizer = get_nmt_tokenizer(
        library=encoder_tokenizer_library,
        tokenizer_model=self.register_artifact("encoder_tokenizer.tokenizer_model", encoder_tokenizer_model),
        bpe_dropout=encoder_bpe_dropout,
        model_name=encoder_model_name,
        vocab_file=self.register_artifact("encoder_tokenizer.vocab_file", encoder_tokenizer_vocab_file),
        special_tokens=None,
        use_fast=False,
        r2l=encoder_r2l,
    )
    self.decoder_tokenizer = get_nmt_tokenizer(
        library=decoder_tokenizer_library,
        tokenizer_model=self.register_artifact("decoder_tokenizer.tokenizer_model", decoder_tokenizer_model),
        bpe_dropout=decoder_bpe_dropout,
        model_name=decoder_model_name,
        vocab_file=None,
        special_tokens=None,
        use_fast=False,
        r2l=decoder_r2l,
    )
def test_init_prompt_tuning_dataset(self):
    tokenizer = get_nmt_tokenizer(library='huggingface', model_name='gpt2')
    dataset_path = create_temp_dataset()
    num_prompt_tokens = 10

    dataset = get_prompt_tuning_dataset(tokenizer, dataset_path, num_prompt_tokens)
    print(type(dataset))

    assert isinstance(dataset, Dataset)
    os.remove(dataset_path)
def get_tokenizer(args):
    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        model_name=args.tokenizer_type,
        tokenizer_model=args.tokenizer_model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
        delimiter=args.delimiter,
    )
    # Make sure the tokenizer exposes a usable pad token
    if not hasattr(tokenizer, "pad_id") or tokenizer.pad_id is None or tokenizer.pad_id < 0:
        tokenizer.add_special_tokens({'pad_token': '<pad>'})
    return tokenizer
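# Hedged usage sketch for get_tokenizer above: the function only reads a handful
# of attributes from args, so a SimpleNamespace can stand in for the parsed CLI
# args. The 'megatron' / 'GPT2BPETokenizer' pairing mirrors the tests in this
# section; the vocab and merges paths are placeholders.
from types import SimpleNamespace

args = SimpleNamespace(
    tokenizer_library='megatron',
    tokenizer_type='GPT2BPETokenizer',
    tokenizer_model=None,
    vocab_file='gpt2-vocab.json',   # placeholder path
    merge_file='gpt2-merges.txt',   # placeholder path
    delimiter=None,
)
tokenizer = get_tokenizer(args)
assert tokenizer.pad_id is not None and tokenizer.pad_id >= 0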
def _build_tokenizer(self):
    """
    Default tokenizer is based on available nemo tokenizers.
    Override this method to use an external tokenizer.
    All tokenizers are expected to provide a compatible interface.
    Overrides the default encoder-decoder tokenizer to use legacy=True for sentencepiece.
    """
    self.tokenizer = get_nmt_tokenizer(
        library=self._cfg.tokenizer.library,
        model_name=self._cfg.tokenizer.type,
        tokenizer_model=self.register_artifact("tokenizer.model", self._cfg.tokenizer.model),
        vocab_file=self.register_artifact("tokenizer.vocab_file", self._cfg.tokenizer.vocab_file),
        merges_file=self.register_artifact("tokenizer.merge_file", self._cfg.tokenizer.merge_file),
        legacy=self._cfg.tokenizer.library == 'sentencepiece',
    )
def test_prompt_tuning_dataset_collate_fn(self):
    tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer')
    dataset_path = create_temp_dataset()
    num_prompt_tokens = 10

    dataset = get_prompt_tuning_dataset(tokenizer, dataset_path, num_prompt_tokens)
    batch = [dataset[i] for i in range(8)]
    batch = dataset.collate_fn(batch)

    assert len(batch) == 6

    tokens, labels, prompt_tags, attention_mask, loss_mask, text_position_ids = batch

    assert len(tokens) == len(loss_mask) == len(attention_mask) == len(text_position_ids)
    assert len(tokens) == len(prompt_tags)
    assert len(tokens[0]) + num_prompt_tokens == len(loss_mask[0])
    assert len(tokens[0]) + num_prompt_tokens == attention_mask[0].size()[-1]

    os.remove(dataset_path)
def _build_tokenizer(self):
    self.tokenizer = get_nmt_tokenizer(
        library=self._cfg.tokenizer.library,
        model_name=self._cfg.tokenizer.type,
        tokenizer_model=self.register_artifact("tokenizer.model", self._cfg.tokenizer.model),
        vocab_file=self.register_artifact("tokenizer.vocab_file", self._cfg.tokenizer.vocab_file),
        merges_file=self.register_artifact("tokenizer.merge_file", self._cfg.tokenizer.merge_file),
        delimiter=self.cfg.tokenizer.get('delimiter', None),
        legacy=False,
    )

    # Add a pad token if the tokenizer does not already define a valid one
    if not hasattr(self.tokenizer, "pad_id") or self.tokenizer.pad_id is None or self.tokenizer.pad_id < 0:
        self.tokenizer.add_special_tokens({'pad_token': '<pad>'})
def test_prompt_learning_dataset_collate_fn_prompt_table(self):
    tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer')
    task_templates = get_task_templates()
    dataset_path = create_temp_dataset()

    # Set up virtual token placeholders
    pseudo_token_base = 'PROMPT_'
    total_virtual_tokens = 10
    pseudo_tokens = [pseudo_token_base + str(i) for i in range(total_virtual_tokens)]
    tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens})

    dataset = get_prompt_tuning_dataset(
        dataset_path, tokenizer, 'prompt-table', task_templates, pseudo_tokens,
    )
    batch = [dataset[i] for i in range(8)]
    batch = dataset.collate_fn(batch)

    assert len(batch) == 6

    input_ids, labels, loss_mask, position_ids, attention_mask, taskname_ids = batch

    assert len(input_ids) == len(loss_mask) == len(attention_mask) == len(position_ids)
    assert len(input_ids) == len(taskname_ids)
    assert len(labels) == len(input_ids)
    assert len(labels[0]) == len(loss_mask[0])
    assert len(input_ids[0]) == attention_mask[0].size()[-1]
    assert len(taskname_ids.shape) == 1
    assert taskname_ids[0] == 0

    os.remove(dataset_path)
def test_init_prompt_learning_dataset(self):
    tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer')
    task_templates = get_task_templates()
    dataset_path = create_temp_dataset()

    # Set up virtual token placeholders
    pseudo_token_base = 'PROMPT_'
    max_virtual_tokens = 10
    pseudo_tokens = [pseudo_token_base + str(i) for i in range(max_virtual_tokens)]
    tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens})

    dataset = get_prompt_tuning_dataset(
        dataset_path, tokenizer, 'prompt-table', task_templates, pseudo_tokens,
    )
    dataset = get_prompt_tuning_dataset(
        dataset_path, tokenizer, 'prompt-encoder', task_templates, pseudo_tokens,
    )

    print(type(dataset))
    assert isinstance(dataset, Dataset)
    os.remove(dataset_path)
def __init__(self, cfg: DictConfig, trainer: Trainer):
    super().__init__(cfg, trainer=trainer)
    self.cfg = cfg

    # used in NVIDIA NGC PyTorch containers
    self._enable_nvidia_optimizations()

    if self.cfg.get('use_cpu_initialization', False) is False:
        torch.cuda.set_device(trainer.local_rank)

    # buffer used during train_step for logging average loss over gradient accumulation steps
    self._reduced_loss_buffer = []
    self._reduced_lm_loss_buffer = []
    self._reduced_sop_loss_buffer = []

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        seed=self.cfg.get('seed', 1234),
    )

    self.tokenizer = get_nmt_tokenizer(
        library=self.cfg.tokenizer.library,
        model_name=self.cfg.tokenizer.type,
        tokenizer_model=self.register_artifact("tokenizer_model", self.cfg.tokenizer.model),
        vocab_file=self.register_artifact("vocab_file", self.cfg.tokenizer.vocab_file),
        merges_file=self.register_artifact("merges_file", self.cfg.tokenizer.merge_file),
    )

    vocab_size = self.tokenizer.vocab_size
    padded_vocab_size = self._vocab_size_with_padding(
        orig_vocab_size=vocab_size,
        make_vocab_size_divisible_by=cfg.get('make_vocab_size_divisible_by', 128),
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
    )

    num_tokentypes = 2 if cfg.bert_binary_head else 0

    self.model = BertModel(
        vocab_size=padded_vocab_size,
        hidden_size=cfg.hidden_size,
        max_position_embeddings=cfg.max_position_embeddings,
        num_layers=cfg.num_layers,
        num_attention_heads=cfg.num_attention_heads,
        apply_query_key_layer_scaling=cfg.get('apply_query_key_layer_scaling', True),
        kv_channels=cfg.get('kv_channels', None),
        ffn_hidden_size=cfg.ffn_hidden_size,
        num_tokentypes=num_tokentypes,
        parallel_output=True,
        pre_process=cfg.get('pre_process', True),
        post_process=cfg.get('post_process', True),
        init_method_std=cfg.get('init_method_std', 0.02),
        fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
        use_cpu_initialization=cfg.get('use_cpu_initialization', False),
        hidden_dropout=cfg.get('hidden_dropout', 0.1),
        precision=cfg.get('precision', 16),
        fp32_residual_connection=cfg.get('fp32_residual_connection', False),
        activations_checkpoint_method=cfg.get('activations_checkpoint_method', None),
        activations_checkpoint_num_layers=cfg.get('activations_checkpoint_num_layers', 1),
        layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
        onnx_safe=cfg.get('onnx_safe', False),
        add_binary_head=cfg.bert_binary_head,
    )
def init_tokenizer(library, tokenizer_model):
    tokenizer = get_nmt_tokenizer(library=library, tokenizer_model=tokenizer_model)
    worker_data["tokenizer"] = tokenizer
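# Sketch of the intended use of init_tokenizer above: as a multiprocessing Pool
# initializer, so each worker builds its tokenizer once and caches it in the
# module-level worker_data dict (assumed here). The sentencepiece model path is
# a placeholder.
import multiprocessing

worker_data = {}  # per-worker cache populated by init_tokenizer

if __name__ == '__main__':
    pool = multiprocessing.Pool(
        processes=4,
        initializer=init_tokenizer,
        initargs=('sentencepiece', '/path/to/tokenizer.model'),
    )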
                   help='Path to the vocab file')
group.add_argument('--merge-file', type=str, default=None,
                   help='Path to the BPE merge file (if necessary).')
group.add_argument('--delimiter', type=str, default=None,
                   help='Delimiter used for the tabular tokenizer.')
args = parser.parse_args()

tokenizer = get_nmt_tokenizer(
    library=args.tokenizer_library,
    model_name=args.tokenizer_type,
    tokenizer_model=args.tokenizer_model,
    vocab_file=args.vocab_file,
    merges_file=args.merge_file,
    delimiter=args.delimiter,
)

data_ds = MMapRetrievalIndexedDataset(args.input_data_prefix)
retrieval_ds = MMapRetrievalIndexedDataset(args.input_retrieval_prefix)
knn_index = KNNIndex(args.knn_index)

assert knn_index.len == data_ds.chunks
logging.info(f'Data index has {data_ds.chunks} chunks')
logging.info(f'Retrieval Data index has {retrieval_ds.chunks} chunks')
logging.info(f'KNN index has {knn_index.K} neighbors')
assert knn_index.knn_map.max() < retrieval_ds.chunks
assert data_ds._index.chunk_size == retrieval_ds._index.chunk_size

for chunk_id in args.chunk_ids:
def __init__(self, cfg: DictConfig, trainer: Trainer):
    if not HAVE_APEX:
        raise ImportError(
            "Apex was not found. Please see the NeMo README for installation instructions: "
            "https://github.com/NVIDIA/NeMo#megatron-gpt."
        )
    super().__init__(cfg, trainer=trainer)
    self.cfg = cfg

    # used in NVIDIA NGC PyTorch containers
    self._enable_nvidia_optimizations()

    if self.cfg.get('use_cpu_initialization', False) is False:
        torch.cuda.set_device(trainer.local_rank)

    # buffer used during train_step for logging average loss over gradient accumulation steps
    self._reduced_loss_buffer = []
    self._reduced_lm_loss_buffer = []
    self._reduced_sop_loss_buffer = []

    # not saved as part of nemo model graph but required during export to ONNX
    input_names = ['input_ids', 'attention_mask', 'token_type_ids']

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        seed=self.cfg.get('seed', 1234),
    )

    self.tokenizer = get_nmt_tokenizer(
        library=self.cfg.tokenizer.library,
        model_name=self.cfg.tokenizer.type,
        tokenizer_model=self.register_artifact("tokenizer.model", self.cfg.tokenizer.model),
        vocab_file=self.register_artifact("tokenizer.vocab_file", self.cfg.tokenizer.vocab_file),
        merges_file=self.register_artifact("tokenizer.merge_file", self.cfg.tokenizer.merge_file),
    )

    vocab_size = self.tokenizer.vocab_size
    padded_vocab_size = self._vocab_size_with_padding(
        orig_vocab_size=vocab_size,
        make_vocab_size_divisible_by=cfg.get('make_vocab_size_divisible_by', 128),
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
    )

    num_tokentypes = 2 if cfg.bert_binary_head else 0

    self.model = BertModel(
        vocab_size=padded_vocab_size,
        hidden_size=cfg.hidden_size,
        max_position_embeddings=cfg.max_position_embeddings,
        num_layers=cfg.num_layers,
        num_attention_heads=cfg.num_attention_heads,
        apply_query_key_layer_scaling=cfg.get('apply_query_key_layer_scaling', True),
        kv_channels=cfg.get('kv_channels', None),
        ffn_hidden_size=cfg.ffn_hidden_size,
        num_tokentypes=num_tokentypes,
        parallel_output=True,
        pre_process=cfg.get('pre_process', True),
        post_process=cfg.get('post_process', True),
        init_method_std=cfg.get('init_method_std', 0.02),
        fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
        use_cpu_initialization=cfg.get('use_cpu_initialization', False),
        hidden_dropout=cfg.get('hidden_dropout', 0.1),
        precision=cfg.get('precision', 16),
        fp32_residual_connection=cfg.get('fp32_residual_connection', False),
        activations_checkpoint_method=cfg.get('activations_checkpoint_method', None),
        activations_checkpoint_num_layers=cfg.get('activations_checkpoint_num_layers', 1),
        layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
        masked_softmax_fusion=cfg.get('masked_softmax_fusion', True),
        bias_gelu_fusion=cfg.get('bias_gelu_fusion', True),
        onnx_safe=cfg.get('onnx_safe', False),
        add_binary_head=cfg.bert_binary_head,
        megatron_legacy=cfg.get('megatron_legacy', False),
    )
def __init__(self, cfg: DictConfig, trainer: Trainer):
    if not HAVE_APEX:
        raise ImportError(
            "Apex was not found. Please see the NeMo README for installation instructions: "
            "https://github.com/NVIDIA/NeMo#megatron-gpt."
        )
    # this prevents base constructor from initializing tokenizer
    self.tokenizer = None
    super().__init__(cfg, trainer=trainer, no_lm_init=True)

    self._validate_trainer()

    # used in NVIDIA NGC PyTorch containers
    self._enable_nvidia_optimizations()

    if self.cfg.get('use_cpu_initialization', False) is False:
        torch.cuda.set_device(trainer.local_rank)

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        pipeline_model_parallel_size=cfg.get('pipeline_model_parallel_size', 1),
        micro_batch_size=cfg.get('micro_batch_size'),
        global_batch_size=cfg.get('global_batch_size'),
        seed=self.cfg.get('seed', 1234),
        apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30),
    )

    self.tokenizer = get_nmt_tokenizer(
        library=self.cfg.tokenizer.library,
        model_name=self.cfg.tokenizer.type,
        tokenizer_model=self.register_artifact("tokenizer.model", self.cfg.tokenizer.model),
        vocab_file=self.register_artifact("tokenizer.vocab_file", self.cfg.tokenizer.vocab_file),
        merges_file=self.register_artifact("tokenizer.merge_file", self.cfg.tokenizer.merge_file),
        delimiter=self.cfg.tokenizer.get('delimiter', None),
    )

    vocab_size = self.tokenizer.vocab_size
    self.padded_vocab_size = self._vocab_size_with_padding(
        orig_vocab_size=vocab_size,
        make_vocab_size_divisible_by=cfg.get('make_vocab_size_divisible_by', 128),
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
    )

    # TODO: Not sure how to use lists of modules with PTL.
    # This means we can only use pipeline parallelism without the interleaved schedule.
    self.model = build_model(model_provider_func=self.model_provider_func, wrap_with_ddp=False)[0]

    self.setup_optimizer_param_groups()

    self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

    if self.megatron_amp_o2:
        # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
        self.model.cuda(torch.cuda.current_device())

        # Model wrapper to convert both model and inputs to half precision
        self.model = Float16Module(module=self.model, precision=cfg.precision)

    if self.trainer.precision == 32:
        self.autocast_dtype = torch.float
    elif self.trainer.precision == 16:
        self.autocast_dtype = torch.half
    elif self.trainer.precision == 'bf16':
        self.autocast_dtype = torch.bfloat16
    else:
        raise ValueError('precision must be in [32, 16, "bf16"]')

    # configuration used for inference
    self._inference_config = None
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    """Initializes the PTune TextClassifier model."""
    super().__init__(cfg=cfg, trainer=trainer)

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        seed=cfg.get('seed', 1234),
    )

    # shared params for dataset and data loaders
    self.dataset_cfg = cfg.dataset

    # tokenizer needs to get initialized before the super.__init__()
    # as dataloaders and datasets need it to process the data
    self.tokenizer = get_nmt_tokenizer(
        library=cfg.tokenizer.library,
        model_name=cfg.tokenizer.type,
        tokenizer_model=self.register_artifact("tokenizer.model", cfg.tokenizer.model),
        vocab_file=self.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file),
        merges_file=self.register_artifact("tokenizer.merges_file", cfg.tokenizer.merge_file),
    )

    self.class_weights = None

    self.model = MegatronGPTModel.restore_from(
        self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
        trainer=trainer,
    )

    if not cfg.use_lm_finetune:
        self.model.freeze()

    hidden_size = self.model.cfg.hidden_size

    # register the file containing the labels into the artifacts to get stored in the '.nemo' file later
    self.classes = cfg.dataset.classes

    self.embeddings = self.model.model.language_model.embedding.word_embeddings

    # set allowed vocab set
    self.vocab = self.tokenizer.tokenizer.get_vocab()

    # make sure classes are part of the vocab
    for k in cfg.dataset.classes:
        if token_wrapper(k) not in self.vocab:
            logging.error(f'class {k} is not part of the vocabulary. Please add it to your vocab')
    self.allowed_vocab_ids = set(self.vocab[token_wrapper(k)] for k in cfg.dataset.classes)

    # map from id to label
    self.allowed_vocab = {}
    self.label_ids = {}
    self.id_to_label = {}
    for i, k in enumerate(cfg.dataset.classes):
        self.allowed_vocab[self.vocab[token_wrapper(k)]] = i
        self.label_ids[k] = i
        self.id_to_label[i] = k

    self.template = cfg.prompt_encoder.template

    self.prompt_encoder = PromptEncoder(
        template=cfg.prompt_encoder.template,
        hidden_size=hidden_size,
        lstm_dropout=cfg.prompt_encoder.dropout,
        num_layers=cfg.prompt_encoder.num_layers,
    )

    # load prompt encoder
    self.hidden_size = hidden_size
    self.tokenizer.add_special_tokens({'additional_special_tokens': [cfg.pseudo_token]})

    self.pseudo_token_id = self.tokenizer.tokenizer.get_vocab()[cfg.pseudo_token]
    self.pad_token_id = (
        self.tokenizer.tokenizer.pad_token_id
        if self.tokenizer.tokenizer.pad_token_id is not None
        else self.tokenizer.tokenizer.unk_token_id
    )
    self.spell_length = sum(self.template)
def main():
    args = get_args()
    startup_start = time.time()

    if args.preproc_folder:
        print('Searching folder for .json or .json.gz files...')
        assert os.path.exists(args.input), f'Folder does not exist: {args.input}'
        files_in_folder = os.listdir(args.input)
        json_files = [
            os.path.join(args.input, f)
            for f in files_in_folder
            if f.endswith('.json') or f.endswith('.json.gz')
        ]
        if len(json_files) == 0:
            raise FileNotFoundError('No .json or .json.gz files found in folder.')
        else:
            print(f'Found {len(json_files)} .json or .json.gz files.')
    else:
        assert os.path.exists(args.input), f'File does not exist: {args.input}'
        json_files = [args.input]

    if nltk_available and args.split_sentences:
        nltk.download("punkt", quiet=True)

    encoder = Encoder(args)
    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        model_name=args.tokenizer_type,
        tokenizer_model=args.tokenizer_model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
    )

    level = "document"
    if args.split_sentences:
        level = "sentence"

    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")
    output_bin_files = {}
    output_idx_files = {}
    builders = {}
    for key in args.json_keys:
        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, key, level)
        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, key, level)
        builders[key] = indexed_dataset.make_builder(
            output_bin_files[key], impl=args.dataset_impl, vocab_size=tokenizer.vocab_size
        )

    startup_end = time.time()
    proc_start = time.time()
    total_bytes_processed = 0
    print("Time to startup:", startup_end - startup_start)

    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)

    for idx, json_file in enumerate(json_files):
        print(f'Processing file {json_file} {idx + 1}/{len(json_files)}')
        if json_file.endswith('.gz'):
            fin = gzip.open(json_file, 'r')
        else:
            # open the current file in the list, not args.input (which may be a folder)
            fin = open(json_file, 'r', encoding='utf-8')

        encoded_docs = pool.imap(encoder.encode, fin, 25)

        for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
            total_bytes_processed += bytes_processed
            for key, sentences in doc.items():
                if len(sentences) == 0:
                    continue
                for sentence in sentences:
                    builders[key].add_item(torch.IntTensor(sentence))
                builders[key].end_document()
            if i % args.log_interval == 0:
                current = time.time()
                elapsed = current - proc_start
                mbs = total_bytes_processed / elapsed / 1024 / 1024
                print(f"Processed {i} documents",
                      f"({i/elapsed} docs/s, {mbs} MB/s).",
                      file=sys.stderr)

    for key in args.json_keys:
        builders[key].finalize(output_idx_files[key])
                    type=int, default=-1,
                    help='Max number of lines to parse')
parser.add_argument('--batch_size', type=int, default=10000000,
                    help='Batch size to parse in parallel')
parser.add_argument('--out_dir', type=str, default="",
                    help='Path to store data and plots')
args = parser.parse_args()

tokenizer = get_nmt_tokenizer(
    library=args.tokenizer_library,
    tokenizer_model=args.tokenizer_model,
)

all_len = []
for fn in args.input_files:
    print(f"Parsing fn = {fn}")
    # read file
    fh = open(fn)
    # read all batches
    while True:
        lines = read_batch(fh, args.batch_size)
        # move to next file when no lines are read
        if not lines:
            break