def t5_base_tokenizer_fast(self):
    """Return a fast (Rust-backed) T5 tokenizer for the ``t5-base`` checkpoint."""
    checkpoint = "t5-base"
    return T5TokenizerFast.from_pretrained(checkpoint)
def get_tokenizer(self, opt):
    """Build a fast T5 tokenizer for the architecture selected in ``opt``.

    ``opt['t5_model_arch']`` names the pretrained checkpoint to load.
    """
    arch = opt['t5_model_arch']
    # NOTE(review): ``truncation`` is normally a call-time (__call__/encode)
    # argument, not a ``from_pretrained`` kwarg — confirm it has any effect here.
    return T5TokenizerFast.from_pretrained(arch, truncation=True)
def __init__(self, split='train', raw_dataset=None, rank=-1, topk=-1,
             verbose=True, args=None, mode='train'):
    """VCR dataset: load per-source ``.jsonl`` annotation files and record
    the paths of the pre-extracted visual-feature HDF5 files.

    Args:
        split: comma-separated list of data sources (e.g. ``'train,val'``);
            each source maps to ``VCR/<source>.jsonl`` under ``dataset_dir``.
        raw_dataset: accepted for API parity with sibling datasets but not
            stored or used in this constructor.
        rank: distributed rank; stored on ``self.rank`` only.
        topk: if > 0, keep only the first ``topk`` examples.
        verbose: print loading progress.
        args: namespace providing ``backbone``, ``use_vision``,
            ``max_text_length``, ``do_lower_case`` and ``n_boxes``.
        mode: train/eval mode flag; stored on ``self.mode`` only.
    """
    super().__init__()

    self.topk = topk
    self.verbose = verbose
    self.args = args
    self.mode = mode

    # Loading datasets to data
    self.split = split
    self.sources = split.split(',')
    if self.verbose:
        print('Data sources: ', self.sources)

    # Tokenizer choice follows the backbone name: VL-T5's vision-aware
    # tokenizer, a plain T5 tokenizer, or BART.
    if 't5' in self.args.backbone:
        if self.args.use_vision:
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone,
                max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
        else:
            self.tokenizer = T5TokenizerFast.from_pretrained(
                args.backbone,
                max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
    elif 'bart' in self.args.backbone:
        self.tokenizer = BartTokenizer.from_pretrained(
            args.backbone,
            # max_length=self.args.max_text_length,
            do_lower_case=self.args.do_lower_case)

        # NOTE(review): source formatting was collapsed; this special-token
        # block is placed inside the BART branch (T5 tokenizers ship their
        # own <extra_id_*> tokens) — confirm against the original file.
        # Tokens are registered in reverse order (99..0) so that ids align.
        additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
            [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
        special_tokens_dict = {
            'additional_special_tokens': additional_special_tokens}
        num_added_toks = self.tokenizer.add_special_tokens(
            special_tokens_dict)

    # Read every source's jsonl, tagging each record with its source so the
    # image features can later be resolved via ``img_ids_to_source``.
    self.img_ids_to_source = {}
    data_info_dicts = []
    for source in self.sources:
        data_info_path = dataset_dir.joinpath(f'VCR/{source}.jsonl')
        with open(data_info_path) as f:
            _data_info_dicts = [json.loads(s) for s in f]
            for _d in _data_info_dicts:
                self.img_ids_to_source[_d['img_id']] = source
                _d['source'] = source
            data_info_dicts.extend(_data_info_dicts)
        if self.verbose:
            print(f"Loaded {len(_data_info_dicts)} data from", source)

    data = data_info_dicts

    self.rank = rank

    # Optional truncation for quick debugging runs.
    if self.topk > 0:
        data = data[:self.topk]
        if self.verbose:
            print(f"Use only {self.topk} data")

    self.data = data

    if self.verbose:
        print("# all sentences:", len(self.data))

    self.n_boxes = args.n_boxes

    # HDF5 feature files: 36-box detector features plus ground-truth-box
    # ("_GT") variants per split. Paths only — files are opened lazily later.
    self.source_to_h5 = {
        'train': vcr_feature_dir.joinpath(f'train_boxes36.h5'),
        'val': vcr_feature_dir.joinpath(f'val_boxes36.h5'),
        'test': vcr_feature_dir.joinpath(f'test_boxes36.h5'),
        'train_GT': vcr_feature_dir.joinpath(f'train_boxes_GT.h5'),
        'val_GT': vcr_feature_dir.joinpath(f'val_boxes_GT.h5'),
        'test_GT': vcr_feature_dir.joinpath(f'test_boxes_GT.h5'),
    }
def __init__(self, hparams: argparse.Namespace, num_labels=None, mode="base",
             config=None, tokenizer=None, model=None, **config_kwargs):
    """Initialize a model, tokenizer and config.

    Any of ``config``/``tokenizer``/``model`` may be injected directly;
    when ``None`` they are constructed from ``hparams``. The external
    signature matches the upstream Lightning ``BaseTransformer``.
    """
    super().__init__()
    # TODO: move to self.save_hyperparameters()
    # self.save_hyperparameters()
    # can also expand arguments into trainer signature for easier reading
    self.save_hyperparameters(hparams)
    self.step_count = 0
    self.output_dir = Path(self.hparams.output_dir)
    cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
    if config is None:
        self.config = AutoConfig.from_pretrained(
            self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
            **({
                "num_labels": num_labels
            } if num_labels is not None else {}),
            cache_dir=cache_dir,
            **config_kwargs,
        )
        # Custom-T5 path: resize the vocab and override special-token ids.
        # NOTE(review): ids 1/2/3 for decoder_start/eos/pad are hard-coded —
        # presumably they match a custom sentencepiece vocab; verify against
        # the vocab file actually used.
        if self.hparams.tokenizer_name == "t5":
            self.config.vocab_size = hparams.vocab_size
            self.config.decoder_start_token_id = 1
            self.config.eos_token_id = 2
            self.config.pad_token_id = 3
            print("pretrained", self.config)
    else:
        self.config: PretrainedConfig = config

    # Copy selected regularisation hparams onto the config, but only when
    # they are set (truthy) AND the config actually defines the attribute.
    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop",
                          "dropout", "attention_dropout")
    for p in extra_model_params:
        if getattr(self.hparams, p, None):
            assert hasattr(
                self.config, p), f"model config doesn't have a `{p}` attribute"
            setattr(self.config, p, getattr(self.hparams, p))

    if tokenizer is None:
        # Custom vocab files get a tokenizer built directly from the file;
        # otherwise fall back to AutoTokenizer resolution by name/path.
        if self.hparams.tokenizer_name and self.hparams.tokenizer_name == "t5" and self.hparams.vocab_file:
            from transformers import T5TokenizerFast, T5Tokenizer
            print(self.hparams.vocab_file)
            self.tokenizer = T5TokenizerFast(self.hparams.vocab_file)
            print("custom tokenizer", self.tokenizer)
        elif self.hparams.tokenizer_name and self.hparams.tokenizer_name == "pegasus" and self.hparams.vocab_file:
            from transformers import PegasusTokenizerFast, PegasusTokenizer
            print(self.hparams.vocab_file)
            self.tokenizer = PegasusTokenizerFast(self.hparams.vocab_file)
            print("custom tokenizer", self.tokenizer)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
                cache_dir=cache_dir,
            )
    else:
        self.tokenizer: PreTrainedTokenizer = tokenizer
    self.model_type = MODEL_MODES[mode]
    if model is None:
        # Deliberately builds from config (random init) rather than loading
        # pretrained weights — the from_pretrained variant is kept below for
        # reference.
        # self.model = self.model_type.from_pretrained(
        #     self.hparams.model_name_or_path,
        #     from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
        #     config=self.config,
        #     cache_dir=cache_dir,
        # )
        print(self.config)
        self.model = self.model_type.from_config(
            # self.hparams.model_name_or_path,
            # from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
            config=self.config,
            # cache_dir=cache_dir,
        )
        print(self.model)
    else:
        self.model = model
# NOTE(review): this is a fragment of a larger script function (likely a
# run_mlm/run_t5-style ``main``): it begins mid-body and the final
# ``T5Config.from_pretrained(`` call is cut off mid-argument-list, so the
# statements are reproduced as-is.
data_files["validation"] = data_args.validation_file
# ``load_dataset`` dispatches on the file extension; bare ``.txt`` maps to
# the "text" loader.
extension = data_args.train_file.split(".")[-1]
if extension == "txt":
    extension = "text"
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

# Load pretrained model and tokenizer
if model_args.tokenizer_name:
    tokenizer = T5TokenizerFast.from_pretrained(
        model_args.tokenizer_name, cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer)
elif model_args.model_name_or_path:
    tokenizer = T5TokenizerFast.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer)
else:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script."
        "You can do it from another script, save it, and load it from here, using --tokenizer_name."
    )
# Truncated in SOURCE: the remainder of this call is not visible.
if model_args.config_name:
    config = T5Config.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir,
def __init__(self, split='train', raw_dataset=None, rank=-1, topk=-1,
             verbose=True, args=None, mode='train'):
    """NLVR2 dataset: wrap an already-loaded ``raw_dataset`` and record the
    paths of the per-split object-feature HDF5 files.

    Args:
        split: name of the data source (used for logging and stored).
        raw_dataset: object exposing ``.data``, the list of examples.
        rank: distributed rank; stored on ``self.rank`` only.
        topk: if > 0, keep only the first ``topk`` examples.
        verbose: print loading progress.
        args: namespace providing ``backbone``, ``use_vision``,
            ``max_text_length``, ``do_lower_case`` and ``n_boxes``.
        mode: train/eval mode flag; stored on ``self.mode`` only.
    """
    super().__init__()

    self.raw_dataset = raw_dataset
    self.topk = topk
    self.verbose = verbose
    self.args = args
    self.mode = mode

    # Loading datasets to data
    self.split = split
    if self.verbose:
        print('Data source: ', self.split)

    data = self.raw_dataset.data

    # Optional truncation for quick debugging runs.
    if topk > 0:
        data = data[:topk]
        if self.verbose:
            print(f"Use only {topk} data")

    self.n_gpus = torch.cuda.device_count()

    self.rank = rank

    self.data = data

    if self.verbose:
        # if 'sent' not in self.data_out:
        #     print("# all images:", len(self.data))
        # else:
        print("# all sentences:", len(self.data))

    self.n_boxes = args.n_boxes

    # Tokenizer choice follows the backbone name: VL-T5's vision-aware
    # tokenizer, a plain T5 tokenizer, or BART.
    if 't5' in self.args.backbone:
        if self.args.use_vision:
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
        else:
            self.tokenizer = T5TokenizerFast.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
    elif 'bart' in self.args.backbone:
        self.tokenizer = BartTokenizer.from_pretrained(
            args.backbone,
            # max_length=self.args.max_text_length,
            do_lower_case=self.args.do_lower_case)

        # NOTE(review): source formatting was collapsed; this special-token
        # block is placed inside the BART branch (T5 tokenizers ship their
        # own <extra_id_*> tokens) — confirm against the original file.
        # Tokens are registered in reverse order (99..0) so that ids align.
        additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
            [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
        special_tokens_dict = {
            'additional_special_tokens': additional_special_tokens}
        num_added_toks = self.tokenizer.add_special_tokens(
            special_tokens_dict)

    # HDF5 object-feature files per split. Paths only — files are opened
    # lazily later.
    self.source_to_h5 = {
        'train': nlvr_feature_dir.joinpath(f'train_obj36.h5'),
        'valid': nlvr_feature_dir.joinpath(f'valid_obj36.h5'),
        'test': nlvr_feature_dir.joinpath(f'test_obj36.h5'),
    }