def __init__(self, cfg, device):
    super().__init__()
    # Small RoBERTa language encoder trained on the custom bird BPE vocabulary.
    tokenizer = RobertaTokenizerFast.from_pretrained('./bird_bpe_vocab',
                                                     max_len=256)
    _config = RobertaConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=512,
        num_hidden_layers=4,
        num_attention_heads=8,
        max_position_embeddings=256,
        pad_token_id=1,
        eos_token_id=0,
        bos_token_id=2,
        output_attentions=False,
        output_hidden_states=False)
    _model = RobertaForMaskedLM(_config)
    _model.load_state_dict(
        torch.load('bert_small/checkpoint-1100/pytorch_model.bin'))
    _model.eval()

    self.tokenizer = tokenizer
    self._model = _model
    self.device = device
    self.pad_token = 0
    self.batch_size = cfg.batch_size
    # Optional projection of the 512-d text features into the latent space.
    self.proj = None
    if cfg.proj_lang:
        self.proj = nn.Sequential(
            EqualisedLinearLayer(512, cfg.latent_dim,
                                 weight_scaling=cfg.weight_scaling),
            nn.Tanh())
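# A minimal usage sketch (not part of the original file). It shows one way the
# tokenizer and the RoBERTa body (RobertaForMaskedLM.roberta, i.e. the encoder
# without the MLM head) could be combined into a per-caption embedding for the
# optional self.proj mapping. The function name `encode_captions` and the mean
# pooling are illustrative assumptions, not the author's method; it also
# assumes encoder._model has already been moved to encoder.device.
import torch

def encode_captions(encoder, captions):
    batch = encoder.tokenizer(captions, padding=True, truncation=True,
                              max_length=256, return_tensors='pt')
    with torch.no_grad():
        # Token-level hidden states from the transformer body, shape (B, T, 512).
        hidden = encoder._model.roberta(
            batch['input_ids'].to(encoder.device),
            attention_mask=batch['attention_mask'].to(encoder.device))[0]
    sentence = hidden.mean(dim=1)  # mean-pool tokens into one 512-d vector
    return encoder.proj(sentence) if encoder.proj is not None else sentence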
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)

    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta = RobertaForMaskedLM.from_pretrained('roberta-base', state_dict=state_dict)

    # # Initializing a RoBERTa configuration
    # config = RobertaConfig.from_pretrained('roberta-base')
    # # Initializing a model from the configuration
    # roberta = RobertaForMaskedLM(config)
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta.load_state_dict(state_dict)

    roberta = HappyROBERTA('roberta-large')
    config = RobertaConfig.from_pretrained('roberta-large')
    mlm = RobertaForMaskedLM(config)
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/save_step_230400/checkpoint.pt'
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/roberta_base_best_sample_from_sets/checkpoint.pt'
    checkpoint_path = '../data/finetune_data/roberta-large/save_step_57000/checkpoint.pt'
    state_dict = torch.load(checkpoint_path)["model"]
    mlm.load_state_dict(state_dict)
    mlm.eval()
    roberta.mlm = mlm

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)

    with open("../data/truism_data/physical_data_sentences_2.json", "r") as f:
        physical_sents = json.load(f)
    with open("../data/truism_data/physical_data_2.json", "r") as f:
        physical_config = json.load(f)
    with open("../data/finetune_data/sample_from_sets/test_keys.json", "r") as f:
        test_keys = json.load(f)

    # Keep only the test-split sentences for the physical dataset.
    phy_filtered = {}
    for key in test_keys['phy']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in phy_filtered.keys():
            phy_filtered[index] = {}
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = \
                physical_sents[index][ling_pert][asym_pert]
        elif ling_pert not in phy_filtered[index].keys():
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = \
                physical_sents[index][ling_pert][asym_pert]
        else:
            phy_filtered[index][ling_pert][asym_pert] = \
                physical_sents[index][ling_pert][asym_pert]

    # physical_sents = {k: physical_sents[k] for k in ('11', '16')}
    # physical_config = {k: physical_config[k] for k in ('11', '16')}

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=phy_filtered,
                             config=physical_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)
    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/physical_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)
    logger.info("finished saving physical dataset results")

    with open("../data/truism_data/material_data_sentences_2.json", "r") as f:
        material_sents = json.load(f)
    with open("../data/truism_data/material_data_2.json", "r") as f:
        material_config = json.load(f)

    # Keep only the test-split sentences for the material dataset.
    mat_filtered = {}
    for key in test_keys['mat']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in mat_filtered.keys():
            mat_filtered[index] = {}
            mat_filtered[index][ling_pert] = {}
            mat_filtered[index][ling_pert][asym_pert] = \
                material_sents[index][ling_pert][asym_pert]
        elif ling_pert not in mat_filtered[index].keys():
            mat_filtered[index][ling_pert] = {}
            mat_filtered[index][ling_pert][asym_pert] = \
                material_sents[index][ling_pert][asym_pert]
        else:
            mat_filtered[index][ling_pert][asym_pert] = \
                material_sents[index][ling_pert][asym_pert]

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=mat_filtered,
                             config=material_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)
    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/material_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)
    logger.info("finished saving material dataset results")

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)
    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    # Keep only the test-split sentences for the social dataset.
    soc_filtered = {}
    for key in test_keys['soc']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in soc_filtered.keys():
            soc_filtered[index] = {}
            soc_filtered[index][ling_pert] = {}
            soc_filtered[index][ling_pert][asym_pert] = \
                social_sents[index][ling_pert][asym_pert]
        elif ling_pert not in soc_filtered[index].keys():
            soc_filtered[index][ling_pert] = {}
            soc_filtered[index][ling_pert][asym_pert] = \
                social_sents[index][ling_pert][asym_pert]
        else:
            soc_filtered[index][ling_pert][asym_pert] = \
                social_sents[index][ling_pert][asym_pert]

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=soc_filtered,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)
    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/social_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)
    logger.info("finished saving social dataset results")
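# The three filtering loops above are identical apart from the dataset they
# touch. A possible refactoring sketch (the helper name `filter_by_keys` is
# ours, not from the original code):
def filter_by_keys(keys, sentences):
    """Rebuild the nested {index: {ling_pert: {asym_pert: sentence}}} mapping
    restricted to the given test keys."""
    filtered = {}
    for key in keys:
        index, ling_pert, asym_pert = key.split("-")[:3]
        filtered.setdefault(index, {}).setdefault(ling_pert, {})[asym_pert] = \
            sentences[index][ling_pert][asym_pert]
    return filtered

# Usage: phy_filtered = filter_by_keys(test_keys['phy'], physical_sents)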
class Roberta(object):
    def __init__(self, args):
        # self.dict_file = "{}/{}".format(args.roberta_model_dir, args.roberta_vocab_name)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        if args.model_path is not None:
            print("Testing CoLAKE...")
            print('loading model parameters from {}...'.format(args.model_path))
            config = RobertaConfig.from_pretrained('roberta-base',
                                                   type_vocab_size=3)
            self.model = RobertaForMaskedLM(config=config)
            states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
            self.model.load_state_dict(states_dict, strict=False)
        else:
            print("Testing RoBERTa baseline...")
            self.model = RobertaForMaskedLM.from_pretrained('roberta-base')
        self._build_vocab()
        self._init_inverse_vocab()
        self._model_device = 'cpu'
        self.max_sentence_length = args.max_sentence_length

    def _cuda(self):
        self.model.cuda()

    def _build_vocab(self):
        self.vocab = []
        for key in range(len(self.tokenizer)):
            value = self.tokenizer.decode([key])
            if value[0] == " ":  # if the token starts with a whitespace
                value = value.strip()
            else:
                # this is subword information
                value = "_{}_".format(value)
            if value in self.vocab:
                # print("WARNING: token '{}' is already in the vocab".format(value))
                # Disambiguate duplicate surface forms by appending the token id.
                value = "{}_{}".format(value, key)
            self.vocab.append(value)
        print("size of vocabulary: {}".format(len(self.vocab)))

    def _init_inverse_vocab(self):
        self.inverse_vocab = {w: i for i, w in enumerate(self.vocab)}

    def try_cuda(self):
        """Move model to GPU if one is available."""
        if torch.cuda.is_available():
            if self._model_device != 'cuda':
                self._cuda()
                self._model_device = 'cuda'
        else:
            print('No CUDA found')

    def init_indices_for_filter_logprobs(self, vocab_subset):
        index_list = []
        new_vocab_subset = []
        for word in vocab_subset:
            if word in self.inverse_vocab:
                inverse_id = self.inverse_vocab[word]
                index_list.append(inverse_id)
                new_vocab_subset.append(word)
            else:
                msg = "word {} from vocab_subset not in model vocabulary!".format(word)
                print("WARNING: {}".format(msg))
        indices = torch.as_tensor(index_list)
        return indices, index_list

    def filter_logprobs(self, log_probs, indices):
        new_log_probs = log_probs.index_select(dim=2, index=indices)
        return new_log_probs

    def get_id(self, input_string):
        # RoBERTa predicts ' London' and not 'London'
        string = " " + str(input_string).strip()
        tokens = self.tokenizer.encode(string, add_special_tokens=False)
        # return [element.item() for element in tokens.long().flatten()]
        return tokens

    def get_batch_generation(self, samples_list, try_cuda=True):
        if not samples_list:
            return None
        if try_cuda:
            self.try_cuda()

        tensor_list = []
        masked_indices_list = []
        max_len = 0
        output_tokens_list = []
        seq_len = []
        for sample in samples_list:
            masked_inputs_list = sample["masked_sentences"]
            tokens_list = [self.tokenizer.bos_token_id]
            for idx, masked_input in enumerate(masked_inputs_list):
                tokens_list.extend(
                    self.tokenizer.encode(" " + masked_input.strip(),
                                          add_special_tokens=False))
                tokens_list.append(self.tokenizer.eos_token_id)

            # tokens = torch.cat(tokens_list)[: self.max_sentence_length]
            tokens = torch.tensor(tokens_list)[:self.max_sentence_length]
            output_tokens_list.append(tokens.long().cpu().numpy())
            seq_len.append(len(tokens))
            if len(tokens) > max_len:
                max_len = len(tokens)
            tensor_list.append(tokens)
            # Record the position of every <mask> token in this sequence.
            masked_index = (tokens == self.tokenizer.mask_token_id).nonzero().numpy()
            for x in masked_index:
                masked_indices_list.append([x[0]])

        # Right-pad every sequence to the batch maximum.
        tokens_list = []
        for tokens in tensor_list:
            pad_length = max_len - len(tokens)
            if pad_length > 0:
                pad_tensor = torch.full([pad_length],
                                        self.tokenizer.pad_token_id,
                                        dtype=torch.int)
                tokens = torch.cat((tokens, pad_tensor.long()))
            tokens_list.append(tokens)
        batch_tokens = torch.stack(tokens_list)
        seq_len = torch.LongTensor(seq_len)
        attn_mask = seq_len_to_mask(seq_len)

        with torch.no_grad():
            # with utils.eval(self.model.model):
            self.model.eval()
            outputs = self.model(
                batch_tokens.long().to(device=self._model_device),
                attention_mask=attn_mask.to(device=self._model_device))
            # First element of the HF output tuple holds the MLM prediction scores.
            log_probs = outputs[0]
        return log_probs.cpu(), output_tokens_list, masked_indices_list
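# A minimal usage sketch for the wrapper above (assumptions: an `args`
# namespace carrying only the two fields the constructor reads, and the
# LAMA-style "masked_sentences" sample format used by get_batch_generation).
if __name__ == "__main__":
    from argparse import Namespace

    args = Namespace(model_path=None, max_sentence_length=100)
    lm = Roberta(args)
    samples = [{"masked_sentences": ["The capital of France is <mask>."]}]
    log_probs, token_ids, masked_indices = lm.get_batch_generation(samples)
    # log_probs is (batch, seq_len, vocab); pick the top token at the mask.
    top_id = log_probs[0, masked_indices[0][0]].argmax().item()
    print(lm.tokenizer.decode([top_id]))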
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments,
         ArchitectureArguments, CustomOthersArguments))
    (model_args, data_args, training_args, arch_args,
     custom_args) = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
        f"n_gpu: {training_args.n_gpu}, distributed training: "
        f"{bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}")

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    train_files = list(
        sorted(glob.glob(f'{data_args.train_dir}/*.{custom_args.ext}')))
    validation_files = list(
        sorted(glob.glob(f'{data_args.eval_dir}/*.{custom_args.ext}')))
    if len(train_files) > 1:
        logger.warning(
            f'Got {len(train_files)} train files; only the first one is used.')
        train_files = train_files[:1]
    if len(validation_files) > 1:
        logger.warning(
            f'Got {len(validation_files)} validation files; only the first one is used.')
        validation_files = validation_files[:1]

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can
    # concurrently download model & vocab.
    # Create config for LM model
    if model_args.tokenizer_type == 'ThaiRobertaTokenizer':
        tokenizer = ThaiRobertaTokenizer.from_pretrained(
            model_args.tokenizer_name_or_path,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.tokenizer_type == 'ThaiWordsNewmmTokenizer':
        tokenizer = ThaiWordsNewmmTokenizer.from_pretrained(
            model_args.tokenizer_name_or_path)
    elif model_args.tokenizer_type == 'ThaiWordsSyllableTokenizer':
        tokenizer = ThaiWordsSyllableTokenizer.from_pretrained(
            model_args.tokenizer_name_or_path)
    elif model_args.tokenizer_type == 'FakeSefrCutTokenizer':
        tokenizer = FakeSefrCutTokenizer.from_pretrained(
            model_args.tokenizer_name_or_path)
    else:
        raise NotImplementedError(
            f'tokenizer_type {model_args.tokenizer_type} is not implemented.')

    if custom_args.ext == 'txt':
        if len(train_files) > 1 or len(validation_files) > 1:
            raise NotImplementedError('only one txt file is supported for now')
        if data_args.datasets_type == 'MemmapLineByLineTextDataset':
            datasets = {
                'train':
                MemmapLineByLineTextDataset(
                    tokenizer, train_files[0], data_args.max_seq_length,
                    os.path.join(data_args.datasets_cache_dir, 'train'),
                    custom_args.tokenize_chunksize, data_args.overwrite_cache),
                'validation':
                MemmapLineByLineTextDataset(
                    tokenizer, validation_files[0], data_args.max_seq_length,
                    os.path.join(data_args.datasets_cache_dir, 'validation'),
                    custom_args.tokenize_chunksize, data_args.overwrite_cache)
            }
        elif data_args.datasets_type == 'MemmapConcatFullSentenceTextDataset':
            datasets = {
                'train':
                MemmapConcatFullSentenceTextDataset(
                    tokenizer, train_files[0], data_args.max_seq_length,
                    os.path.join(data_args.datasets_cache_dir, 'train'),
                    custom_args.tokenize_chunksize, data_args.overwrite_cache),
                'validation':
                PaddedDataset(
                    MemmapConcatFullSentenceTextDataset(
                        tokenizer, validation_files[0],
                        data_args.max_seq_length,
                        os.path.join(data_args.datasets_cache_dir, 'validation'),
                        custom_args.tokenize_chunksize,
                        data_args.overwrite_cache),
                    tokenizer.pad_token_id, data_args.max_seq_length)
            }
        else:
            raise NotImplementedError(
                f'No specified datasets type {data_args.datasets_type}')
    else:
        raise NotImplementedError(
            f'ext {custom_args.ext} is not supported yet, '
            f'but it should be possible to support it.')

    if custom_args.build_dataset_only:
        return

    ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
        "roberta-base": "../roberta_config/th-roberta-base-config.json",
        "roberta-large": "../roberta_config/th-roberta-large-config.json",
    }
    config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP[
            arch_args.architecture],
        vocab_size=tokenizer.vocab_size)

    # Some sanity checks
    tokenizer_and_model_config_mismatch(config, tokenizer)
    block_size_exceed_max_position_embeddings(config, data_args.max_seq_length)

    # Initialize model
    model = RobertaForMaskedLM(config=config)

    if custom_args.model_dir is not None:
        model_path = os.path.join(custom_args.model_dir, 'pytorch_model.bin')
        logger.info(f'[INFO] Load pretrained model (state_dict) from {model_path}')
        # Use strict=False to keep the model compatible with checkpoints from
        # older transformers versions, so we can bump the transformers version
        # up and use the new datasets library; see
        # https://github.com/huggingface/transformers/issues/6882
        # The program itself will run, but does it have any side effects?
        # Maybe a bad idea?
        try:
            model.load_state_dict(state_dict=torch.load(model_path))
        except RuntimeError:
            logger.info('[INFO] RuntimeError, try loading with strict=False instead.')
            model.load_state_dict(state_dict=torch.load(model_path),
                                  strict=False)
            # Without strict=False this raises, since the keys do not match:
            # RuntimeError: Error(s) in loading state_dict for RobertaForMaskedLM:
            #   Missing key(s) in state_dict: "roberta.embeddings.position_ids".
            #   Unexpected key(s) in state_dict: "roberta.pooler.dense.weight",
            #   "roberta.pooler.dense.bias".

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=datasets["train"],
                      eval_dataset=datasets["validation"],
                      data_collator=data_collator)

    # Training
    if custom_args.model_dir is not None:
        trainer.train(model_path=custom_args.model_dir)
    else:
        trainer.train()

    # save
    output_model_dir = os.path.join(training_args.output_dir, 'roberta_thai')
    logger.info("Save final model to '%s'.", output_model_dir)
    trainer.save_model(output_model_dir)
    if trainer.is_world_process_zero():
        output_tokenizer_dir = os.path.join(training_args.output_dir,
                                            'roberta_thai_tokenizer')
        tokenizer.save_pretrained(output_tokenizer_dir)

    # evaluate
    trainer.evaluate()
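# A small, self-contained sketch (not from the original file) of what
# DataCollatorForLanguageModeling does with mlm_probability: it samples
# roughly mlm_probability of the token positions (of those, 80% become
# <mask>, 10% a random token, 10% unchanged) and sets labels to -100
# everywhere else, so the MLM loss is computed only at the sampled positions.
from transformers import DataCollatorForLanguageModeling, RobertaTokenizer

tok = RobertaTokenizer.from_pretrained('roberta-base')
collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm_probability=0.15)
batch = collator([tok('hello world, this is a test')['input_ids']])
print(batch['input_ids'])  # some positions replaced by tok.mask_token_id
print(batch['labels'])     # -100 except at the sampled positions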