def create_and_check_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                   sequence_labels, token_labels, choice_labels):
    model = RobertaForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                   labels=token_labels)
    self.parent.assertEqual(result.logits.shape,
                            (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                           sequence_labels, token_labels, choice_labels):
    model = RobertaForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    # Older-style API: with masked_lm_labels the model returns a (loss, prediction_scores) tuple.
    loss, prediction_scores = model(input_ids, attention_mask=input_mask,
                                    token_type_ids=token_type_ids,
                                    masked_lm_labels=token_labels)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(list(result["prediction_scores"].size()),
                                [self.batch_size, self.seq_length, self.vocab_size])
    self.check_loss_output(result)
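# Usage sketch (assumption, not from the original file): checkers like the two above are
# normally driven from a unittest test method on the enclosing model tester, with inputs
# produced by a prepare_config_and_inputs-style helper. The helper and test names below
# are illustrative only.
def test_for_masked_lm(self):
    config_and_inputs = self.model_tester.prepare_config_and_inputs()
    self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)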
def main():
    random.seed(1012)
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)

    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta = RobertaForMaskedLM.from_pretrained('roberta-base', state_dict=state_dict)

    # # Initializing a RoBERTa configuration
    # config = RobertaConfig.from_pretrained('roberta-base')
    # # Initializing a model from the configuration
    # roberta = RobertaForMaskedLM(config)
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta.load_state_dict(state_dict)

    roberta = HappyROBERTA('roberta-large')
    config = RobertaConfig.from_pretrained('roberta-large')
    mlm = RobertaForMaskedLM(config)

    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/save_step_230400/checkpoint.pt'
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/roberta_base_best_sample_from_sets/checkpoint.pt'
    checkpoint_path = '../data/finetune_data/roberta-large/save_step_57000/checkpoint.pt'
    state_dict = torch.load(checkpoint_path)["model"]
    mlm.load_state_dict(state_dict)
    mlm.eval()
    roberta.mlm = mlm

    fictitious_entities = proc.generate_pairs_of_random_strings(number_of_pairs=100,
                                                                min_length=3,
                                                                max_length=12,
                                                                character_set=chars)

    with open("../data/truism_data/physical_data_sentences_2.json", "r") as f:
        physical_sents = json.load(f)

    with open("../data/truism_data/physical_data_2.json", "r") as f:
        physical_config = json.load(f)

    with open("../data/finetune_data/sample_from_sets/test_keys.json", "r") as f:
        test_keys = json.load(f)

    phy_filtered = {}
    for key in test_keys['phy']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in phy_filtered.keys():
            phy_filtered[index] = {}
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][ling_pert][asym_pert]
        elif ling_pert not in phy_filtered[index].keys():
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][ling_pert][asym_pert]
        else:
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][ling_pert][asym_pert]

    # physical_sents = {k: physical_sents[k] for k in ('11', '16')}
    # physical_config = {k: physical_config[k] for k in ('11', '16')}

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=phy_filtered,
                             config=physical_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv("../data/masked_word_result_data/roberta/sample_from_set/physical_perf_ft19_new_{}.csv".format(number_of_entity_trials),
                     index=False)

    logger.info("finished saving physical dataset results")

    with open("../data/truism_data/material_data_sentences_2.json", "r") as f:
        material_sents = json.load(f)

    with open("../data/truism_data/material_data_2.json", "r") as f:
        material_config = json.load(f)

    mat_filtered = {}
    for key in test_keys['mat']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in mat_filtered.keys():
            mat_filtered[index] = {}
            mat_filtered[index][ling_pert] = {}
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][ling_pert][asym_pert]
        elif ling_pert not in mat_filtered[index].keys():
            mat_filtered[index][ling_pert] = {}
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][ling_pert][asym_pert]
        else:
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][ling_pert][asym_pert]

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=mat_filtered,
                             config=material_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv("../data/masked_word_result_data/roberta/sample_from_set/material_perf_ft19_new_{}.csv".format(number_of_entity_trials),
                     index=False)

    logger.info("finished saving material dataset results")

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)

    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    soc_filtered = {}
    for key in test_keys['soc']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in soc_filtered.keys():
            soc_filtered[index] = {}
            soc_filtered[index][ling_pert] = {}
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][ling_pert][asym_pert]
        elif ling_pert not in soc_filtered[index].keys():
            soc_filtered[index][ling_pert] = {}
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][ling_pert][asym_pert]
        else:
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][ling_pert][asym_pert]

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=soc_filtered,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv("../data/masked_word_result_data/roberta/sample_from_set/social_perf_ft19_new_{}.csv".format(number_of_entity_trials),
                     index=False)

    logger.info("finished saving social dataset results")
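# Entry point and import sketch (assumptions; neither is shown in the original snippet).
# main() relies on the standard library (json, logging, random, string), torch, the
# transformers RoBERTa classes, the HappyROBERTA wrapper (assumed to come from the
# happy-transformer package), and project-local helpers `proc` and `run_pipeline`,
# whose module names below are hypothetical:
#
#     import json, logging, random, string
#     import torch
#     from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
#     from happytransformer import HappyROBERTA   # assumed source of HappyROBERTA
#     import utils as proc                        # hypothetical: provides generate_pairs_of_random_strings
#     from pipeline import run_pipeline           # hypothetical
#
if __name__ == "__main__":
    main()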
class Roberta(object):
    def __init__(self, args):
        # self.dict_file = "{}/{}".format(args.roberta_model_dir, args.roberta_vocab_name)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        if args.model_path is not None:
            print("Testing CoLAKE...")
            print('loading model parameters from {}...'.format(args.model_path))
            config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
            self.model = RobertaForMaskedLM(config=config)
            states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
            self.model.load_state_dict(states_dict, strict=False)
        else:
            print("Testing RoBERTa baseline...")
            self.model = RobertaForMaskedLM.from_pretrained('roberta-base')
        self._build_vocab()
        self._init_inverse_vocab()
        self._model_device = 'cpu'
        self.max_sentence_length = args.max_sentence_length

    def _cuda(self):
        self.model.cuda()

    def _build_vocab(self):
        self.vocab = []
        for key in range(len(self.tokenizer)):
            value = self.tokenizer.decode([key])
            if value[0] == " ":  # the token starts with a whitespace
                value = value.strip()
            else:  # this is subword information
                value = "_{}_".format(value)

            if value in self.vocab:
                # print("WARNING: token '{}' is already in the vocab".format(value))
                value = "{}_{}".format(value, key)

            self.vocab.append(value)
        print("size of vocabulary: {}".format(len(self.vocab)))

    def _init_inverse_vocab(self):
        self.inverse_vocab = {w: i for i, w in enumerate(self.vocab)}

    def try_cuda(self):
        """Move model to GPU if one is available."""
        if torch.cuda.is_available():
            if self._model_device != 'cuda':
                self._cuda()
                self._model_device = 'cuda'
        else:
            print('No CUDA found')

    def init_indices_for_filter_logprobs(self, vocab_subset):
        index_list = []
        new_vocab_subset = []
        for word in vocab_subset:
            if word in self.inverse_vocab:
                inverse_id = self.inverse_vocab[word]
                index_list.append(inverse_id)
                new_vocab_subset.append(word)
            else:
                msg = "word {} from vocab_subset not in model vocabulary!".format(word)
                print("WARNING: {}".format(msg))

        indices = torch.as_tensor(index_list)
        return indices, index_list

    def filter_logprobs(self, log_probs, indices):
        new_log_probs = log_probs.index_select(dim=2, index=indices)
        return new_log_probs

    def get_id(self, input_string):
        # RoBERTa predicts ' London' (with a leading space) and not 'London'
        string = " " + str(input_string).strip()
        tokens = self.tokenizer.encode(string, add_special_tokens=False)
        # return [element.item() for element in tokens.long().flatten()]
        return tokens

    def get_batch_generation(self, samples_list, try_cuda=True):
        if not samples_list:
            return None
        if try_cuda:
            self.try_cuda()

        tensor_list = []
        masked_indices_list = []
        max_len = 0
        output_tokens_list = []
        seq_len = []
        for sample in samples_list:
            masked_inputs_list = sample["masked_sentences"]

            tokens_list = [self.tokenizer.bos_token_id]
            for idx, masked_input in enumerate(masked_inputs_list):
                tokens_list.extend(self.tokenizer.encode(" " + masked_input.strip(),
                                                         add_special_tokens=False))
                tokens_list.append(self.tokenizer.eos_token_id)

            # tokens = torch.cat(tokens_list)[: self.max_sentence_length]
            tokens = torch.tensor(tokens_list)[:self.max_sentence_length]
            output_tokens_list.append(tokens.long().cpu().numpy())

            seq_len.append(len(tokens))
            if len(tokens) > max_len:
                max_len = len(tokens)
            tensor_list.append(tokens)
            masked_index = (tokens == self.tokenizer.mask_token_id).nonzero().numpy()
            for x in masked_index:
                masked_indices_list.append([x[0]])

        tokens_list = []
        for tokens in tensor_list:
            pad_length = max_len - len(tokens)
            if pad_length > 0:
                pad_tensor = torch.full([pad_length],
                                        self.tokenizer.pad_token_id,
                                        dtype=torch.int)
                tokens = torch.cat((tokens, pad_tensor.long()))
            tokens_list.append(tokens)

        batch_tokens = torch.stack(tokens_list)
        seq_len = torch.LongTensor(seq_len)
        attn_mask = seq_len_to_mask(seq_len)

        with torch.no_grad():
            # with utils.eval(self.model.model):
            self.model.eval()
            outputs = self.model(batch_tokens.long().to(device=self._model_device),
                                 attention_mask=attn_mask.to(device=self._model_device))
            log_probs = outputs[0]

        return log_probs.cpu(), output_tokens_list, masked_indices_list
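# Usage sketch (illustrative, not part of the original file): build the wrapper from a
# simple argument namespace and score one cloze-style sample. The Namespace fields and
# their values are assumptions; with model_path=None the plain roberta-base baseline is
# loaded, and try_cuda=False keeps everything on CPU.
from argparse import Namespace

args = Namespace(model_path=None, max_sentence_length=100)
lm = Roberta(args)
samples = [{"masked_sentences": ["The capital of France is <mask> ."]}]
log_probs, token_ids, masked_indices = lm.get_batch_generation(samples, try_cuda=False)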