def main2():
    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    bart = MyBartWithAdapter(config)
    bart_old = BartModel.from_pretrained("facebook/bart-base")
    bart.model.load_state_dict(bart_old.state_dict(), strict=False)

    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    # config.adapt_layer_norm = True
    generator = ParameterGenerator(config)
    output = generator(torch.tensor([[1, 2, 3]]))
    print(output)
    print(output[0].size())

    growingbart = GrowingBart(bart, generator, config)
    output = growingbart(
        torch.tensor([[4, 1, 3, 4, 3, 5, 6, 3, 2]]),
        torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
        torch.tensor([[4, 1, 3, 4, 3, 5, 6, 3, 2]]),
        torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
        torch.tensor([[4, 1, 3, 4, 3, 5, 6, 3, 2]]),
        torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
    )
    print(output)

    loss = output[0].sum(-1).sum(-1).sum(-1)
    print(loss)
    loss.backward()
def main():
    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    bart = MyBartWithAdapter(config)
    bart_old = BartModel.from_pretrained("facebook/bart-base")
    ret = bart.model.load_state_dict(bart_old.state_dict(), strict=False)
    print(ret)
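# Side note with a minimal sketch (report_partial_load is a hypothetical helper, not part of
# the snippets above): load_state_dict(strict=False) returns a named tuple whose missing_keys /
# unexpected_keys fields make it easy to confirm that only the newly added adapter parameters
# were left at their fresh initialisation.
def report_partial_load(module, state_dict):
    ret = module.load_state_dict(state_dict, strict=False)
    print("missing keys (left at fresh init):", ret.missing_keys)
    print("unexpected keys (ignored):", ret.unexpected_keys)
    return ret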
def __init__(self, model_name):
    super().__init__()
    bart = BartModel.from_pretrained(model_name)
    self.hidden_dim = bart.config.hidden_size
    self.bart_encoder = bart.encoder
    self.bart_encoder.embed_tokens = lambda x: x
    self.bart_encoder.embed_positions = lambda x: torch.zeros(
        (x.shape[0], x.shape[1], self.hidden_dim), dtype=torch.float32)
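# Hedged illustration (not from the original repo): the two overrides above turn the encoder's
# embedding lookups into pass-throughs, so the encoder can be driven with precomputed
# embeddings of shape (batch, seq_len, hidden_dim) instead of token ids (whether the encoder
# forward accepts such a tensor directly depends on the transformers version in use).
import torch

hidden_dim = 768  # hidden size of facebook/bart-base
embed_tokens = lambda x: x  # identity: inputs are already embeddings
embed_positions = lambda x: torch.zeros((x.shape[0], x.shape[1], hidden_dim), dtype=torch.float32)

precomputed = torch.randn(2, 16, hidden_dim)
assert torch.equal(embed_tokens(precomputed), precomputed)
assert embed_positions(precomputed).shape == precomputed.shape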
def __init__(self):
    super(MoralClassifier, self).__init__()
    # self.l1 = DistilBertModel.from_pretrained('distilbert-base-uncased')
    self.l1 = BartModel.from_pretrained('facebook/bart-large-cnn')
    # Pooler
    self.l2 = torch.nn.Linear(1024, 1024)
    self.act = torch.nn.Tanh()
    # Classifier
    self.l3 = torch.nn.Dropout(0.3)
    self.l4 = torch.nn.Linear(1024, 11)  # 11 categories
def test_inference_no_head(self):
    model = BartModel.from_pretrained("facebook/bart-large").to(torch_device)
    input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
    inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
    with torch.no_grad():
        output = model(**inputs_dict)[0]
    expected_shape = torch.Size((1, 11, 1024))
    self.assertEqual(output.shape, expected_shape)
    expected_slice = torch.tensor(
        [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
        device=torch_device,
    )
    self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
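# Hedged sketch of the helper the test relies on: in the transformers test suite,
# prepare_bart_inputs_dict essentially pairs the ids with a padding mask. The function below is
# an assumed reconstruction for illustration, not the library's own code.
def prepare_bart_inputs_dict_sketch(config, input_ids, attention_mask=None):
    if attention_mask is None:
        attention_mask = input_ids.ne(config.pad_token_id)  # mask out padding positions
    return {"input_ids": input_ids, "attention_mask": attention_mask}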
def __init__(self):
    super(Model, self).__init__()
    self.model = BartModel.from_pretrained(model_config.pretrain_model_path)
    self.config = self.model.config
    self.classification_head = BartClassificationHead(
        self.config.d_model,
        self.config.d_model,
        config.num_labels,
        self.config.classif_dropout,
    )
    self.model._init_weights(self.classification_head.dense)
    self.model._init_weights(self.classification_head.out_proj)
def __init__(self, args):
    super(MoralClassifier, self).__init__()
    self.hparams = args
    self.l1 = BartModel.from_pretrained('facebook/bart-large-cnn')
    # freeze bart weights
    # for param in self.l1.parameters():
    #     param.requires_grad = False
    # Pooler
    self.l2 = torch.nn.Linear(1024, 1024)
    self.act = torch.nn.Tanh()
    # Classifier
    self.l3 = torch.nn.Dropout(0.2)
    self.l4 = torch.nn.Linear(1024, 10)  # 10 categories
def __init__(self, model_name, use_pretrained_embeddings=False):
    super().__init__()
    # use_pretrained_embeddings is a flag indicating the bart encoder should be used in its entirety.
    if use_pretrained_embeddings:
        # will use the entire bart encoder, including all embeddings
        bart = PretrainedTransformerEmbedder(model_name, sub_module="encoder")
        self.hidden_dim = bart.config.hidden_size
        self.bart_encoder = bart.transformer_model
    else:
        bart = BartModel.from_pretrained(model_name)
        self.hidden_dim = bart.config.hidden_size
        self.bart_encoder = bart.encoder
        # bypass the embedding layers: inputs are expected to be precomputed embeddings
        self.bart_encoder.embed_tokens = lambda x: x
        self.bart_encoder.embed_positions = lambda x: torch.zeros(
            (x.shape[0], x.shape[1], self.hidden_dim), dtype=torch.float32)
def __init__(self, large, temp_dir, finetune=False, bart=False):
    # def __init__(self, large, temp_dir, finetune=False):
    super(Bert, self).__init__()
    if large:
        self.model = BertModel.from_pretrained('bert-large-uncased', cache_dir=temp_dir)
    else:
        # self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
        if bart:
            self.model = BartModel.from_pretrained('/home/ybai/downloads/bart',
                                                   cache_dir=temp_dir, local_files_only=True)
            # self.model = BartForConditionalGeneration.from_pretrained('/home/ybai/downloads/bart',
            #                                                           cache_dir=temp_dir, local_files_only=True)
        else:
            self.model = BertModel.from_pretrained('bert-base-multilingual-uncased',
                                                   cache_dir=temp_dir, local_files_only=False)
    self.finetune = finetune
def __init__(self, args, use_mask=True):
    super(OneHotMoralClassifier, self).__init__()
    self.hparams = args
    self.bart = BartModel.from_pretrained('facebook/bart-large-cnn')
    self.use_mask = use_mask
    self.vocab_size = 50264
    self.onehot_embeddings = nn.Linear(self.vocab_size, 1024, bias=False)
    self.onehot_embeddings.weight = nn.Parameter(self.build_lookups())
    # self.bart.encoder.embed_tokens = nn.Identity()
    # freeze bert weights
    # self.onehot_embeddings.requires_grad = False
    # self.onehot_embeddings.weight.requires_grad = False
    # for param in self.bart.parameters():
    #     param.requires_grad = False
    # Pooler
    self.l2 = torch.nn.Linear(1024, 1024)
    self.act = torch.nn.Tanh()
    # Classifier
    self.l3 = torch.nn.Dropout(0.2)
    self.l4 = torch.nn.Linear(1024, 10)  # 10 categories
def __init__(self, n_vocab=50264):
    self.n_vocab = n_vocab
    self.true_embedding = BartModel.from_pretrained(
        'facebook/bart-large-cnn').encoder.embed_tokens
def Seq2Seq(df):
    model_type = 'facebook/bart-large'
    tokenizer = BartTokenizer.from_pretrained(model_type)
    model = BartModel.from_pretrained(model_type)
    mask_model = BartForConditionalGeneration.from_pretrained(model_type)
    sep_token = '</s>'
    mask_token = '<mask>'
    mask_id = tokenizer(mask_token, return_tensors='pt')['input_ids'][0][1]
    sep_id = tokenizer(sep_token, return_tensors='pt')['input_ids'][0][1]
    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)

    df['mask_text'] = 0
    df['auxiliary_text'] = 0
    for i in range(len(df)):
        aspect = df['aspect'].iloc[i]
        sentiment = df['sentiment'].iloc[i]
        if aspect == 'NULL' or isinstance(aspect, (int, float)):
            aspect = 'aspect'
        if DPM_type == 'Senti':
            mask_sent = 'the polarity of the ' + aspect + ' is ' + mask_token + ' ' + sep_token + ' '
            auxiliary_sent = 'the polarity of the ' + aspect + ' is ' + sentiment + ' ' + sep_token + ' '
        elif DPM_type == 'AS':
            mask_sent = 'the polarity of the ' + mask_token + ' is ' + sentiment + ' ' + sep_token + ' '
            auxiliary_sent = 'the polarity of the ' + aspect + ' is ' + sentiment + ' ' + sep_token + ' '
        df['mask_text'].iloc[i] = mask_sent + df['text'].iloc[i]
        df['auxiliary_text'].iloc[i] = auxiliary_sent + df['text'].iloc[i]

    df['distance'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        tokenized = df['mask_text'][i:i + 1].apply(
            lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=MAX_LEN, truncation=True))
        sep_index = tokenized[i].index(sep_id)
        mask_index = tokenized[i].index(mask_id)
        padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long",
                               value=0, truncating="post", padding="post")
        attention_mask = np.where(padded != 0, 1, 0)
        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)
        with torch.no_grad():
            last_hidden_states = model(input_ids, attention_mask=attention_mask)
        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu().numpy()

        distance = []
        for pertubed_index in range(sep_index + 1, MAX_LEN):
            padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long",
                                   value=0, truncating="post", padding="post")
            if padded[0][pertubed_index] != 0 and padded[0][pertubed_index] != sep_id:
                # print(padded.shape)
                cur_id = padded[0][pertubed_index]
                padded[0][pertubed_index] = mask_id
                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))
        df['distance'].iloc[i] = distance

    df['perturbed_mask_index'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(np.array(df['distance'].iloc[i])[:, 1], std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if df['distance'].iloc[i][dis_index][1] < mask_threshold:
                perturbed_mask_index.append(dis_index)
        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index

    df['augment_token_id'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        tokenized = torch.Tensor(tokenized).unsqueeze(0).to(torch.int64).to(device)
        augment_tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        mask_tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        sep_index = mask_tokenized.index(sep_id)
        for j in range(len(df['perturbed_mask_index'].iloc[i])):
            perturbed_mask_index = df['perturbed_mask_index'].iloc[i][j] + sep_index + 1
            mask_tokenized[perturbed_mask_index] = mask_id
        mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(torch.int64).to(device)
        logits = mask_model(mask_tokenized).logits
        for j in range(len(df['perturbed_mask_index'].iloc[i])):
            perturbed_mask_index = df['perturbed_mask_index'].iloc[i][j] + sep_index + 1
            probs = logits[0, perturbed_mask_index].softmax(dim=0)
            values, predictions = probs.topk(1)
            augment_tokenized[perturbed_mask_index] = int(predictions.cpu().numpy())
        df['augment_token_id'].iloc[i] = augment_tokenized

    df['augment_text'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        sep_index = df['augment_token_id'].iloc[i].index(sep_id)
        df['augment_text'].iloc[i] = tokenizer.decode(
            df['augment_token_id'].iloc[i][sep_index + 1:-1])
    return df
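# Hedged sketches of the helpers Seq2Seq assumes but does not define here (mask_embedding, dist,
# calculate_threshold). These are illustrative reconstructions based only on how they are called
# above, not the original implementations.
import numpy as np
import torch


def mask_embedding(model, padded, mask_index):
    # re-encode the perturbed ids and pull out the hidden state at the masked position
    input_ids = torch.tensor(padded).to(next(model.parameters()).device)
    with torch.no_grad():
        hidden = model(input_ids)[0]
    return hidden[:, mask_index, :].cpu().numpy()


def dist(a, b):
    # Euclidean distance between two mask embeddings (one plausible choice of metric)
    return float(np.linalg.norm(a - b))


def calculate_threshold(distances, std_strength):
    # one plausible rule: tokens whose distance falls below mean - std_strength * std are maskable
    distances = np.asarray(distances, dtype=float)
    return distances.mean() - std_strength * distances.std()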
def get_kobart_model():
    return BartModel.from_pretrained("hyunwoongko/kobart")
def _load_model(self, model_name: str):
    # note: the membership test needs a tuple; a bare string would match any substring
    if model_name in ("facebook/mbart-large-cc25",):
        return BartModel.from_pretrained(model_name, config=self.config).eval()
    return AutoModel.from_pretrained(model_name, config=self.config).eval()
print("==== preparing data ====") make_path(args.cache_dir) tokenizer = BartTokenizer.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir) with open('synt_vocab.pkl', 'rb') as f: synt_vocab = pickle.load(f) dataset = prepare_dataset(para_data, tokenizer, num) print("==== loading model ====") config = BartConfig.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir) config.word_dropout = args.word_dropout config.max_sent_len = args.max_sent_len config.max_synt_len = args.max_synt_len bart = BartModel.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir) model = ParaBart(config) model.load_state_dict(bart.state_dict(), strict=False) model.zero_grad() del bart no_decay_params = [] no_decay_fast_params = [] fast_params = [] all_other_params = [] adv_no_decay_params = [] adv_all_other_params = [] for n, p in model.named_parameters(): if 'adv' in n:
def load_model(config, checkpoint):
    args = config['args']
    labels = load_label(args.label_path)
    label_size = len(labels)
    config['labels'] = labels

    if config['emb_class'] == 'glove':
        if config['enc_class'] == 'gnb':
            model = TextGloveGNB(config, args.embedding_path, label_size)
        if config['enc_class'] == 'cnn':
            model = TextGloveCNN(config, args.embedding_path, label_size, emb_non_trainable=True)
        if config['enc_class'] == 'densenet-cnn':
            model = TextGloveDensenetCNN(config, args.embedding_path, label_size, emb_non_trainable=True)
        if config['enc_class'] == 'densenet-dsa':
            model = TextGloveDensenetDSA(config, args.embedding_path, label_size, emb_non_trainable=True)
    else:
        if config['emb_class'] == 'bart' and config['use_kobart']:
            from transformers import BartModel
            from kobart import get_kobart_tokenizer, get_pytorch_kobart_model
            bert_tokenizer = get_kobart_tokenizer()
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = BartModel.from_pretrained(get_pytorch_kobart_model())
            bert_config = bert_model.config
        elif config['emb_class'] in ['gpt']:
            bert_tokenizer = AutoTokenizer.from_pretrained(args.bert_output_dir)
            bert_tokenizer.bos_token = '<|startoftext|>'
            bert_tokenizer.eos_token = '<|endoftext|>'
            bert_tokenizer.cls_token = '<|startoftext|>'
            bert_tokenizer.sep_token = '<|endoftext|>'
            bert_tokenizer.pad_token = '<|pad|>'
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = AutoModel.from_pretrained(args.bert_output_dir)
        elif config['emb_class'] in ['t5']:
            from transformers import T5EncoderModel
            bert_tokenizer = AutoTokenizer.from_pretrained(args.bert_output_dir)
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = T5EncoderModel(bert_config)
        else:
            bert_tokenizer = AutoTokenizer.from_pretrained(args.bert_output_dir)
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = AutoModel.from_config(bert_config)
        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        if config['enc_class'] == 'densenet-cnn':
            ModelClass = TextBertDensenetCNN
        model = ModelClass(config, bert_config, bert_model, bert_tokenizer, label_size)

    if args.enable_qat:
        assert args.device == 'cpu'
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        '''
        # fuse if applicable
        # model = torch.quantization.fuse_modules(model, [['']])
        '''
        model = torch.quantization.prepare_qat(model)
        model.eval()
        model.to('cpu')
        logger.info("[Convert to quantized model with device=cpu]")
        model = torch.quantization.convert(model)
    if args.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        qconfig_dict = {"": torch.quantization.get_default_qat_qconfig('fbgemm')}
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)
        logger.info("[Convert to quantized model]")
        model = quantize_fx.convert_fx(model)

    if args.enable_diffq:
        quantizer = DiffQuantizer(model)
        config['quantizer'] = quantizer
        quantizer.restore_quantized_state(checkpoint)
    else:
        model.load_state_dict(checkpoint)
    model = model.to(args.device)
    '''
    for name, param in model.named_parameters():
        print(name, param.data, param.device, param.requires_grad)
    '''
    logger.info("[model] :\n{}".format(model.__str__()))
    logger.info("[Model loaded]")
    return model
def test_model_from_pretrained(self):
    # Forces 1.6GB download from S3 for each model
    for model_name in BART_PRETRAINED_MODEL_ARCHIVE_LIST:
        model = BartModel.from_pretrained(model_name)
        self.assertIsNotNone(model)
def tokenizer_and_model(type_embedding):
    #########
    # PORTUGUESE
    #########
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split('_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            path = 'bert-base-multilingual-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
        special_tokens_dict = {'additional_special_tokens': ['[USER]', '[SYSTEM]']}
        orig_num_tokens = len(tokenizer)
        num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
        total_num_tokens = orig_num_tokens + num_added_tokens
        model.resize_token_embeddings(total_num_tokens)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    #########
    # ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        # load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        # load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        # load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        # load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    return tokenizer, model
def build_lookups(self):
    embeddings = BartModel.from_pretrained('facebook/bart-large-cnn').encoder.embed_tokens
    ids = torch.LongTensor([i for i in range(self.vocab_size)])
    return torch.transpose(embeddings(ids), 0, 1).detach()
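# Hedged illustration (standalone; sizes are shrunk for the example, the classifier above uses
# vocab_size=50264 and hidden=1024): copying the transposed embedding table into a bias-free
# Linear(vocab_size, hidden) makes a one-hot input reproduce the ordinary embedding lookup,
# which is how build_lookups is consumed by the OneHotMoralClassifier constructor.
import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, hidden = 1000, 16
embed_tokens = nn.Embedding(vocab_size, hidden)          # stands in for BART's embed_tokens
onehot_proj = nn.Linear(vocab_size, hidden, bias=False)
with torch.no_grad():
    onehot_proj.weight.copy_(embed_tokens.weight.t())     # Linear weight shape: (hidden, vocab_size)

token_id = torch.tensor([42])
one_hot = F.one_hot(token_id, vocab_size).float()
assert torch.allclose(onehot_proj(one_hot), embed_tokens(token_id), atol=1e-6)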
def get_embedding(type_embedding, data):
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split('_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            data = [x.lower() for x in data]
            path = 'bert-base-multilingual-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    #########
    # ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        # load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        # load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        # load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        # load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True)

    # Set the device to GPU (cuda) if available, otherwise stick with CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    list_of_four_last_embeddings = []
    list_of_mean = []

    for l in data:
        # Convert the string "granola bars" to tokenized vocabulary IDs
        input_ids = tokenizer.encode(l)
        # print(input_ids)
        # Convert the list of IDs to a tensor of IDs
        input_ids = torch.LongTensor(input_ids)
        # print(input_ids)
        model = model.to(device)
        input_ids = input_ids.to(device)
        # print(input_ids)
        model.eval()
        # unsqueeze IDs to get batch size of 1 as added dimension
        input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            out = model(input_ids=input_ids)
        # we only want the hidden_states
        if type_embedding == 'xlm':
            hidden_states = out[1]
        else:
            hidden_states = out[2]
        # mean of layers
        sentence_embedding = torch.mean(hidden_states[-1], dim=1).squeeze()
        list_of_mean.append(sentence_embedding.tolist())
        # get last four layers
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        # cast layers to a tuple and concatenate over the last dimension
        cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)
        # take the mean of the concatenated vector over the token dimension
        cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()
        list_of_four_last_embeddings.append(cat_sentence_embedding.tolist())

    # print('list of four last embeddings', np.array(list_of_four_last_embeddings).shape)
    # print('list of mean', np.array(list_of_mean).shape)
    return list_of_mean, list_of_four_last_embeddings
def __init__(self):
    super(BART, self).__init__()
    self.bart = BartModel.from_pretrained('facebook/bart-large')
    self.dropout = nn.Dropout(0.1)
    self.cls = nn.Linear(in_features=1024, out_features=4)
def test_model_from_pretrained(self):
    # Forces 1.6GB download from S3 for each model
    for model_name in list(BART_PRETRAINED_MODEL_ARCHIVE_MAP.keys()):
        model = BartModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
        self.assertIsNotNone(model)
def prepare_model(config, bert_model_name_or_path=None):
    args = config['args']
    emb_non_trainable = not args.embedding_trainable
    labels = load_label(args.label_path)
    label_size = len(labels)
    config['labels'] = labels
    # prepare model
    if config['emb_class'] == 'glove':
        if config['enc_class'] == 'gnb':
            model = TextGloveGNB(config, args.embedding_path, label_size)
        if config['enc_class'] == 'cnn':
            model = TextGloveCNN(config, args.embedding_path, label_size, emb_non_trainable=emb_non_trainable)
        if config['enc_class'] == 'densenet-cnn':
            model = TextGloveDensenetCNN(config, args.embedding_path, label_size, emb_non_trainable=emb_non_trainable)
        if config['enc_class'] == 'densenet-dsa':
            model = TextGloveDensenetDSA(config, args.embedding_path, label_size, emb_non_trainable=emb_non_trainable)
    else:
        model_name_or_path = args.bert_model_name_or_path
        if bert_model_name_or_path:
            model_name_or_path = bert_model_name_or_path
        if config['emb_class'] == 'bart' and config['use_kobart']:
            from transformers import BartModel
            from kobart import get_kobart_tokenizer, get_pytorch_kobart_model
            bert_tokenizer = get_kobart_tokenizer()
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = BartModel.from_pretrained(get_pytorch_kobart_model())
        elif config['emb_class'] in ['gpt']:
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_tokenizer.bos_token = '<|startoftext|>'
            bert_tokenizer.eos_token = '<|endoftext|>'
            bert_tokenizer.cls_token = '<|startoftext|>'
            bert_tokenizer.sep_token = '<|endoftext|>'
            bert_tokenizer.pad_token = '<|pad|>'
            bert_model = AutoModel.from_pretrained(
                model_name_or_path, from_tf=bool(".ckpt" in model_name_or_path))
            # 3 new tokens added
            bert_model.resize_token_embeddings(len(bert_tokenizer))
        elif config['emb_class'] in ['t5']:
            from transformers import T5EncoderModel
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = T5EncoderModel.from_pretrained(
                model_name_or_path, from_tf=bool(".ckpt" in model_name_or_path))
        else:
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_model = AutoModel.from_pretrained(
                model_name_or_path, from_tf=bool(".ckpt" in model_name_or_path))
        bert_config = bert_model.config
        # bert model reduction
        reduce_bert_model(config, bert_model, bert_config)
        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        if config['enc_class'] == 'densenet-cnn':
            ModelClass = TextBertDensenetCNN
        model = ModelClass(config, bert_config, bert_model, bert_tokenizer, label_size,
                           feature_based=args.bert_use_feature_based,
                           finetune_last=args.bert_use_finetune_last)

    if args.restore_path:
        checkpoint = load_checkpoint(args.restore_path)
        model.load_state_dict(checkpoint)
    if args.enable_qat:
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        '''
        # fuse if applicable
        # model = torch.quantization.fuse_modules(model, [['']])
        '''
        model = torch.quantization.prepare_qat(model)
    if args.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        model.train()
        qconfig_dict = {"": torch.quantization.get_default_qat_qconfig('fbgemm')}
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)

    logger.info("[model] :\n{}".format(model.__str__()))
    logger.info("[model prepared]")
    return model
    'tenacity': 5,
    'epoch_size': 4
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='bart-large', help='model name or path')
    args = parser.parse_args()

    config = BartConfig.from_pretrained(args.model)
    model = BartModel.from_pretrained(args.model, config=config)
    tokenizer = BartTokenizer.from_pretrained(args.model)

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer
    params_senteval['config'] = config

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16',
        'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
        'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
        'Length', 'WordContent', 'Depth', 'TopConstituents', 'BigramShift',
        'Tense', 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion',
        'ImageCaptionRetrieval', 'SNLI'
    ]
    results = se.eval(transfer_tasks)
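# Hedged sketch of the SentEval callbacks referenced above (prepare, batcher), which this
# fragment does not show. This is one plausible implementation that masked-mean-pools the
# model's last hidden state; the original batcher may pool differently.
import torch


def prepare(params, samples):
    # nothing to precompute for a frozen pretrained encoder
    return


def batcher(params, batch):
    sentences = [' '.join(tokens) if tokens else '.' for tokens in batch]
    enc = params['tokenizer'](sentences, padding=True, truncation=True, return_tensors='pt')
    enc = {k: v.to(next(params['model'].parameters()).device) for k, v in enc.items()}
    with torch.no_grad():
        hidden = params['model'](**enc)[0]              # (batch, seq_len, hidden)
    mask = enc['attention_mask'].unsqueeze(-1).float()
    pooled = (hidden * mask).sum(1) / mask.sum(1)       # masked mean pooling
    return pooled.cpu().numpy()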
def Seq2Seq(df):
    model_type = 'bart-large'
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
    model = BartModel.from_pretrained('facebook/bart-large')
    mask_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
    sep_token = '</s>'
    mask_token = '<mask>'
    mask_id = tokenizer(mask_token, return_tensors='pt')['input_ids'][0][1]
    sep_id = tokenizer(sep_token, return_tensors='pt')['input_ids'][0][1]
    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)

    auxiliary_tokens = ['the', 'aspect', 'term', 'is']
    df['mask_tokens'] = 0
    df['auxiliary_tokens'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        # for j in range(len(df['aspect_terms'].iloc[i])):
        auxiliary_sents = []
        for j in range(len(df['aspect_terms'].iloc[i])):
            aspect_terms = df['aspect_terms'].iloc[i][j]
            auxiliary_sent = auxiliary_tokens + [aspect_terms] + [sep_token] + df['tokens'].iloc[i]
            auxiliary_sents.append(auxiliary_sent)
        mask_sent = auxiliary_tokens + [mask_token] + [sep_token] + df['tokens'].iloc[i]
        df['mask_tokens'].iloc[i] = mask_sent
        df['auxiliary_tokens'].iloc[i] = auxiliary_sents

    df['distance'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        tokenized = tokenizer.encode(df['mask_tokens'].iloc[i])
        sep_index = tokenized.index(sep_id)
        mask_index = tokenized.index(mask_id)
        tokenized = pd.Series([tokenized])
        padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long",
                               value=0, truncating="post", padding="post")
        attention_mask = np.where(padded != 0, 1, 0)
        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)
        with torch.no_grad():
            last_hidden_states = model(input_ids, attention_mask=attention_mask)
        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu().numpy()

        distance = []
        for pertubed_index in range(sep_index + 1, MAX_LEN):
            padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long",
                                   value=0, truncating="post", padding="post")
            if padded[0][pertubed_index] != 0 and padded[0][pertubed_index] != sep_id:
                # print(padded.shape)
                cur_id = padded[0][pertubed_index]
                padded[0][pertubed_index] = mask_id
                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))
        df['distance'].iloc[i] = distance

    df['perturbed_mask_index'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(np.array(df['distance'].iloc[i])[:, 1], std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if df['distance'].iloc[i][dis_index][1] < mask_threshold and \
                    df['labels'].iloc[i][dis_index] != 'B' and df['labels'].iloc[i][dis_index] != 'I':
                perturbed_mask_index.append(dis_index)
        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index

    df['augment_token_id'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        augment_tokenizeds = []
        for j in range(len(df['aspect_terms'].iloc[i])):
            tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j])
            tokenized = torch.Tensor(tokenized).unsqueeze(0).to(torch.int64).to(device)
            augment_tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j])
            for k in range(len(df['perturbed_mask_index'].iloc[i])):
                mask_tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j])
                sep_index = mask_tokenized.index(sep_id)
                perturbed_mask_index = df['perturbed_mask_index'].iloc[i][k] + sep_index + 1
                mask_tokenized[perturbed_mask_index] = mask_id
                mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(torch.int64).to(device)
                logits = mask_model(mask_tokenized).logits
                probs = logits[0, perturbed_mask_index].softmax(dim=0)
                values, predictions = probs.topk(1)
                augment_tokenized[perturbed_mask_index] = int(predictions.cpu().numpy())
            augment_tokenizeds.append(augment_tokenized)
        df['augment_token_id'].iloc[i] = augment_tokenizeds

    df['augment_tokens'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        tokens_lists = []
        for j in range(len(df['aspect_terms'].iloc[i])):
            tokens_list = []
            for k in range(1, len(df['augment_token_id'].iloc[i][j]) - 1):
                tokens_list.append(tokenizer.decode([df['augment_token_id'].iloc[i][j][k]]))
            sep_index = tokens_list.index(sep_token)
            tokens_list = tokens_list[sep_index + 1:]
            tokens_lists.append(tokens_list)
        df['augment_tokens'].iloc[i] = tokens_lists
    return df