Example #1
def main2():
    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    bart = MyBartWithAdapter(config)

    bart_old = BartModel.from_pretrained("facebook/bart-base")
    bart.model.load_state_dict(bart_old.state_dict(), strict=False)

    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    # config.adapt_layer_norm = True
    generator = ParameterGenerator(config)

    output = generator(torch.tensor([[1,2,3]]))
    print(output)
    print(output[0].size())

    growingbart = GrowingBart(bart, generator, config)

    output = growingbart(torch.tensor([[4,1,3,4,3,5,6,3,2]]), torch.tensor([[1,1,1,1,1,1,1,1,1]]),
        torch.tensor([[4,1,3,4,3,5,6,3,2]]), torch.tensor([[1,1,1,1,1,1,1,1,1]]),
        torch.tensor([[4,1,3,4,3,5,6,3,2]]), torch.tensor([[1,1,1,1,1,1,1,1,1]]))

    print(output)
    
    loss = output[0].sum(-1).sum(-1).sum(-1)
    print(loss)
    loss.backward()
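A quick follow-up check that gradients actually reach the hypernetwork (a sketch; it assumes ParameterGenerator is an ordinary nn.Module):

    # count how many generator parameter tensors received gradients after backward()
    n_grad = sum(p.grad is not None for p in generator.parameters())
    print(n_grad, "generator parameter tensors received gradients")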
Example #2
def main():
    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    bart = MyBartWithAdapter(config)

    bart_old = BartModel.from_pretrained("facebook/bart-base")
    ret = bart.model.load_state_dict(bart_old.state_dict(), strict=False)

    print(ret)
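With strict=False, load_state_dict returns a named tuple instead of raising on mismatches; the adapter parameters that MyBartWithAdapter adds on top of the plain BartModel layout show up under missing_keys. A short sketch of inspecting its fields:

    print(ret.missing_keys)     # parameters the adapter model expects but the checkpoint lacks
    print(ret.unexpected_keys)  # checkpoint entries the adapter model does not use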
Example #3
    def __init__(self, model_name):
        super().__init__()

        bart = BartModel.from_pretrained(model_name)
        self.hidden_dim = bart.config.hidden_size
        self.bart_encoder = bart.encoder
        self.bart_encoder.embed_tokens = lambda x: x
        self.bart_encoder.embed_positions = lambda x: torch.zeros(
            (x.shape[0], x.shape[1], self.hidden_dim), dtype=torch.float32)
Example #4
 def __init__(self):
     super(MoralClassifier, self).__init__()
     # self.l1 = DistilBertModel.from_pretrained('distilbert-base-uncased')
     self.l1 = BartModel.from_pretrained('facebook/bart-large-cnn')
     # Pooler
     self.l2 = torch.nn.Linear(1024, 1024)
     self.act = torch.nn.Tanh()
     # Classifier
     self.l3 = torch.nn.Dropout(0.3)
     self.l4 = torch.nn.Linear(1024, 11)  # 11 categories
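A forward pass matching these layers might look like the following (a sketch only; the original MoralClassifier.forward is not shown here, and pooling on the first decoder token is an assumption):

 def forward(self, input_ids, attention_mask):
     hidden = self.l1(input_ids, attention_mask=attention_mask)[0]  # (batch, seq, 1024)
     pooled = self.act(self.l2(hidden[:, 0]))                       # pool the first token
     return self.l4(self.l3(pooled))                                # logits over the 11 categories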
Example #5
 def test_inference_no_head(self):
     model = BartModel.from_pretrained("facebook/bart-large").to(torch_device)
     input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
     inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
     with torch.no_grad():
         output = model(**inputs_dict)[0]
     expected_shape = torch.Size((1, 11, 1024))
     self.assertEqual(output.shape, expected_shape)
     expected_slice = torch.tensor(
         [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
     )
     self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
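The helpers come from the transformers BART test module; roughly, they do the following (a paraphrase, not the exact upstream source):

def _long_tensor(tok_lst):
    return torch.tensor(tok_lst, dtype=torch.long, device=torch_device)

def prepare_bart_inputs_dict(config, input_ids, attention_mask=None):
    if attention_mask is None:
        attention_mask = input_ids.ne(config.pad_token_id)
    return {"input_ids": input_ids, "attention_mask": attention_mask}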
Example #6
 def __init__(self):
     super(Model, self).__init__()
     self.model = BartModel.from_pretrained(
         model_config.pretrain_model_path)
     self.config = self.model.config
     self.classification_head = BartClassificationHead(
         self.config.d_model,
         self.config.d_model,
         config.num_labels,
         self.config.classif_dropout,
     )
     self.model._init_weights(self.classification_head.dense)
     self.model._init_weights(self.classification_head.out_proj)
Example #7
 def __init__(self, args):
     super(MoralClassifier, self).__init__()
     self.hparams = args
     self.l1 = BartModel.from_pretrained('facebook/bart-large-cnn')
     # freeze bart weights
     # for param in self.l1.parameters():
     #     param.requires_grad = False
     # Pooler
     self.l2 = torch.nn.Linear(1024, 1024)
     self.act = torch.nn.Tanh()
     # Classifier
     self.l3 = torch.nn.Dropout(0.2)
     self.l4 = torch.nn.Linear(1024, 10)  # 10 categories
Example #8
 def __init__(self, model_name, use_pretrained_embeddings=False):
     super().__init__()
     # or some flag that indicates the bart encoder in its entirety could be used.
     if use_pretrained_embeddings:
         # will use the entire bart encoder including all embeddings
         bart = PretrainedTransformerEmbedder(model_name,
                                              sub_module="encoder")
         self.hidden_dim = bart.get_output_dim()
         self.bart_encoder = bart.transformer_model
     else:
         bart = BartModel.from_pretrained(model_name)
         self.hidden_dim = bart.config.hidden_size
         self.bart_encoder = bart.encoder
         # bypass BART's own embeddings so pre-computed vectors can be fed in directly
         self.bart_encoder.embed_tokens = lambda x: x
         self.bart_encoder.embed_positions = lambda x: torch.zeros(
             (x.shape[0], x.shape[1], self.hidden_dim), dtype=torch.float32)
Example #9
    def __init__(self, large, temp_dir, finetune=False, bart=False):
    # def __init__(self, large, temp_dir, finetune=False):
        super(Bert, self).__init__()
        if(large):
            self.model = BertModel.from_pretrained('bert-large-uncased', cache_dir=temp_dir)
        else:
            # self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
            if bart:
                self.model = BartModel.from_pretrained('/home/ybai/downloads/bart', cache_dir=temp_dir, local_files_only=True)
                # self.model = BartForConditionalGeneration.from_pretrained('/home/ybai/downloads/bart', cache_dir=temp_dir, local_files_only=True)
            else:
                self.model = BertModel.from_pretrained('bert-base-multilingual-uncased', cache_dir=temp_dir,
                                                       local_files_only=False)

        self.finetune = finetune
Example #10
    def __init__(self, args, use_mask=True):
        super(OneHotMoralClassifier, self).__init__()
        self.hparams = args
        self.bart = BartModel.from_pretrained('facebook/bart-large-cnn')
        self.use_mask = use_mask

        self.vocab_size = 50264
        self.onehot_embeddings = nn.Linear(self.vocab_size, 1024, bias=False)
        self.onehot_embeddings.weight = nn.Parameter(self.build_lookups())

        # self.bart.encoder.embed_tokens = nn.Identity()
        # freeze bert weights
        # self.onehot_embeddings.requires_grad = False
        # self.onehot_embeddings.weight.requires_grad = False
        # for param in self.bart.parameters():
        #     param.requires_grad = False

        # Pooler
        self.l2 = torch.nn.Linear(1024, 1024)
        self.act = torch.nn.Tanh()
        # Classifier
        self.l3 = torch.nn.Dropout(0.2)
        self.l4 = torch.nn.Linear(1024, 10)  # 10 categories
Example #11
 def __init__(self, n_vocab=50264):
     self.n_vocab = n_vocab
     self.true_embedding = BartModel.from_pretrained(
         'facebook/bart-large-cnn').encoder.embed_tokens
Example #12
def Seq2Seq(df):
    model_type = 'facebook/bart-large'

    tokenizer = BartTokenizer.from_pretrained(model_type)
    model = BartModel.from_pretrained(model_type)
    mask_model = BartForConditionalGeneration.from_pretrained(model_type)

    sep_token = '</s>'
    mask_token = '<mask>'

    mask_id = tokenizer(mask_token, return_tensors='pt')['input_ids'][0][1]
    sep_id = tokenizer(sep_token, return_tensors='pt')['input_ids'][0][1]

    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)

    df['mask_text'] = 0
    df['auxiliary_text'] = 0

    for i in range(len(df)):
        aspect = df['aspect'].iloc[i]
        sentiment = df['sentiment'].iloc[i]

        if aspect == 'NULL' or isinstance(aspect, (int, float)):
            aspect = 'aspect'

        if DPM_type == 'Senti':
            mask_sent = 'the polarity of the ' + aspect + ' is ' + mask_token + ' ' + sep_token + ' '
            auxiliary_sent = 'the polarity of the ' + aspect + ' is ' + sentiment + ' ' + sep_token + ' '
        elif DPM_type == 'AS':
            mask_sent = 'the polarity of the ' + mask_token + ' is ' + sentiment + ' ' + sep_token + ' '
            auxiliary_sent = 'the polarity of the ' + aspect + ' is ' + sentiment + ' ' + sep_token + ' '

        df['mask_text'].iloc[i] = mask_sent + df['text'].iloc[i]
        df['auxiliary_text'].iloc[i] = auxiliary_sent + df['text'].iloc[i]

    df['distance'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        tokenized = df['mask_text'][i:i + 1].apply((lambda x: tokenizer.encode(
            x, add_special_tokens=True, max_length=MAX_LEN, truncation=True)))

        sep_index = tokenized[i].index(sep_id)
        mask_index = tokenized[i].index(mask_id)

        padded = pad_sequences(tokenized,
                               maxlen=MAX_LEN,
                               dtype="long",
                               value=0,
                               truncating="post",
                               padding="post")

        attention_mask = np.where(padded != 0, 1, 0)

        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)

        with torch.no_grad():
            last_hidden_states = model(input_ids,
                                       attention_mask=attention_mask)

        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu(
        ).numpy()

        distance = []

        for pertubed_index in range(sep_index + 1, MAX_LEN):
            padded = pad_sequences(tokenized,
                                   maxlen=MAX_LEN,
                                   dtype="long",
                                   value=0,
                                   truncating="post",
                                   padding="post")
            if padded[0][pertubed_index] != 0 and padded[0][
                    pertubed_index] != sep_id:
                #print(padded.shape)
                cur_id = padded[0][pertubed_index]
                padded[0][pertubed_index] = mask_id

                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))

        df['distance'].iloc[i] = distance

    df['perturbed_mask_index'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(
            np.array(df['distance'].iloc[i])[:, 1], std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if df['distance'].iloc[i][dis_index][1] < mask_threshold:
                perturbed_mask_index.append(dis_index)

        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index

    df['augment_token_id'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        tokenized = torch.Tensor(tokenized).unsqueeze(0).to(
            torch.int64).to(device)
        augment_tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])

        mask_tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        sep_index = mask_tokenized.index(sep_id)

        for j in range(len(df['perturbed_mask_index'].iloc[i])):
            perturbed_mask_index = df['perturbed_mask_index'].iloc[i][
                j] + sep_index + 1
            mask_tokenized[perturbed_mask_index] = mask_id

        mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(
            torch.int64).to(device)
        logits = mask_model(mask_tokenized).logits

        for j in range(len(df['perturbed_mask_index'].iloc[i])):
            perturbed_mask_index = df['perturbed_mask_index'].iloc[i][
                j] + sep_index + 1
            probs = logits[0, perturbed_mask_index].softmax(dim=0)
            values, predictions = probs.topk(1)
            augment_tokenized[perturbed_mask_index] = int(
                predictions.cpu().numpy())

        df['augment_token_id'].iloc[i] = augment_tokenized

    df['augment_text'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        sep_index = df['augment_token_id'].iloc[i].index(sep_id)
        df['augment_text'].iloc[i] = tokenizer.decode(
            df['augment_token_id'].iloc[i][sep_index + 1:-1])

    return df
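The function above relies on mask_embedding, dist and calculate_threshold (and on the module-level MAX_LEN, DPM_type and std_strength constants), none of which are shown. A plausible sketch of the helpers under the same conventions, using numpy and torch as in the snippet; these are assumptions, not the original definitions:

def mask_embedding(model, padded, mask_index):
    # re-encode the perturbed sequence and return the hidden state at the masked position
    device = next(model.parameters()).device
    input_ids = torch.tensor(padded).to(device)
    attention_mask = torch.tensor(np.where(padded != 0, 1, 0)).to(device)
    with torch.no_grad():
        hidden = model(input_ids, attention_mask=attention_mask)[0]
    return hidden[:, mask_index, :].cpu().numpy()

def dist(a, b):
    # euclidean distance between two mask embeddings
    return np.linalg.norm(a - b)

def calculate_threshold(distances, std_strength):
    # assumed rule: tokens whose masking moves the <mask> embedding less than this are unimportant
    return distances.mean() - std_strength * distances.std()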
Example #13
def get_kobart_model():
    return BartModel.from_pretrained("hyunwoongko/kobart")
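A minimal smoke test of the returned model (the token ids below are hypothetical placeholders, not meaningful KoBART tokens; torch and transformers are assumed imported as in the other snippets):

model = get_kobart_model()
dummy_ids = torch.tensor([[0, 100, 200, 1]])  # hypothetical ids within the KoBART vocab
out = model(input_ids=dummy_ids)
print(out[0].shape)  # (1, 4, d_model): last decoder hidden state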
Example #14
 def _load_model(self, model_name: str):
     if model_name in ("facebook/mbart-large-cc25",):  # tuple, not a substring check
         return BartModel.from_pretrained(model_name,
                                          config=self.config).eval()
     return AutoModel.from_pretrained(model_name, config=self.config).eval()
Example #15
print("==== preparing data ====")
make_path(args.cache_dir)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir)

with open('synt_vocab.pkl', 'rb') as f:
    synt_vocab = pickle.load(f)

dataset = prepare_dataset(para_data, tokenizer, num)

print("==== loading model ====")
config = BartConfig.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir)
config.word_dropout = args.word_dropout
config.max_sent_len = args.max_sent_len
config.max_synt_len = args.max_synt_len

bart = BartModel.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir)
model = ParaBart(config)
model.load_state_dict(bart.state_dict(), strict=False)
model.zero_grad()
del bart


no_decay_params = []
no_decay_fast_params = []
fast_params = []
all_other_params = []
adv_no_decay_params = []
adv_all_other_params = []

for n, p in model.named_parameters():
    if 'adv' in n:
Example #16
def load_model(config, checkpoint):
    args = config['args']
    labels = load_label(args.label_path)
    label_size = len(labels)
    config['labels'] = labels
    if config['emb_class'] == 'glove':
        if config['enc_class'] == 'gnb':
            model = TextGloveGNB(config, args.embedding_path, label_size)
        if config['enc_class'] == 'cnn':
            model = TextGloveCNN(config,
                                 args.embedding_path,
                                 label_size,
                                 emb_non_trainable=True)
        if config['enc_class'] == 'densenet-cnn':
            model = TextGloveDensenetCNN(config,
                                         args.embedding_path,
                                         label_size,
                                         emb_non_trainable=True)
        if config['enc_class'] == 'densenet-dsa':
            model = TextGloveDensenetDSA(config,
                                         args.embedding_path,
                                         label_size,
                                         emb_non_trainable=True)
    else:
        if config['emb_class'] == 'bart' and config['use_kobart']:
            from transformers import BartModel
            from kobart import get_kobart_tokenizer, get_pytorch_kobart_model
            bert_tokenizer = get_kobart_tokenizer()
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = BartModel.from_pretrained(get_pytorch_kobart_model())
            bert_config = bert_model.config
        elif config['emb_class'] in ['gpt']:
            bert_tokenizer = AutoTokenizer.from_pretrained(
                args.bert_output_dir)
            bert_tokenizer.bos_token = '<|startoftext|>'
            bert_tokenizer.eos_token = '<|endoftext|>'
            bert_tokenizer.cls_token = '<|startoftext|>'
            bert_tokenizer.sep_token = '<|endoftext|>'
            bert_tokenizer.pad_token = '<|pad|>'
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = AutoModel.from_pretrained(args.bert_output_dir)
        elif config['emb_class'] in ['t5']:
            from transformers import T5EncoderModel
            bert_tokenizer = AutoTokenizer.from_pretrained(
                args.bert_output_dir)
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = T5EncoderModel(bert_config)
        else:
            bert_tokenizer = AutoTokenizer.from_pretrained(
                args.bert_output_dir)
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = AutoModel.from_config(bert_config)

        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls': ModelClass = TextBertCLS
        if config['enc_class'] == 'densenet-cnn':
            ModelClass = TextBertDensenetCNN

        model = ModelClass(config, bert_config, bert_model, bert_tokenizer,
                           label_size)

    if args.enable_qat:
        assert args.device == 'cpu'
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        '''
        # fuse if applicable
        # model = torch.quantization.fuse_modules(model, [['']])
        '''
        model = torch.quantization.prepare_qat(model)
        model.eval()
        model.to('cpu')
        logger.info("[Convert to quantized model with device=cpu]")
        model = torch.quantization.convert(model)
    if args.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)
        logger.info("[Convert to quantized model]")
        model = quantize_fx.convert_fx(model)

    if args.enable_diffq:
        quantizer = DiffQuantizer(model)
        config['quantizer'] = quantizer
        quantizer.restore_quantized_state(checkpoint)
    else:
        model.load_state_dict(checkpoint)

    model = model.to(args.device)
    ''' 
    for name, param in model.named_parameters():
        print(name, param.data, param.device, param.requires_grad)
    '''
    logger.info("[model] :\n{}".format(model.__str__()))
    logger.info("[Model loaded]")
    return model
Example #17
 def test_model_from_pretrained(self):
     # Forces 1.6GB download from S3 for each model
     for model_name in BART_PRETRAINED_MODEL_ARCHIVE_LIST:
         model = BartModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
Example #18
def tokenizer_and_model(type_embedding):
    #########
    #PORTUGUESE
    #########
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split(
            '_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            path = 'bert-base-multilingual-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path,
                                          output_hidden_states=True,
                                          return_dict=True)
        special_tokens_dict = {
            'additional_special_tokens': ['[USER]', '[SYSTEM]']
        }
        orig_num_tokens = len(tokenizer)
        num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
        total_num_tokens = orig_num_tokens + num_added_tokens
        model.resize_token_embeddings(total_num_tokens)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path,
                                         output_hidden_states=True,
                                         return_dict=True)
    #########
    #ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path,
                                          output_hidden_states=True,
                                          return_dict=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path,
                                         output_hidden_states=True,
                                         return_dict=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        #load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        #load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        #load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path,
                                             output_hidden_states=True,
                                             return_dict=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        #load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path,
                                          output_hidden_states=True,
                                          return_dict=True)

    return tokenizer, model
Example #19
 def build_lookups(self):
     embeddings = BartModel.from_pretrained(
         'facebook/bart-large-cnn').encoder.embed_tokens
     ids = torch.LongTensor([i for i in range(self.vocab_size)])
     return torch.transpose(embeddings(ids), 0, 1).detach()
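Because the Linear layer's weight is the transposed embedding table, a one-hot vector passed through it reproduces the corresponding row of BART's embedding matrix. A small standalone check of that equivalence (a sketch, independent of the class above):

 emb = BartModel.from_pretrained('facebook/bart-large-cnn').encoder.embed_tokens
 lookup = torch.nn.Linear(50264, 1024, bias=False)
 lookup.weight = torch.nn.Parameter(
     torch.transpose(emb(torch.arange(50264)), 0, 1).detach())
 onehot = torch.zeros(1, 50264)
 onehot[0, 42] = 1.0
 assert torch.allclose(lookup(onehot), emb(torch.tensor([42])))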
Example #20
def get_embedding(type_embedding, data):
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split(
            '_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            data = [x.lower() for x in data]
            path = 'bert-base-multilingual-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    #########
    #ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        #load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        #load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        #load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        #load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True)

    # Set the device to GPU (cuda) if available, otherwise stick with CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    list_of_four_last_embeddings = []
    list_of_mean = []

    for l in data:
        # Convert the string "granola bars" to tokenized vocabulary IDs
        input_ids = tokenizer.encode(l)
        #print(input_ids)
        # Convert the list of IDs to a tensor of IDs
        input_ids = torch.LongTensor(input_ids)
        #print(input_ids)
        model = model.to(device)
        input_ids = input_ids.to(device)
        #print(input_ids)
        model.eval()

        # unsqueeze IDs to get batch size of 1 as added dimension
        input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            out = model(input_ids=input_ids)

        # we only want the hidden_states
        if type_embedding == 'xlm':
            hidden_states = out[1]
        else:
            hidden_states = out[2]
        #mean of layers
        sentence_embedding = torch.mean(hidden_states[-1], dim=1).squeeze()
        list_of_mean.append(sentence_embedding.tolist())

        # get last four layers
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        # cast layers to a tuple and concatenate over the last dimension
        cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)

        # take the mean of the concatenated vector over the token dimension
        cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()
        list_of_four_last_embeddings.append(cat_sentence_embedding.tolist())

    #print('list of four last embeddings', np.array(list_of_four_last_embeddings).shape)
    #print('list of mean', np.array(list_of_mean).shape)

    return list_of_mean, list_of_four_last_embeddings
Example #21
 def __init__(self):
     super(BART, self).__init__()
     self.bart = BartModel.from_pretrained('facebook/bart-large')
     self.dropout = nn.Dropout(0.1)
     self.cls = nn.Linear(in_features=1024, out_features=4)
Example #22
 def test_model_from_pretrained(self):
     # Forces 1.6GB download from S3 for each model
     for model_name in list(BART_PRETRAINED_MODEL_ARCHIVE_MAP.keys()):
         model = BartModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
         self.assertIsNotNone(model)
Example #23
def prepare_model(config, bert_model_name_or_path=None):
    args = config['args']
    emb_non_trainable = not args.embedding_trainable
    labels = load_label(args.label_path)
    label_size = len(labels)
    config['labels'] = labels
    # prepare model
    if config['emb_class'] == 'glove':
        if config['enc_class'] == 'gnb':
            model = TextGloveGNB(config, args.embedding_path, label_size)
        if config['enc_class'] == 'cnn':
            model = TextGloveCNN(config,
                                 args.embedding_path,
                                 label_size,
                                 emb_non_trainable=emb_non_trainable)
        if config['enc_class'] == 'densenet-cnn':
            model = TextGloveDensenetCNN(config,
                                         args.embedding_path,
                                         label_size,
                                         emb_non_trainable=emb_non_trainable)
        if config['enc_class'] == 'densenet-dsa':
            model = TextGloveDensenetDSA(config,
                                         args.embedding_path,
                                         label_size,
                                         emb_non_trainable=emb_non_trainable)
    else:
        model_name_or_path = args.bert_model_name_or_path
        if bert_model_name_or_path:
            model_name_or_path = bert_model_name_or_path

        if config['emb_class'] == 'bart' and config['use_kobart']:
            from transformers import BartModel
            from kobart import get_kobart_tokenizer, get_pytorch_kobart_model
            bert_tokenizer = get_kobart_tokenizer()
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = BartModel.from_pretrained(get_pytorch_kobart_model())
        elif config['emb_class'] in ['gpt']:
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_tokenizer.bos_token = '<|startoftext|>'
            bert_tokenizer.eos_token = '<|endoftext|>'
            bert_tokenizer.cls_token = '<|startoftext|>'
            bert_tokenizer.sep_token = '<|endoftext|>'
            bert_tokenizer.pad_token = '<|pad|>'
            bert_model = AutoModel.from_pretrained(
                model_name_or_path,
                from_tf=bool(".ckpt" in model_name_or_path))
            # 3 new tokens added
            bert_model.resize_token_embeddings(len(bert_tokenizer))
        elif config['emb_class'] in ['t5']:
            from transformers import T5EncoderModel
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = T5EncoderModel.from_pretrained(
                model_name_or_path,
                from_tf=bool(".ckpt" in model_name_or_path))

        else:
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_model = AutoModel.from_pretrained(
                model_name_or_path,
                from_tf=bool(".ckpt" in model_name_or_path))

        bert_config = bert_model.config
        # bert model reduction
        reduce_bert_model(config, bert_model, bert_config)
        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls': ModelClass = TextBertCLS
        if config['enc_class'] == 'densenet-cnn':
            ModelClass = TextBertDensenetCNN

        model = ModelClass(config,
                           bert_config,
                           bert_model,
                           bert_tokenizer,
                           label_size,
                           feature_based=args.bert_use_feature_based,
                           finetune_last=args.bert_use_finetune_last)
    if args.restore_path:
        checkpoint = load_checkpoint(args.restore_path)
        model.load_state_dict(checkpoint)
    if args.enable_qat:
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        '''
        # fuse if applicable
        # model = torch.quantization.fuse_modules(model, [['']])
        '''
        model = torch.quantization.prepare_qat(model)
    if args.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        model.train()
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)

    logger.info("[model] :\n{}".format(model.__str__()))
    logger.info("[model prepared]")
    return model
Example #24
File: bart.py  Project: HUSTLyn/SentEval
    'tenacity': 5,
    'epoch_size': 4
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        default='bart-large',
                        help='model name or path')
    args = parser.parse_args()

    config = BartConfig.from_pretrained(args.model)
    model = BartModel.from_pretrained(args.model, config=config)
    tokenizer = BartTokenizer.from_pretrained(args.model)

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer
    params_senteval['config'] = config

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI'
    ]
    results = se.eval(transfer_tasks)
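prepare and batcher are not shown in this snippet; under SentEval's interface they would look roughly like this (an illustrative sketch that mean-pools the BART output, not the project's actual implementation):

def prepare(params, samples):
    # nothing to precompute for a pretrained BART encoder
    return

def batcher(params, batch):
    # batch is a list of tokenized sentences; return one vector per sentence
    sentences = [' '.join(tokens) if tokens else '.' for tokens in batch]
    enc = params['tokenizer'](sentences, return_tensors='pt',
                              padding=True, truncation=True)
    device = next(params['model'].parameters()).device
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        out = params['model'](**enc)[0]   # (batch, seq, hidden)
    return out.mean(dim=1).cpu().numpy()  # mean-pool over tokens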
Example #25
def Seq2Seq(df):
    model_type = 'facebook/bart-large'

    tokenizer = BartTokenizer.from_pretrained(model_type)
    model = BartModel.from_pretrained(model_type)
    mask_model = BartForConditionalGeneration.from_pretrained(model_type)

    sep_token = '</s>'
    mask_token = '<mask>'

    mask_id = tokenizer(mask_token, return_tensors='pt')['input_ids'][0][1]
    sep_id = tokenizer(sep_token, return_tensors='pt')['input_ids'][0][1]

    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)

    auxiliary_tokens = ['the', 'aspect', 'term', 'is']

    df['mask_tokens'] = 0
    df['auxiliary_tokens'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        #for j in range(len(df['aspect_terms'].iloc[i])):
        auxiliary_sents = []
        for j in range(len(df['aspect_terms'].iloc[i])):
            aspect_terms = df['aspect_terms'].iloc[i][j]
            auxiliary_sent = auxiliary_tokens + [aspect_terms] + [
                sep_token
            ] + df['tokens'].iloc[i]
            auxiliary_sents.append(auxiliary_sent)

        mask_sent = auxiliary_tokens + [mask_token] + [sep_token
                                                       ] + df['tokens'].iloc[i]
        df['mask_tokens'].iloc[i] = mask_sent
        df['auxiliary_tokens'].iloc[i] = auxiliary_sents

    df['distance'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        tokenized = tokenizer.encode(df['mask_tokens'].iloc[i])

        sep_index = tokenized.index(sep_id)
        mask_index = tokenized.index(mask_id)

        tokenized = pd.Series([tokenized])

        padded = pad_sequences(tokenized,
                               maxlen=MAX_LEN,
                               dtype="long",
                               value=0,
                               truncating="post",
                               padding="post")

        attention_mask = np.where(padded != 0, 1, 0)

        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)

        with torch.no_grad():
            last_hidden_states = model(input_ids,
                                       attention_mask=attention_mask)

        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu(
        ).numpy()

        distance = []

        for pertubed_index in range(sep_index + 1, MAX_LEN):
            padded = pad_sequences(tokenized,
                                   maxlen=MAX_LEN,
                                   dtype="long",
                                   value=0,
                                   truncating="post",
                                   padding="post")
            if padded[0][pertubed_index] != 0 and padded[0][
                    pertubed_index] != sep_id:
                #print(padded.shape)
                cur_id = padded[0][pertubed_index]
                padded[0][pertubed_index] = mask_id

                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))

        df['distance'].iloc[i] = distance

    df['perturbed_mask_index'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(
            np.array(df['distance'].iloc[i])[:, 1], std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if df['distance'].iloc[i][dis_index][1] < mask_threshold and df[
                    'labels'].iloc[i][dis_index] != 'B' and df['labels'].iloc[
                        i][dis_index] != 'I':
                perturbed_mask_index.append(dis_index)

        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index

    df['augment_token_id'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        augment_tokenizeds = []

        for j in range(len(df['aspect_terms'].iloc[i])):

            tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j])
            tokenized = torch.Tensor(tokenized).unsqueeze(0).to(
                torch.int64).to(device)
            augment_tokenized = tokenizer.encode(
                df['auxiliary_tokens'].iloc[i][j])

            for k in range(len(df['perturbed_mask_index'].iloc[i])):
                mask_tokenized = tokenizer.encode(
                    df['auxiliary_tokens'].iloc[i][j])
                sep_index = mask_tokenized.index(sep_id)
                perturbed_mask_index = df['perturbed_mask_index'].iloc[i][
                    k] + sep_index + 1
                mask_tokenized[perturbed_mask_index] = mask_id

                mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(
                    torch.int64).to(device)

                logits = mask_model(mask_tokenized).logits

                probs = logits[0, perturbed_mask_index].softmax(dim=0)
                values, predictions = probs.topk(1)
                augment_tokenized[perturbed_mask_index] = int(
                    predictions.cpu().numpy())

            augment_tokenizeds.append(augment_tokenized)

        df['augment_token_id'].iloc[i] = augment_tokenizeds

    df['augment_tokens'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        tokens_lists = []

        for j in range(len(df['aspect_terms'].iloc[i])):

            tokens_list = []

            for k in range(1, len(df['augment_token_id'].iloc[i][j]) - 1):
                tokens_list.append(
                    tokenizer.decode([df['augment_token_id'].iloc[i][j][k]]))

            sep_index = tokens_list.index(sep_token)
            tokens_list = tokens_list[sep_index + 1:]
            tokens_lists.append(tokens_list)

        df['augment_tokens'].iloc[i] = tokens_lists

    return df