Example #1
 def from_pretrained(cls, model_name: str):
     return cls(
         RobertaForMaskedLM.from_pretrained(
             model_name,
             output_attentions=True,
             output_hidden_states=True,
             output_additional_info=True,
         ),
         RobertaAligner.from_pretrained(model_name),
     )
Example #2
 def create_and_check_roberta_for_masked_lm(self, config, input_ids,
                                            token_type_ids, input_mask,
                                            sequence_labels,
                                            token_labels,
                                            choice_labels):
     model = RobertaForMaskedLM(config=config)
     model.eval()
     loss, prediction_scores = model(input_ids,
                                     attention_mask=input_mask,
                                     token_type_ids=token_type_ids,
                                     masked_lm_labels=token_labels)
     result = {
         "loss": loss,
         "prediction_scores": prediction_scores,
     }
     self.parent.assertListEqual(
         list(result["prediction_scores"].size()),
         [self.batch_size, self.seq_length, self.vocab_size])
     self.check_loss_output(result)
Example #3
def main():

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForMaskedLM.from_pretrained('roberta-base')

    #train_data_file = "D:\\Work\\sandbox\\data\\train_citation.jsonl"
    train_data_file = "D:\\Work\\sandbox\\data\\train_scierc.jsonl"
    #train_data_file = "D:\\Work\\sandbox\\data\\train_chemprot.jsonl"
    train_data = datasets.load_dataset("json",
                                       data_files=train_data_file)["train"]

    def tokenization(batched_text):
        tokenized_batch = tokenizer(batched_text['text'],
                                    padding=True,
                                    truncation=True,
                                    return_special_tokens_mask=True)
        return tokenized_batch

    train_data = train_data.map(tokenization,
                                batched=True,
                                batch_size=len(train_data),
                                remove_columns=["text", "label"])
    train_data.set_format('torch', columns=['input_ids'])

    # Note: DataCollatorForLanguageModeling re-masks each batch at collation time, i.e. dynamic masking.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir="./tapt_scierc",
        overwrite_output_dir=True,
        num_train_epochs=100,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        learning_rate=1e-4,
        #lr_scheduler_type="constant",
        adam_epsilon=1e-6,
        adam_beta1=0.9,
        adam_beta2=0.98,
        weight_decay=0.01,
        warmup_ratio=0.06,
        fp16=True,
        eval_accumulation_steps=20,
        save_steps=5000,
        save_total_limit=2,
        seed=2)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_data)
    trainer.train()

    trainer.save_model("./tapt_scierc")
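A brief follow-up sketch (not part of the original script): one way to reload the checkpoint saved above for masked-token prediction. The tokenizer is fetched from 'roberta-base' because the Trainer above was not given a tokenizer to save; the probe sentence is made up.

from transformers import RobertaForMaskedLM, RobertaTokenizer, pipeline

tapt_model = RobertaForMaskedLM.from_pretrained("./tapt_scierc")
tapt_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

fill_mask = pipeline("fill-mask", model=tapt_model, tokenizer=tapt_tokenizer)
print(fill_mask("The protein was expressed in <mask> cells."))  # made-up probe sentence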
Example #4
def train_MLM(vocf, outmodel, data_df):
    bs = 8
    # tokenizer = BertWordPieceTokenizer(vocf)  # input vocab.txt
    ttk = BertTokenizer.from_pretrained(vocf)  # input vocab.txt
    with open(vocf) as fvoc:
        vlen = len(fvoc.readlines())
    config = RobertaConfig(
        vocab_size=vlen,
        max_position_embeddings=12,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
        hidden_size=768,
    )
    model = RobertaForMaskedLM(config=config)
    model.num_parameters()

    dataset = tokDataset(data_df, ttk)
    # Data = DataLoader(dataset, batch_size=bs, shuffle=True, drop_last=False,
    #                   num_workers=0, collate_fn=collate_fn)
    # data_collator = DataCollatorForLanguageModeling(
    #     tokenizer=ttk, mlm=True, mlm_probability=0.15
    # )

    data_collator = collate_fn(
        tokenizer=ttk, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir=outmodel,  # embedding model path
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=bs,
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        prediction_loss_only=True,
    )
    trainer.train()
    trainer.save_model(outmodel)
    print('LM train done: ')
Example #5
    def test_inference_masked_lm(self):
        model = RobertaForMaskedLM.from_pretrained("roberta-base")

        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 11, 50265))
        self.assertEqual(output.shape, expected_shape)
        # compare the actual values for a slice.
        expected_slice = torch.Tensor(
            [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]]
        )
        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
Example #6
    def __init__(self, args):
        # self.dict_file = "{}/{}".format(args.roberta_model_dir, args.roberta_vocab_name)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        if args.model_path is not None:
            print("Testing CoLAKE...")
            print('loading model parameters from {}...'.format(
                args.model_path))
            config = RobertaConfig.from_pretrained('roberta-base',
                                                   type_vocab_size=3)
            self.model = RobertaForMaskedLM(config=config)
            states_dict = torch.load(os.path.join(args.model_path,
                                                  'model.bin'))
            self.model.load_state_dict(states_dict, strict=False)
        else:
            print("Testing RoBERTa baseline...")
            self.model = RobertaForMaskedLM.from_pretrained('roberta-base')

        self._build_vocab()
        self._init_inverse_vocab()
        self._model_device = 'cpu'
        self.max_sentence_length = args.max_sentence_length
Example #7
 def __init__(self, cfg, device):
     super().__init__()
     tokenizer = RobertaTokenizerFast.from_pretrained('./bird_bpe_vocab', max_len=256)
     _config = RobertaConfig(
         vocab_size=tokenizer._tokenizer.get_vocab_size(),
         hidden_size=512,
         num_hidden_layers=4,
         num_attention_heads=8,
         max_position_embeddings=256,
         pad_token_id=1,
         eos_token_id=0,
         bos_token_id=2,
         output_attentions=False,
         output_hidden_states=False
     )
     _model = RobertaForMaskedLM(_config)
     _model.load_state_dict(torch.load('bert_small/checkpoint-1100/pytorch_model.bin'))
     _model.eval()
     self.tokenizer = tokenizer
     self._model = _model
     self.device = device
     self.pad_token = 0
     self.batch_size = cfg.batch_size
     self.proj = None
     if cfg.proj_lang:
         self.proj = nn.Sequential(*[EqualisedLinearLayer(512, cfg.latent_dim, weight_scaling=cfg.weight_scaling), nn.Tanh()])
Example #8
    def __init__(self, train_file_path: str, dev_file_path: str, test_file_path: str,
                 lm_file_path: str, train_batch_size: int, test_batch_size: int, lr: float,
                 lm_weights_file_path: str, epochs: int, lm_pretrain: str, task: int,
                 train_scratch: str, model_path: str, joke_classification_path: str,
                 add_joke_model: str, word2vec: str):
        '''

        :param train_file_path: Path to the train file
        :param test_file_path: Path to the test file
        :param train_batch_size: Size of the batch during training
        :param test_batch_size: Size of the batch during testing
        :param lr: learning rate
        '''

        super(RBERT, self).__init__()
        self.bert_model = RobertaForMaskedLM.from_pretrained('roberta-base', output_hidden_states=True)
        if lm_pretrain != 'true':
            pass
            # self.load_joke_lm_weights(lm_weights_file_path)
        self.train_batch_size = train_batch_size
        self.test_batch_size = test_batch_size
        self.train_file_path = train_file_path
        self.lm_file_path = lm_file_path
        # self.lstm = nn.LSTM(768*2,768*2,bidirectional=False)
        self.attention = nn_nlp.Attention(768 * 2)
        self.word2vec = word2vec
        self.dev_file_path = dev_file_path
        self.test_file_path = test_file_path
        self.joke_classification_path = joke_classification_path
        self.lr = lr
        self.task = task

        if word2vec == 'true':
            self.gensim_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz',
                                                                                binary=True)
        else:
            self.gensim_model = None
        self.prelu = nn.PReLU()
        self.add_joke_model = add_joke_model
        self.epochs = epochs
        self.linear_joke = nn.Sequential(nn.Dropout(0.3), nn.Linear(768, 2))
        self.linear_reg1 = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(768 * 8, 1024))

        if self.task:
            self.final_linear = nn.Sequential(nn.Dropout(0.3), nn.Linear(1024, 1))
        else:
            self.final_linear = nn.Sequential(nn.Dropout(0.3), nn.Linear(100, 2))

        if train_scratch == 'true':
            self.load_state_dict(torch.load(model_path))
Example #9
def make_model_and_tok(gpuid):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = RobertaForMaskedLM.from_pretrained('roberta-base')
    _ = model.eval()
    _ = model.to(gpuid)

    for param in model.parameters():
        param.requires_grad = False

    pred_model = model.roberta
    enco_model = pred_model.embeddings.word_embeddings

    return (model, enco_model, pred_model, tokenizer)
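A minimal usage sketch (assumed, not from the original source); make_model_and_tok passes gpuid straight to model.to(), so any torch device works.

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model, enco_model, pred_model, tokenizer = make_model_and_tok(device)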
Example #10
 def create_and_check_for_masked_lm(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     model = RobertaForMaskedLM(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
     self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
Example #11
def create_long_model(save_model_to, attention_window, max_pos):
    model = RobertaForMaskedLM.from_pretrained('roberta-base')
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base',
                                                     model_max_length=max_pos)
    config = model.config
    #pdb.set_trace()

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos
    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)
    model.roberta.embeddings.register_buffer(
        "position_ids",
        torch.arange(config.max_position_embeddings).expand((1, -1)),
    )

    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(
            k +
            step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
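A hedged call sketch; the concrete values below are illustrative assumptions (the original conversion script presumably reads them from command-line flags).

model, tokenizer = create_long_model(
    save_model_to="roberta-base-4096",  # hypothetical output directory
    attention_window=512,
    max_pos=4096,
)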
Example #12
def init_model(model_name: str,
               device: torch.device, cache_dir):
    """
    Initialize a pre-trained LM
    :param model_name: from MODEL_CLASSES
    :param device: CUDA / CPU device
    :param cache_dir: directory used to cache the downloaded pre-trained weights
    :return: the model and tokenizer
    """
    logger.info(f'Initializing {model_name}')
    tokenizer = RobertaTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    model = RobertaForMaskedLM.from_pretrained(model_name, cache_dir=cache_dir)
    model.to(device)
    model.eval()
    return model, tokenizer
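A minimal usage sketch, assuming 'roberta-base' is one of the names covered by MODEL_CLASSES mentioned in the docstring; cache_dir=None falls back to the default Hugging Face cache.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, tokenizer = init_model("roberta-base", device, cache_dir=None)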
Example #13
def main(args):
    data = np.load(args.data, allow_pickle=True)
    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
    )

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    dataset = PhoneDatasetMLM(data, tokenizer)

    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
Example #14
    def __init__(self, config):
        super().__init__()
        self.train_config = config

        self.roberta = RobertaForMaskedLM.from_pretrained('roberta-base')
        _ = self.roberta.eval()
        for param in self.roberta.parameters():
            param.requires_grad = False

        self.pred_model = self.roberta.roberta
        self.enc_model = self.pred_model.embeddings.word_embeddings
        self.proj_head = DVProjectionHead_EmbActi()

        self.lossfunc = nn.BCEWithLogitsLoss()

        self.acc = Accuracy(threshold=0.0)
        self.f1 = F1(threshold=0.0)
Example #15
    def __init__(self,
                 model_path='roberta-base',
                 temperature=1.0,
                 top_k=None,
                 top_p=None,
                 device='cuda'):
        super().__init__(device,
                         temperature=temperature,
                         top_k=top_k,
                         top_p=top_p)
        self.model_path = model_path

        self.tokenizer = RobertaTokenizer.from_pretrained(model_path)
        self.model = RobertaForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()
Example #16
    def test_inference_masked_lm(self):
        model = RobertaForMaskedLM.from_pretrained("roberta-base")

        input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 11, 50265))
        self.assertEqual(output.shape, expected_shape)
        # compare the actual values for a slice.
        expected_slice = torch.tensor(
            [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]
        )

        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
        # roberta.eval()
        # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()

        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
Example #17
 def __init__(
     self,
     tokenizer: HfRoBERTaTextEncoder,
     hparams: HyperOptArgumentParser,
     lm_head: bool = False,
 ) -> None:
     super().__init__(768 if "base" in hparams.pretrained_model else 1024, tokenizer)
     self._n_layers = 13 if "base" in hparams.pretrained_model else 25
     self.padding_idx = self.tokenizer.padding_index
     if not lm_head:
         self.model = RobertaModel.from_pretrained(
             hparams.pretrained_model, output_hidden_states=True
         )
     else:
         mlm_model = RobertaForMaskedLM.from_pretrained(
             hparams.pretrained_model, output_hidden_states=True
         )
         self.model = mlm_model.roberta
         self.lm_head = mlm_model.lm_head
Example #18
def main(args):

    # Import the custom trained tokenizer
    tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer)

    # Define the model
    config = RobertaConfig(vocab_size=32000)
    model = RobertaForMaskedLM(config=config)

    # Import the dataset
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=args.data,
        block_size=128,
    )

    # Initialize the data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    # Set all of the training arguments
    training_args = TrainingArguments(
        output_dir=args.output,
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_gpu_train_batch_size=24,
        save_steps=10_000,
        save_total_limit=10,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()

    # Save the model
    trainer.save_model("./roBERTaCODE_{}_{}".format(args.language, args.size))
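A hedged sketch of the CLI wiring this script presumably relies on; the flag names are inferred from the attributes of args used above and are assumptions, not the original parser.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer", help="directory of the trained RobertaTokenizerFast")
    parser.add_argument("--data", help="text file consumed by LineByLineTextDataset")
    parser.add_argument("--output", help="output_dir for TrainingArguments")
    parser.add_argument("--language", help="only used in the saved model name")
    parser.add_argument("--size", help="only used in the saved model name")
    main(parser.parse_args())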
Example #19
 def create_and_check_roberta_for_masked_lm(self, config, input_ids,
                                            token_type_ids, input_mask,
                                            sequence_labels, token_labels,
                                            choice_labels):
     model = RobertaForMaskedLM(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids,
                    attention_mask=input_mask,
                    token_type_ids=token_type_ids,
                    labels=token_labels)
     self.parent.assertListEqual(
         list(result["logits"].size()),
         [self.batch_size, self.seq_length, self.vocab_size])
     self.check_loss_output(result)
Example #20
    def test_tokenize(self):
        current_dir = os.path.dirname(os.path.realpath(__file__))
        vocab_path = os.path.join(current_dir, 'data', 'vocab.txt')
        tokenized_smiles = [
            12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19, 18, 33, 17,
            16, 18, 23, 181, 17, 22, 19, 18, 17, 19, 16, 33, 20, 19, 55, 17,
            16, 38, 23, 18, 17, 33, 17, 19, 18, 35, 20, 19, 18, 16, 20, 22, 16,
            16, 22, 16, 21, 23, 20, 23, 22, 16, 23, 22, 16, 21, 23, 18, 19, 16,
            20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 13
        ]

        model = RobertaForMaskedLM.from_pretrained(
            'seyonec/SMILES_tokenized_PubChem_shard00_50k')
        model.num_parameters()

        tokenizer = SmilesTokenizer(
            vocab_path, max_len=model.config.max_position_embeddings)

        assert tokenized_smiles == tokenizer.encode(
            "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"
        )
Example #21
def test_sequence():
    result_dir = "../results/"

    tok_dir = "tokenizer_model/"
    tokenizer = train.get_tok(tok_dir)

    csv_path = "../utils/test.csv"
    txt_name = "test.txt"
    utils.make_train_txt(csv_path, txt_name)

    dp = DP("../utils/test.csv")
    param = dp.param.to_numpy()[-7:]
    param = param.tolist()
    param.append("probability")

    mod_dir = "transformer_model/checkpoint-33000/"
    model = RobertaForMaskedLM.from_pretrained(mod_dir)
    fill_mask = pipeline("fill-mask",
                         model=model,
                         tokenizer=tokenizer,
                         device=0)

    PD, PB, ill, count = start(fill_mask)

    denorm = dp.denormalize(ill)

    l = []
    for x in range(len(PD)):
        tmp = []

        for j in range(7):
            tmp.append(random.randrange(denorm[x][j][0], denorm[x][j][1] + 1))
        tmp.append(PB[x])

        l.append(tmp)

    df = pd.DataFrame(l, columns=param)
    df.to_csv(result_dir + "result.csv", index=False, encoding="cp949")
Example #22
def train_mod(txt_dir, tokenizer, model_dir):
    config = RobertaConfig(
        vocab_size=3305,
        max_position_embeddings=1024,
        num_attention_heads=12,
        num_hidden_layers=6,
        output_attentions=True,
        type_vocab_size=1,
    )

    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    file_path=txt_dir,
                                    block_size=1024)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=model_dir,
        overwrite_output_dir=True,
        num_train_epochs=1000,
        per_gpu_train_batch_size=16,
        save_steps=1000,
        save_total_limit=37,
        prediction_loss_only=True,
    )

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset)

    trainer.train()

    trainer.save_model(model_dir)
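A hedged usage sketch; the tokenizer directory and text file name are assumptions borrowed from Example #21 above, which appears to belong to the same project.

from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("tokenizer_model/")  # assumed tokenizer directory
train_mod("test.txt", tokenizer, "transformer_model/")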
Example #23
    def train(self):
        if self.has_started():
            last_checkpoint = self.get_latest_checkpoint()
            logger.info(f"Resuming training from: {last_checkpoint}")

            model = AutoModelForMaskedLM.from_pretrained(last_checkpoint,
                                                         config=self.config)

        else:
            model = RobertaForMaskedLM(config=self.config)

        trainer = Trainer(
            model=model,
            args=self.training_args,
            data_collator=self.data_collator,
            train_dataset=self.dataset,
            prediction_loss_only=True,
        )

        trainer.train()

        trainer.save_model(f"{self.model_dir}")
        self.upload()
Example #24
def evaluate(args):
    """
    Args:
        ckpt: model checkpoints.
        hparams_file: the string should end with "hparams.yaml"
    """
    trainer = Trainer(gpus=args.gpus,
                      distributed_backend=args.distributed_backend,
                      deterministic=True)

    # reload test dataloader
    # print(trainer.test())
    print("path_to_model_checkpoint", args.path_to_model_checkpoint)
    # print(BertForQA)

    model = BertForQA.load_from_checkpoint(
        checkpoint_path=args.path_to_model_checkpoint,
        hparams_file=args.path_to_model_hparams_file,
        map_location=None,
        batch_size=args.eval_batch_size,
    )

    mlm_model = RobertaForMaskedLM.from_pretrained(
        './cached_models/roberta_squad1_covidmlm(train_and_dev)_3epoch/')
    model.model.roberta.load_state_dict(mlm_model.roberta.state_dict())

    # mlm_model = RobertaForMaskedLM.from_pretrained('./cached_models/roberta_squad1_2epoch_covidmlm_3epoch/')
    # model.model.roberta.load_state_dict(mlm_model.roberta.state_dict())
    # # evaluate ner
    # model = BertForNERTask.load_from_checkpoint(
    #     checkpoint_path=args.path_to_model_checkpoint,
    #     hparams_file=args.path_to_model_hparams_file,
    #     map_location=None,
    #     batch_size=args.eval_batch_size
    # )

    trainer.test(model=model)
Example #25
# %%
import torch
import string

from transformers import RobertaTokenizer, RobertaForMaskedLM
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])


def encode(tokenizer, text_sentence, add_special_tokens=True):
    text_sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
    # if <mask> is the last token, append a "." so that models don't predict punctuation.
    if tokenizer.mask_token == text_sentence.split()[-1]:
        text_sentence += ' .'

    input_ids = torch.tensor([
        tokenizer.encode(text_sentence, add_special_tokens=add_special_tokens)
    ])
    mask_idx = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
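The snippet is cut off inside encode(); a hedged sketch of the usual continuation, assuming encode() ends with "return input_ids, mask_idx" and that the made-up probe sentence contains a single <mask>.

# Assumption: encode() returns (input_ids, mask_idx); its return statement is truncated above.
input_ids, mask_idx = encode(roberta_tokenizer, "The capital of France is <mask>.")
with torch.no_grad():
    logits = roberta_model(input_ids)[0]
top_ids = torch.topk(logits[0, mask_idx, :], k=top_k).indices.tolist()
print(decode(roberta_tokenizer, top_ids, top_clean=top_k))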
Example #26
file_in = sys.argv[1]
file_out = sys.argv[2]

all_data_dict = dict()
max_length = 100
tail_hidd_list = list()
#device = "cpu"
device = "cuda"

pretrained_weights = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_weights)

fine_tuned_weight = 'roberta-base'
model = RobertaForMaskedLM.from_pretrained(pretrained_weights,
                                           output_hidden_states=True,
                                           return_dict=True)
#model.load_state_dict(torch.load(fine_tuned_weight), strict=False)

#model.to(device).half()
model.to(device)
model.eval()

num_samples = 1000000

old = torch.FloatTensor(768)
with open(file_in) as f:
    #data = json.load(f)
    for index, d in tqdm(enumerate(f)):
        if index == 1000000:
            break
Example #27
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e10_b16', config=config)
    #model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.

elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM

    config = RobertaConfig(vocab_size=50265,
                           max_position_embeddings=514,
                           num_attention_heads=12,
                           num_hidden_layers=12,
                           type_vocab_size=1,
                           )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture

elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel

    config = XLMConfig(vocab_size=64139,
                       emb_dim=1024,
                       max_position_embeddings=512,
                       n_heads=8,
                       n_layers=6,
                       )

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config)
    # 6-layer, 1024-hidden, 8-heads
Example #28
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Prepare model
    class tempmodel(nn.Module):
        def __init__(self, roberta, insert_net, delete_net):
            super().__init__()
            self.roberta = roberta
            self.insert_net = insert_net
            self.delete_net = delete_net

    roberta = RobertaForMaskedLM.from_pretrained("roberta-base")
    insert_net = nn.Linear(768, 3)
    delete_net = nn.Linear(768, 3)
    roberta.load_state_dict(
        torch.load(os.path.join(args.from_dir, 'bert_model.bin')))
    # if args.delete:
    #     insert_net.load_state_dict(torch.load(os.path.join(args.from_dir, 'insert_model.bin')))
    # else:
    from weight_init import weight_init
    insert_net.apply(weight_init)
    delete_net.apply(weight_init)
    # init CTRL code
    roberta.roberta.embeddings.word_embeddings.weight[
        50261, :].data = roberta.roberta.embeddings.word_embeddings.weight[
            0, :].data
    roberta.roberta.embeddings.word_embeddings.weight[
Example #29
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("For it is in reality vain to profess"))

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained(SAVE_MODEL, max_len=512)
model = RobertaForMaskedLM(config=config)

print(model.num_parameters())

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=PATH + "/kant.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)
training_args = TrainingArguments(
    output_dir=SAVE_MODEL,
    overwrite_output_dir=True,
Example #30
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta = RobertaForMaskedLM.from_pretrained('roberta-base', state_dict=state_dict)

    # # Initializing a RoBERTa configuration
    # config = RobertaConfig.from_pretrained('roberta-base')
    # # Initializing a model from the configuration
    # roberta = RobertaForMaskedLM(config)
    # checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    # state_dict = torch.load(checkpoint_path)["model"]
    # roberta.load_state_dict(state_dict)

    roberta = HappyROBERTA('roberta-large')

    config = RobertaConfig.from_pretrained('roberta-large')
    mlm = RobertaForMaskedLM(config)
    #checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/save_step_92160/checkpoint.pt'
    #checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/save_step_230400/checkpoint.pt'
    #checkpoint_path = '/home/rahul/common_sense_embedding_analysis/data/finetune_data/roberta-base/roberta_base_best_sample_from_sets/checkpoint.pt'
    checkpoint_path = '../data/finetune_data/roberta-large/save_step_57000/checkpoint.pt'
    state_dict = torch.load(checkpoint_path)["model"]
    mlm.load_state_dict(state_dict)
    mlm.eval()

    roberta.mlm = mlm

    fictitious_entities = proc.generate_pairs_of_random_strings(
        number_of_pairs=100, min_length=3, max_length=12, character_set=chars)
    with open("../data/truism_data/physical_data_sentences_2.json", "r") as f:
        physical_sents = json.load(f)

    with open("../data/truism_data/physical_data_2.json", "r") as f:
        physical_config = json.load(f)

    with open("../data/finetune_data/sample_from_sets/test_keys.json",
              "r") as f:
        test_keys = json.load(f)

    phy_filtered = {}
    for key in test_keys['phy']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in phy_filtered.keys():
            phy_filtered[index] = {}
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][
                ling_pert][asym_pert]
        elif ling_pert not in phy_filtered[index].keys():
            phy_filtered[index][ling_pert] = {}
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][
                ling_pert][asym_pert]
        else:
            phy_filtered[index][ling_pert][asym_pert] = physical_sents[index][
                ling_pert][asym_pert]
    # physical_sents = {k: physical_sents[k] for k in ('11', '16')}
    # physical_config  = {k: physical_config[k] for k in ('11', '16')}

    logger.info("finished reading in physical data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=phy_filtered,
                             config=physical_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/physical_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving physical dataset results")

    with open("../data/truism_data/material_data_sentences_2.json", "r") as f:
        material_sents = json.load(f)

    with open("../data/truism_data/material_data_2.json", "r") as f:
        material_config = json.load(f)

    mat_filtered = {}
    for key in test_keys['mat']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in mat_filtered.keys():
            mat_filtered[index] = {}
            mat_filtered[index][ling_pert] = {}
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][
                ling_pert][asym_pert]
        elif ling_pert not in mat_filtered[index].keys():
            mat_filtered[index][ling_pert] = {}
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][
                ling_pert][asym_pert]
        else:
            mat_filtered[index][ling_pert][asym_pert] = material_sents[index][
                ling_pert][asym_pert]

    logger.info("finished reading in material data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=mat_filtered,
                             config=material_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/material_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving material dataset results")

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)

    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)

    soc_filtered = {}
    for key in test_keys['soc']:
        index = key.split("-")[0]
        ling_pert = key.split("-")[1]
        asym_pert = key.split("-")[2]
        if index not in soc_filtered.keys():
            soc_filtered[index] = {}
            soc_filtered[index][ling_pert] = {}
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][
                ling_pert][asym_pert]
        elif ling_pert not in soc_filtered[index].keys():
            soc_filtered[index][ling_pert] = {}
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][
                ling_pert][asym_pert]
        else:
            soc_filtered[index][ling_pert][asym_pert] = social_sents[index][
                ling_pert][asym_pert]

    logger.info("finished reading in social data")

    output_df = run_pipeline(model=roberta,
                             tokenizer=tokenizer,
                             fictitious_entities=fictitious_entities,
                             sentences=soc_filtered,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)

    output_df.to_csv(
        "../data/masked_word_result_data/roberta/sample_from_set/social_perf_ft19_new_{}.csv"
        .format(number_of_entity_trials),
        index=False)

    logger.info("finished saving social dataset results")
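The three *_filtered blocks above repeat the same nesting logic; a compact helper of this shape (a sketch, not part of the original script) would build identical dictionaries.

def filter_sentences(keys, sentences):
    # Builds {index: {ling_pert: {asym_pert: sentence}}}, matching the loops above.
    filtered = {}
    for key in keys:
        index, ling_pert, asym_pert = key.split("-")[:3]
        filtered.setdefault(index, {}).setdefault(ling_pert, {})[asym_pert] = (
            sentences[index][ling_pert][asym_pert])
    return filtered

# e.g. phy_filtered = filter_sentences(test_keys['phy'], physical_sents)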