Example #1
    def __init__(self, args, config, dataloader, ckpdir):

        self.device = torch.device('cuda') if (
            args.gpu and torch.cuda.is_available()) else torch.device('cpu')
        if torch.cuda.is_available(): print('[Runner] - CUDA is available!')
        self.model_kept = []
        self.global_step = 1
        self.log = SummaryWriter(ckpdir)

        self.args = args
        self.config = config
        self.dataloader = dataloader
        self.ckpdir = ckpdir

        # optimizer
        self.learning_rate = float(config['optimizer']['learning_rate'])
        self.warmup_proportion = config['optimizer']['warmup_proportion']
        self.gradient_accumulation_steps = config['optimizer'][
            'gradient_accumulation_steps']
        self.gradient_clipping = config['optimizer']['gradient_clipping']

        # Training details
        self.apex = config['runner']['apex']
        self.total_steps = config['runner']['total_steps']
        self.log_step = config['runner']['log_step']
        self.save_step = config['runner']['save_step']
        self.duo_feature = config['runner']['duo_feature']
        self.max_keep = config['runner']['max_keep']

        # Model configs
        self.semantic_config = RobertaConfig(**config['semantic'])
        self.acoustic_config = RobertaConfig(**config['acoustic'])
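The constructor above only reads a handful of keys from `config`, so a minimal configuration could look like the sketch below. This is an assumption for illustration: the keys mirror exactly what the constructor accesses, but the concrete values are not taken from the original project.

# Minimal config sketch for the runner above; all values are illustrative assumptions.
config = {
    'optimizer': {
        'learning_rate': 2e-4,
        'warmup_proportion': 0.07,
        'gradient_accumulation_steps': 1,
        'gradient_clipping': 1.0,
    },
    'runner': {
        'apex': False,
        'total_steps': 500000,
        'log_step': 2500,
        'save_step': 10000,
        'duo_feature': False,
        'max_keep': 5,
    },
    # These two dicts are passed straight into RobertaConfig(**...)
    'semantic': {'vocab_size': 30522, 'num_hidden_layers': 6},
    'acoustic': {'hidden_size': 768, 'num_hidden_layers': 3},
}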
Example #2
    def __init__(self, config, num=0):
        super(roBerta, self).__init__()
        model_config = RobertaConfig()
        model_config.vocab_size = config.vocab_size
        model_config.hidden_size = config.hidden_size[0]
        model_config.num_attention_heads = 16
        # how the classification loss is computed
        self.loss_method = config.loss_method
        self.multi_drop = config.multi_drop

        self.roberta = RobertaModel(model_config)
        if config.requires_grad:
            for param in self.roberta.parameters():
                param.requires_grad = True

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.hidden_size = config.hidden_size[num]
        if self.loss_method in ['binary', 'focal_loss', 'ghmc']:
            self.classifier = nn.Linear(self.hidden_size, 1)
        else:
            # assumed: config.num_labels holds the number of target classes
            # (self.num_labels was never defined in the original snippet)
            self.classifier = nn.Linear(self.hidden_size, config.num_labels)
        self.text_linear = nn.Linear(config.embeding_size,
                                     config.hidden_size[0])
        self.vocab_layer = nn.Linear(config.hidden_size[0], config.vocab_size)

        self.classifier.apply(self._init_weights)
        self.roberta.apply(self._init_weights)
        self.text_linear.apply(self._init_weights)
        self.vocab_layer.apply(self._init_weights)
    def roberta_build(self,
                      sparse=False,
                      base_model=None,
                      density=1.0,
                      eval=True):
        if base_model is None:
            config = RobertaConfig(
                vocab_size=52_000,
                max_position_embeddings=514,
                num_attention_heads=12,
                num_hidden_layers=6,
                type_vocab_size=1,
            )

            model = RobertaForMaskedLM(config=config).cuda()
        else:
            model = base_model

        if sparse:
            mp = BlockSparseModelPatcher()
            mp.add_pattern(
                r"roberta\.encoder\.layer\.[0-9]+\.intermediate\.dense",
                {"density": density})
            mp.add_pattern(r"roberta\.encoder\.layer\.[0-9]+\.output\.dense",
                           {"density": density})
            mp.patch_model(model)

        if eval:
            model.eval()

        return model, model.num_parameters()
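A hedged usage sketch for the builder above. It assumes the method belongs to a helper object, exposed here under the hypothetical name `builder`; the 0.5 density is illustrative.

# Sketch only: compare a dense model against a 50%-density block-sparse variant.
# `builder` is a hypothetical instance of the class that owns roberta_build above.
dense_model, dense_params = builder.roberta_build(sparse=False, eval=True)
sparse_model, sparse_params = builder.roberta_build(sparse=True, density=0.5, eval=True)
print(f"dense: {dense_params:,} parameters, block-sparse: {sparse_params:,} parameters")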
Example #4
    def __init__(self,
                 data_dir: Path,
                 tokenizer: PreTrainedTokenizer,
                 dataset: Dataset,
                 local_rank=-1):
        assert data_dir, "data_dir input needed"

        self.model_dir = f"{data_dir}/results"
        self.dataset = dataset

        self.config = RobertaConfig(
            vocab_size=52_000,
            max_position_embeddings=514,
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
        self.training_args = TrainingArguments(
            run_name=data_dir.name,
            local_rank=local_rank,
            learning_rate=0.00005,  # default 0.00005
            output_dir=f"{self.model_dir}",
            overwrite_output_dir=False,
            num_train_epochs=1,
            per_device_train_batch_size=48,  # Nvidia K80 99%
            seed=42,
            save_steps=10_000,
            save_total_limit=1,
        )

        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
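To actually run masked-language-model pretraining with the pieces initialised above, a model and a Trainer still have to be built. A minimal sketch follows; the method name `train` and the use of `self.dataset` as the training set are assumptions, not part of the original snippet.

    def train(self):
        # Sketch only (assumed method name): build a fresh RoBERTa MLM model
        # and train it on the dataset passed to the constructor.
        from transformers import RobertaForMaskedLM, Trainer

        model = RobertaForMaskedLM(config=self.config)
        trainer = Trainer(
            model=model,
            args=self.training_args,
            data_collator=self.data_collator,
            train_dataset=self.dataset,
        )
        trainer.train()
        trainer.save_model(self.model_dir)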
Example #5
 def __init__(self, cfg, device):
     super().__init__()
     tokenizer = RobertaTokenizerFast.from_pretrained('./bird_bpe_vocab', max_len=256)
     _config = RobertaConfig(
         vocab_size=tokenizer._tokenizer.get_vocab_size(),
         hidden_size=512,
         num_hidden_layers=4,
         num_attention_heads=8,
         max_position_embeddings=256,
         pad_token_id=1,
         eos_token_id=0,
         bos_token_id=2,
         output_attentions=False,
         output_hidden_states=False
     )
     _model = RobertaForMaskedLM(_config)
     _model.load_state_dict(torch.load('bert_small/checkpoint-1100/pytorch_model.bin'))
     _model.eval()
     self.tokenizer = tokenizer
     self._model = _model
     self.device = device
     self.pad_token = 0
     self.batch_size = cfg.batch_size
     self.proj = None
     if cfg.proj_lang:
         self.proj = nn.Sequential(*[EqualisedLinearLayer(512, cfg.latent_dim, weight_scaling=cfg.weight_scaling), nn.Tanh()])
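Since the wrapper above keeps both the tokenizer and the masked-LM model, a text-encoding pass could look roughly like the sketch below. The method name `encode_text` and the use of the underlying `roberta` encoder to obtain token embeddings are assumptions.

    def encode_text(self, captions):
        # Sketch only: tokenize a batch of captions and return contextual embeddings.
        # Assumes self._model has already been moved to self.device.
        enc = self.tokenizer(captions, padding=True, truncation=True,
                             max_length=256, return_tensors='pt').to(self.device)
        with torch.no_grad():
            hidden = self._model.roberta(**enc).last_hidden_state  # (batch, seq, 512)
        if self.proj is not None:
            hidden = self.proj(hidden)
        return hidden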
Example #6
def create_roberta_model(tokens_train, attn_mask_train, num_classes):

    config = RobertaConfig(vocab_size=50021, hidden_size=1024,
                           num_hidden_layers=16, num_attention_heads=16, intermediate_size=2048, 
                           attention_probs_dropout_prob=0.3, hidden_dropout_prob=0.3)
    
    bert = TFRobertaModel(config)

    # dense1 = Dense(500, activation='relu')
    dense2 = Dense(368, activation='relu')
    dense3 = Dense(num_classes, activation='softmax')
    dropout = Dropout(0.3)
    
    tokens = Input(shape=(tokens_train.shape[1],), dtype=tf.int32)
    attn_mask = Input(shape=(attn_mask_train.shape[1],), dtype=tf.int32)

    pooled_output = bert(tokens, attn_mask).pooler_output

    med = dropout(dense2(pooled_output))

    # the softmax head takes the dense+dropout features;
    # in the original snippet `med` was computed but never used
    final = dense3(med)

    model = Model(inputs=[tokens, attn_mask], outputs=final)
    
    return model
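A hedged usage sketch for the builder above. The optimizer, loss, and the one-hot `labels_train` array are assumptions; the original snippet only returns the uncompiled model.

# Sketch only: compile and fit the model returned by create_roberta_model.
model = create_roberta_model(tokens_train, attn_mask_train, num_classes)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # matches the softmax output layer
              metrics=['accuracy'])
model.fit([tokens_train, attn_mask_train], labels_train,
          validation_split=0.1, epochs=3, batch_size=16)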
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length],
                               self.vocab_size)

        attention_mask = None
        if self.use_attention_mask:
            attention_mask = random_attention_mask(
                [self.batch_size, self.seq_length])

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length],
                                        self.type_vocab_size)

        config = RobertaConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            is_decoder=False,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, token_type_ids, attention_mask
Example #8
        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = RobertaConfig(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    def __init__(
        self,
        pretrained_model_name=None,
        config_filename=None,
        vocab_size=None,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        max_position_embeddings=512,
    ):
        super().__init__()

        # Check that only one of pretrained_model_name, config_filename, and
        # vocab_size was passed in
        total = 0
        if pretrained_model_name is not None:
            total += 1
        if config_filename is not None:
            total += 1
        if vocab_size is not None:
            total += 1

        if total != 1:
            raise ValueError(
                "Only one of pretrained_model_name, vocab_size, "
                + "or config_filename should be passed into the "
                + "ROBERTA constructor."
            )

        # TK: The following code checks the same once again.
        if vocab_size is not None:
            config = RobertaConfig(
                vocab_size_or_config_json_file=vocab_size,
                vocab_size=vocab_size,
                hidden_size=hidden_size,
                num_hidden_layers=num_hidden_layers,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_act=hidden_act,
                max_position_embeddings=max_position_embeddings,
            )
            model = RobertaModel(config)
        elif pretrained_model_name is not None:
            model = RobertaModel.from_pretrained(pretrained_model_name)
        elif config_filename is not None:
            config = RobertaConfig.from_json_file(config_filename)
            model = RobertaModel(config)
        else:
            raise ValueError(
                "Either pretrained_model_name or vocab_size must" + " be passed into the ROBERTA constructor"
            )

        model.to(self._device)

        self.add_module("roberta", model)
        self.config = model.config
        self._hidden_size = model.config.hidden_size
Example #10
    def __init__(self, dropout):
        super(ROBERTAModel, self).__init__()

        self.roberta = RobertaModel.from_pretrained(
            config.PATHS['roberta'],
            config=RobertaConfig())

        self.fc = nn.Linear(768, 2)
        self.dropout = nn.Dropout(dropout)
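The head above is a two-way classifier on RoBERTa's pooled output; a forward pass might look like the following sketch. The method body and the use of `pooler_output` are assumptions, since the original only shows the constructor.

    def forward(self, input_ids, attention_mask):
        # Sketch only: pooled representation -> dropout -> 2-way logits.
        out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled = out.pooler_output          # shape: (batch, 768)
        return self.fc(self.dropout(pooled))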
Example #11
def train(no_cache: bool, dataset_path: str, data_config_name: str,
          training_args: TrainingArguments, tokenizer: RobertaTokenizerFast):

    print(f"tokenizer vocab size: {tokenizer.vocab_size}")

    print(f"\nLoading datasets found in {dataset_path}.")
    train_dataset, eval_dataset, test_dataset = load_dataset(
        'EMBO/biolang',
        data_config_name,
        data_dir=dataset_path,
        split=["train", "validation", "test"],
        # download_mode=GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS,
        cache_dir=CACHE)

    if data_config_name != "MLM":
        data_collator = DataCollatorForTargetedMasking(
            tokenizer=tokenizer, max_length=config.max_length)
    else:
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=True)

    print(f"\nTraining with {len(train_dataset)} examples.")
    print(f"Evaluating on {len(eval_dataset)} examples.")

    if config.from_pretrained:
        model = RobertaForMaskedLM.from_pretrained(config.from_pretrained)
    else:
        model_config = RobertaConfig(
            vocab_size=config.vocab_size,
            max_position_embeddings=config.max_length + 2,  # +2 for start/end tokens
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
        model = RobertaForMaskedLM(config=model_config)

    training_args.remove_unused_columns = False  # we need pos_mask and special_tokens_mask in collator

    print("\nTraining arguments:")
    print(training_args)

    trainer = MyTrainer(model=model,
                        args=training_args,
                        data_collator=data_collator,
                        train_dataset=train_dataset,
                        eval_dataset=eval_dataset,
                        compute_metrics=compute_metrics,
                        callbacks=[ShowExample(tokenizer)])

    print(f"CUDA available: {torch.cuda.is_available()}")
    trainer.train()
    trainer.save_model(training_args.output_dir)

    print(f"Testing on {len(test_dataset)}.")
    pred: NamedTuple = trainer.predict(test_dataset, metric_key_prefix='test')
    print(f"{pred.metrics}")
def create_model():
    config = RobertaConfig(
        vocab_size=3437,
        max_position_embeddings=64,
        num_attention_heads=12,
        num_hidden_layers=8,
        type_vocab_size=1,
    )

    return RobertaForMaskedLM(config=config)
Example #13
    def __init__(self,
                 classifier_config_dir,
                 device,
                 task_type,
                 n_clf_layers=6,
                 use_dm=True,
                 use_pm=True,
                 use_rt=True,
                 use_bio=False,
                 use_name=False,
                 use_network=False,
                 use_count=False):
        super(ConcatenatedClassifier, self).__init__()
        # load text model
        self.device = device
        self.task_type = task_type
        self.use_text = use_dm | use_pm | use_rt
        self.use_bio = use_bio
        self.use_name = use_name
        self.use_etc = use_network | use_count
        self.text_model = RobertaModel.from_pretrained(
            "vinai/bertweet-base",
            output_attentions=False,
            output_hidden_states=False)
        if self.use_name:
            self.charEmbedding = nn.Embedding(
                num_embeddings=302, embedding_dim=300,
                padding_idx=301)  # 302: 300-top frequent + pad + unk
            self.conv3 = nn.Conv1d(in_channels=300,
                                   out_channels=256,
                                   kernel_size=3,
                                   padding=1)
            self.conv4 = nn.Conv1d(in_channels=300,
                                   out_channels=256,
                                   kernel_size=4,
                                   padding=1)
            self.conv5 = nn.Conv1d(in_channels=300,
                                   out_channels=256,
                                   kernel_size=5,
                                   padding=1)

        # load classifier for combining these features
        config = RobertaConfig()
        config = config.from_json_file(classifier_config_dir)
        config.num_hidden_layers = n_clf_layers
        config.num_attention_heads = n_clf_layers
        config.max_position_embeddings = 7
        if self.use_bio:
            config.max_position_embeddings += 2
        if self.use_name:
            config.max_position_embeddings += 4
        self.concat_model = RobertaModel(config)
        self.classifier = ClassifierLayer(use_count=use_count,
                                          use_network=use_network)
        return
Example #14
 def __init__(self):
     super(ReviewModel, self).__init__()
     tokenizer = RobertaTokenizer(
             vocab_file = Constants.VOCAB_FILE,
             merges_file = Constants.MERGES_FILE,
             add_prefix_space = True
         )
     config = RobertaConfig(output_hidden_states = True)
     self.backbone = RobertaModel(config)
     self.backbone.resize_token_embeddings(len(tokenizer))
     self.fc = nn.Linear(in_features = config.hidden_size, out_features = 1, bias = True)
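Because the backbone is configured with output_hidden_states=True, a plausible forward pass would pool one of the returned hidden states before the single-unit head. The sketch below assumes masked mean pooling over the last layer; the original snippet does not show the forward method.

    def forward(self, input_ids, attention_mask):
        # Sketch only: mean-pool the last hidden state and regress a single score.
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = out.hidden_states[-1]                  # (batch, seq, hidden)
        mask = attention_mask.unsqueeze(-1).float()
        pooled = (last_hidden * mask).sum(1) / mask.sum(1)   # masked mean pooling
        return self.fc(pooled)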
Example #15
    def __init__(self, num_classes, model_name) -> None:
        super(bertCRF, self).__init__()

        if model_name == "bert-base-cased-crf":
            self.bert = BertModel(BertConfig())
        if model_name == "roberta-base-crf":
            self.bert = RobertaModel(RobertaConfig())

        self.dropout = nn.Dropout(0.1)
        self.position_wise_ff = nn.Linear(768, num_classes)
        self.crf = CRF(num_classes)
Example #16
    def __init__(self, ckpt_path):
        super().__init__()
        # First reinitialize the model
        ckpt_states = torch.load(ckpt_path, map_location='cpu')
        self.acoustic_config = RobertaConfig(
            **ckpt_states['Settings']['Config']['acoustic'])
        self.semantic_config = RobertaConfig(
            **ckpt_states['Settings']['Config']['semantic'])

        acoustic_model = AcousticModel(self.acoustic_config)
        semantic_model = RobertaModel(self.semantic_config,
                                      add_pooling_layer=False)

        # load the model from pretrained states
        self.acoustic_model = self.load_model(acoustic_model,
                                              ckpt_states['acoustic_model'],
                                              'acoustic.')
        self.semantic_model = self.load_model(semantic_model,
                                              ckpt_states['semantic_model'],
                                              'roberta.')
Example #17
 def test_TFRobertaForTokenClassification(self):
     from transformers import RobertaConfig, TFRobertaForTokenClassification
     keras.backend.clear_session()
     # pretrained_weights = 'roberta-base'
     tokenizer_file = 'roberta_roberta-base.pickle'
     tokenizer = self._get_tokenzier(tokenizer_file)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     config = RobertaConfig()
     model = TFRobertaForTokenClassification(config)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files))
 def get_config(self):
     return RobertaConfig(
         vocab_size=self.vocab_size,
         hidden_size=self.hidden_size,
         num_hidden_layers=self.num_hidden_layers,
         num_attention_heads=self.num_attention_heads,
         intermediate_size=self.intermediate_size,
         hidden_act=self.hidden_act,
         hidden_dropout_prob=self.hidden_dropout_prob,
         attention_probs_dropout_prob=self.attention_probs_dropout_prob,
         max_position_embeddings=self.max_position_embeddings,
         type_vocab_size=self.type_vocab_size,
         initializer_range=self.initializer_range,
     )
Example #19
def get_config(vocab_size):
    if transformer_type == 'roberta':
        return RobertaConfig(
            vocab_size=vocab_size,
            max_position_embeddings=514,
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
    return BertConfig(
        vocab_size=vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )
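A hedged usage sketch for the helper above. It assumes `transformer_type` is defined at module scope as the function implies, and that the config feeds a matching masked-LM model; the vocabulary size is illustrative.

# Sketch only: build the masked-LM model that matches the selected config.
from transformers import BertForMaskedLM, RobertaForMaskedLM

config = get_config(vocab_size=30_000)
if transformer_type == 'roberta':
    model = RobertaForMaskedLM(config)
else:
    model = BertForMaskedLM(config)
print(f"{model.num_parameters():,} parameters")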
Example #20
def get_roberta_model():

    # Initializing a RoBERTa configuration
    configuration = RobertaConfig()

    # Initializing a model from the pretrained roberta-base checkpoint
    # (from_pretrained is a classmethod, so building an instance from the
    # blank configuration first is unnecessary and would be discarded anyway)
    Roberta_Model = RobertaModel.from_pretrained("roberta-base")
    Roberta_Model.to(device)

    # Accessing the model configuration
    configuration = Roberta_Model.config

    #get the Roberta Tokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    return Roberta_Model, tokenizer, configuration
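A minimal usage sketch for the helper above. The sample sentence is illustrative, and it assumes `device` is a module-level torch.device, as the function itself does.

# Sketch only: encode one sentence and inspect the contextual embeddings.
import torch

model, tokenizer, configuration = get_roberta_model()
inputs = tokenizer("RoBERTa produces contextual embeddings.", return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, seq_len, configuration.hidden_size)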
Example #21
def main(args):
    data = np.load(args.data, allow_pickle=True)
    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
    )

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    dataset = PhoneDatasetMLM(data, tokenizer)

    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
Example #22
def get_config(args):
    config = {
        "model_type": "roberta",
        "attention_probs_dropout_prob": 0.1,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.3,
        "hidden_size": wandb.config.hidden_size,
        "initializer_range": 0.02,
        "num_attention_heads": wandb.config.num_attention_heads,
        "num_hidden_layers": wandb.config.num_hidden_layers,
        "vocab_size": args.vocab_size,
        "intermediate_size": wandb.config.intermediate_size,
        "max_position_embeddings": 1024,
        "cache_dir": args.cache_dir
    }

    return RobertaConfig(**config)
Example #23
def build(config):

    tokenizer = RobertaTokenizerFast.from_pretrained(
                                        os.path.join(config.save_directory),
                                        max_len=config.max_length
                                        )

    model_config = RobertaConfig(
        vocab_size=config.vocab_size,
        max_position_embeddings=config.max_length,
        num_attention_heads=config.num_attention_heads,
        num_hidden_layers=config.num_hidden_layers,
        type_vocab_size=1
    )

    model = RobertaForMaskedLM(config=model_config)
    print("the number of parameters of model: ", model.num_parameters())

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=config.files,
        block_size=32
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=config.mlm_probability
    )

    training_args = TrainingArguments(
        output_dir=os.path.join(config.save_directory),
        overwrite_output_dir=config.overwrite_output_dir,
        num_train_epochs=config.num_train_epochs,
        per_gpu_train_batch_size=config.per_gpu_train_batch_size,
        save_steps=config.save_steps,
        save_total_limit=config.save_total_limit
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=config.prediction_loss_only
    )

    return trainer
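The builder returns a ready-to-run Trainer, so a usage sketch can be very short. Saving back into `config.save_directory` mirrors the output_dir used inside the builder; treating that as the final destination is an assumption about intended use.

# Sketch only: build, train, and persist the masked-LM model.
trainer = build(config)
trainer.train()
trainer.save_model(os.path.join(config.save_directory))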
Example #24
    def __init__(self, args):
        super(Model, self).__init__()
        args.out_size = len(args.dense_features)
        self.dropout = nn.Dropout(args.hidden_dropout_prob)
        self.args = args

        # Create the BERT model and load the pretrained weights
        config = RobertaConfig.from_pretrained(args.pretrained_model_path)
        config.output_hidden_states = True
        args.hidden_size = config.hidden_size
        args.num_hidden_layers = config.num_hidden_layers
        self.text_layer = RobertaModel.from_pretrained(
            args.pretrained_model_path, config=config)
        self.text_linear = nn.Linear(
            args.text_dim + args.vocab_dim_v1 * len(args.text_features),
            args.hidden_size)
        logger.info("Load linear from %s",
                    os.path.join(args.pretrained_model_path, "linear.bin"))
        self.text_linear.load_state_dict(
            torch.load(os.path.join(args.pretrained_model_path, "linear.bin")))
        logger.info("Load embeddings from %s",
                    os.path.join(args.pretrained_model_path, "embeddings.bin"))
        self.text_embeddings = nn.Embedding.from_pretrained(
            torch.load(os.path.join(args.pretrained_model_path,
                                    "embeddings.bin"))['weight'],
            freeze=True)
        args.out_size += args.hidden_size * 2

        # Create the decoder model with random initialization
        config = RobertaConfig()
        config.num_hidden_layers = 4
        config.intermediate_size = 2048
        config.hidden_size = 512
        config.num_attention_heads = 16
        config.vocab_size = 5
        self.text_layer_1 = RobertaModel(config=config)
        self.text_layer_1.apply(self._init_weights)
        self.text_linear_1 = nn.Linear(args.text_dim_1 + args.hidden_size, 512)
        self.text_linear_1.apply(self._init_weights)
        self.norm = nn.BatchNorm1d(args.text_dim_1 + args.hidden_size)
        args.out_size += 1024

        # Create the classifier head with random initialization
        self.classifier = ClassificationHead(args)
        self.classifier.apply(self._init_weights)
Example #25
def train_MLM(vocf, outmodel, data_df):
    bs = 8
    # tokenizer = BertWordPieceTokenizer(vocf)  # input vocab.txt
    ttk = BertTokenizer.from_pretrained(vocf)  # input vocab.txt
    with open(vocf) as fvoc:  # count the vocabulary entries
        vlen = len(fvoc.readlines())
    config = RobertaConfig(vocab_size=vlen, max_position_embeddings=12,
                           num_attention_heads=12, num_hidden_layers=6,
                           type_vocab_size=1, hidden_size=768)
    model = RobertaForMaskedLM(config=config)
    print('number of parameters:', model.num_parameters())
    
    dataset=tokDataset(data_df,ttk)
#     Data= DataLoader(dataset, batch_size=bs,shuffle=True,drop_last=False,num_workers=0,collate_fn=collate_fn)
#     data_collator = DataCollatorForLanguageModeling(
#         tokenizer=ttk, mlm=True, mlm_probability=0.15
#     )
   
    data_collator=collate_fn(
        tokenizer=ttk, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
            output_dir=outmodel,#embedding model path
            overwrite_output_dir=True,
            num_train_epochs=2,
            per_device_train_batch_size=bs,
            save_steps=10_000,
            save_total_limit=2,
            
        )

    trainer = Trainer(
        model=model,
        args=training_args,
        
        train_dataset=dataset,
        data_collator=data_collator,
        prediction_loss_only=True
    )
    trainer.train()
    trainer.save_model(outmodel)
    print('LM training done')
Example #26
def main(args):
    test_x = np.load(os.path.join(args.test_dir, "test_x.npy"),
                     allow_pickle=True)
    test_y = np.load(os.path.join(args.test_dir, "test_y.npy"),
                     allow_pickle=True)
    num_classes1 = len(np.unique(test_y))

    if args.test2_dir is not None:
        test_x2 = np.load(os.path.join(args.test2_dir, "test_x.npy"),
                          allow_pickle=True)
        test_y2 = np.load(os.path.join(args.test2_dir, "test_y.npy"),
                          allow_pickle=True)
        test_y2 += num_classes1
        test_x = np.concatenate((test_x, test_x2), axis=0)
        test_y = np.concatenate((test_y, test_y2), axis=0)

    num_classes = len(np.unique(test_y))

    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    test_dataset = PhoneRobertaDataset(test_x, test_y, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=args.heads,  # default 12
        num_hidden_layers=args.num_layers,  # default 6
        type_vocab_size=1,
        num_labels=num_classes)
    model = RobertaForSequenceClassification(config)
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(args.model))
    preds_all, labels_all = evaluate(model, device, test_loader)

    if args.test2_dir is not None:
        print("Evaluate on separate validation using the best model")
        evaluate_separate(preds_all, labels_all, num_classes1)
Example #27
def main(args):

    # Import the custom trained tokenizer
    tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer)

    # Define the model
    config = RobertaConfig(vocab_size=32000)
    model = RobertaForMaskedLM(config=config)

    # Import the dataset
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=args.data,
        block_size=128,
    )

    # Initialize the data collector
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    # Set all of the training arguments
    training_args = TrainingArguments(
        output_dir=args.output,
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_gpu_train_batch_size=24,
        save_steps=10_000,
        save_total_limit=10,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()

    # Save the model
    trainer.save_model("./roBERTaCODE_{}_{}".format(args.language, args.size))
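After training, the saved checkpoint can be sanity-checked with the fill-mask pipeline. In the sketch below, reusing `args.tokenizer`, `args.language`, and `args.size` mirrors the paths used above; the query string is purely illustrative.

# Sketch only: load the freshly saved model and fill in a masked token.
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./roBERTaCODE_{}_{}".format(args.language, args.size),
    tokenizer=args.tokenizer,
)
print(fill_mask("public static void <mask>(String[] args)"))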
Example #28
def get_plm_resources(plm, vocab_len):
    """load PLM resources such as model, tokenizer and config"""
    if plm == 'bert':
        bert_model = BertModel.from_pretrained('bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_config = BertConfig(vocab_size_or_config_json_file=vocab_len)
    elif plm == 'roberta':
        bert_model = RobertaModel.from_pretrained('roberta-base')
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        bert_config = RobertaConfig(vocab_size_or_config_json_file=vocab_len)
    elif plm == 'xlnet':
        bert_model = XLNetModel.from_pretrained('xlnet-base-cased')
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        bert_config = XLNetConfig(vocab_size_or_config_json_file=vocab_len)
    elif plm == 'distilbert':
        bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased')
        bert_config = DistilBertConfig(
            vocab_size_or_config_json_file=vocab_len)
    return bert_model, tokenizer, bert_config
Example #29
def train_mod(txt_dir, tokenizer, model_dir):
    config = RobertaConfig(
        vocab_size=3305,
        max_position_embeddings=1024,
        num_attention_heads=12,
        num_hidden_layers=6,
        output_attentions=True,
        type_vocab_size=1,
    )

    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    file_path=txt_dir,
                                    block_size=1024)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=model_dir,
        overwrite_output_dir=True,
        num_train_epochs=1000,
        per_gpu_train_batch_size=16,
        save_steps=1000,
        save_total_limit=37,
        prediction_loss_only=True,
    )

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset)

    trainer.train()

    trainer.save_model(model_dir)
Example #30
                        max_position_embeddings=512,
                        num_attention_heads=12,
                        num_hidden_layers=12,
                        #type_vocab_size=2, default is 2
                        )
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e10_b16', config=config)
    #model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.

elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM

    config = RobertaConfig(vocab_size=50265,
                           max_position_embeddings=514,
                           num_attention_heads=12,
                           num_hidden_layers=12,
                           type_vocab_size=1,
                           )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture

elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel

    config = XLMConfig(vocab_size=64139,
                       emb_dim=1024,
                       max_position_embeddings=512,
                       n_heads=8,
                       n_layers=6,
                       )