def roberta_build(self,
                      sparse=False,
                      base_model=None,
                      density=1.0,
                      eval=True):
        if base_model is None:
            config = RobertaConfig(
                vocab_size=52_000,
                max_position_embeddings=514,
                num_attention_heads=12,
                num_hidden_layers=6,
                type_vocab_size=1,
            )

            model = RobertaForMaskedLM(config=config).cuda()
        else:
            model = base_model

        if sparse:
            mp = BlockSparseModelPatcher()
            mp.add_pattern(
                r"roberta\.encoder\.layer\.[0-9]+\.intermediate\.dense",
                {"density": density})
            mp.add_pattern(r"roberta\.encoder\.layer\.[0-9]+\.output\.dense",
                           {"density": density})
            mp.patch_model(model)

        if eval:
            model.eval()

        return model, model.num_parameters()
Example #2
def train_MLM(vocf,outmodel,data_df):
    bs=8
    #tokenizer=BertWordPieceTokenizer(vocf)#input vocab.txt
    ttk=BertTokenizer.from_pretrained(vocf)#input vocab.txt
    with open(vocf) as fvoc:  # count vocabulary entries (one token per line)
        vlen = len(fvoc.readlines())
    config=RobertaConfig(vocab_size=vlen,max_position_embeddings=12,num_attention_heads=12, \
                             num_hidden_layers=6,type_vocab_size=1,hidden_size=768)
    model=RobertaForMaskedLM(config=config)
    model.num_parameters()
    
    dataset=tokDataset(data_df,ttk)
#     Data= DataLoader(dataset, batch_size=bs,shuffle=True,drop_last=False,num_workers=0,collate_fn=collate_fn)
#     data_collator = DataCollatorForLanguageModeling(
#         tokenizer=ttk, mlm=True, mlm_probability=0.15
#     )
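    # Note: collate_fn is assumed to be a project-specific collator defined elsewhere that
    # mirrors the DataCollatorForLanguageModeling interface shown in the commented-out lines above.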
   
    data_collator=collate_fn(
        tokenizer=ttk, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
            output_dir=outmodel,#embedding model path
            overwrite_output_dir=True,
            num_train_epochs=2,
            per_device_train_batch_size=bs,
            save_steps=10_000,
            save_total_limit=2,
            
        )

    trainer = Trainer(
        model=model,
        args=training_args,
        
        train_dataset=dataset,
        data_collator=data_collator,
        prediction_loss_only=True
    )
    trainer.train()
    trainer.save_model(outmodel)
    print('LM train done')
Example #3
File: build.py  Project: chan8616/PoAI
def build(config):

    tokenizer = RobertaTokenizerFast.from_pretrained(
                                        os.path.join(config.save_directory),
                                        max_len=config.max_length
                                        )

    model_config = RobertaConfig(
        vocab_size=config.vocab_size,
        max_position_embeddings=config.max_length,
        num_attention_heads=config.num_attention_heads,
        num_hidden_layers=config.num_hidden_layers,
        type_vocab_size=1
    )

    model = RobertaForMaskedLM(config=model_config)
    print("the number of parameters of model: ", model.num_parameters())

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=config.files,
        block_size=32
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=config.mlm_probability
    )

    training_args = TrainingArguments(
        output_dir=os.path.join(config.save_directory),
        overwrite_output_dir=config.overwrite_output_dir,
        num_train_epochs=config.num_train_epochs,
        per_gpu_train_batch_size=config.per_gpu_train_batch_size,
        save_steps=config.save_steps,
        save_total_limit=config.save_total_limit
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=config.prediction_loss_only
    )

    return trainer
Example #4
tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("For it is in reality vain to profess"))

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained(SAVE_MODEL, max_len=512)
model = RobertaForMaskedLM(config=config)

print(model.num_parameters())

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=PATH + "/kant.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)
training_args = TrainingArguments(
    output_dir=SAVE_MODEL,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
)  # remaining arguments truncated in this excerpt
Example #5
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained("./EsperBERTo", max_len=512)
model = RobertaForMaskedLM(config=config)
model.num_parameters()

from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.eo.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

from transformers import Trainer, TrainingArguments
Example #6
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("gostei muito dessa ideia".lower()).tokens)

# Model type
# --------------------------------------------------
config = RobertaConfig(
    vocab_size=vocab_size,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=8,
    type_vocab_size=1,
)
model = RobertaForMaskedLM(config=config)
print("Params: ", model.num_parameters())
tokenizer = RobertaTokenizerFast.from_pretrained("./BR_BERTo", max_len=512)

# Dataset load
# --------------------------------------------------
dataset = EsperantoDataset(
    tokenizer=tokenizer,
    file_path="./corpus.txt",
    length=corpus_length
)

# Start training
# --------------------------------------------------
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
Example #7
def main(argv):
    wandb.login()

    is_gpu = torch.cuda.is_available()

    config = RobertaConfig(
        vocab_size=FLAGS.vocab_size,
        max_position_embeddings=FLAGS.max_position_embeddings,
        num_attention_heads=FLAGS.num_attention_heads,
        num_hidden_layers=FLAGS.num_hidden_layers,
        type_vocab_size=FLAGS.type_vocab_size,
    )

    if FLAGS.tokenizer_path:
        tokenizer_path = FLAGS.tokenizer_path
    elif FLAGS.tokenizer_type.upper() == "BPE":
        tokenizer_path = FLAGS.output_tokenizer_dir
        if not os.path.isdir(tokenizer_path):
            os.makedirs(tokenizer_path)

        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(
            files=FLAGS.dataset_path,
            vocab_size=FLAGS.vocab_size,
            min_frequency=FLAGS.BPE_min_frequency,
            special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
        tokenizer.save_model(tokenizer_path)
    else:
        print("Please provide a tokenizer path if using the SMILES tokenizer")
        return  # cannot continue without a tokenizer path

    tokenizer = RobertaTokenizerFast.from_pretrained(
        tokenizer_path, max_len=FLAGS.max_tokenizer_len)

    model = RobertaForMaskedLM(config=config)
    model.num_parameters()

    dataset = RawTextDataset(tokenizer=tokenizer,
                             file_path=FLAGS.dataset_path,
                             block_size=FLAGS.tokenizer_block_size)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=FLAGS.mlm_probability)

    training_args = TrainingArguments(
        output_dir=FLAGS.output_dir,
        overwrite_output_dir=FLAGS.overwrite_output_dir,
        num_train_epochs=FLAGS.num_train_epochs,
        per_device_train_batch_size=FLAGS.per_device_train_batch_size,
        save_steps=FLAGS.save_steps,
        save_total_limit=FLAGS.save_total_limit,
        fp16=is_gpu,  # fp16 only works on CUDA devices
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model(FLAGS.model_name)
Example #8
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

#configs
config = RobertaConfig(
    vocab_size=52000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained("./latentbert", max_len=512)
model = RobertaForMaskedLM(config=config)

print('num params: {}'.format(model.num_parameters()))

#training dataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../results_file.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

print('before trainer')

#trainer
Example #9
def train(epoch, vocab_size, train_files_path, save_path, learning_rate,
          save_steps, per_gpu_train_batch_size, gradient_accumulation_steps):
    """
    从头训练一个语言模型RoBERTa.
    """
    from transformers import RobertaConfig

    config = RobertaConfig(  # tunable parameters
        vocab_size=vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=12,
        type_vocab_size=1,
    )

    # Do not train a new tokenizer; use BERT's tokenizer directly
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext",
                                              max_len=512)
    # In fact, hfl/chinese-roberta-wwm-ext/vocab.txt == chinese_L-12_H-768_A-12/vocab.txt  # vocabulary

    from transformers import RobertaForMaskedLM
    """
    源码:
    class RobertaForMaskedLM()
        def __init__(self, config):
            super().__init__(config)

            if not config.is_decoder:
                logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`")

            self.roberta = RobertaModel(config, add_pooling_layer=False)
            self.lm_head = RobertaLMHead(config)

            self.init_weights()

    可以发现,RobertaForMaskedLM包含了roberta主体模型(不含pooling层)和一个语言模型输出层
    """
    # Initialize the model
    model = RobertaForMaskedLM(config=config)

    print("参数数量: ", model.num_parameters())  # 1亿参数

    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_files_path,
        block_size=512,
    )

    from transformers import DataCollatorForLanguageModeling

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir=save_path,
        overwrite_output_dir=True,
        num_train_epochs=epoch,
        per_gpu_train_batch_size=per_gpu_train_batch_size,
        save_steps=save_steps,
        save_total_limit=2,
        gradient_accumulation_steps=gradient_accumulation_steps,  # effective batch size: 32 * 8 = 256
        learning_rate=learning_rate,
        weight_decay=0.01,
        adam_epsilon=1e-6,
        warmup_steps=10000,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()

    trainer.save_model(save_path)
Example #10
def main():
    # Main training script

    wandb.login()

    #verify file length
    fname = 'pubchem-10m.txt'

    def file_len(fname):
        with open(fname) as f:
            for i, l in enumerate(f):
                pass
        return i + 1

    print(file_len(fname))

    torch.cuda.is_available()  #checking if CUDA + Colab GPU works

    config = RobertaConfig(
        vocab_size=52_000,
        max_position_embeddings=512,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )

    tokenizer = RobertaTokenizerFast.from_pretrained(
        "seyonec/SMILES_tokenized_PubChem_shard00_160k", max_len=512)

    # test
    tokenizer.encode("[O-][N+](=O)c1cnc(s1)Sc1nnc(s1)N")

    model = RobertaForMaskedLM(config=config)
    model.num_parameters()

    dataset = RawTextDataset(tokenizer=tokenizer,
                             file_path="pubchem-10m.txt",
                             block_size=512)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir="PubChem_10M_SMILES_Tokenizer",
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_gpu_train_batch_size=64,
        save_steps=10_000,
        save_total_limit=2,
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model("PubChem_10M_SMILES_Tokenizer")