Example #1
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# get_config, SplitReshapeTrainDataset and TransformerModel are project-local
# modules imported from elsewhere in this repository.


def initialize_model():

    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print("device", device)

    '''
    Create dataloaders
    '''
    train_dataset = SplitReshapeTrainDataset(config['complex_sentences_file'],
                                             config['simple_sentences_file'])
    # Derive the validation size from the remainder so the two split sizes
    # always sum to len(train_dataset); rounding both percentages separately
    # can make random_split raise a ValueError for some dataset sizes.
    train_size = round(config["train_data_percentage"] * len(train_dataset))
    train_data, val_data = torch.utils.data.random_split(
        train_dataset, [train_size, len(train_dataset) - train_size])

    train_dataloader = DataLoader(train_data, batch_size=config["batch_size"], num_workers=config["num_of_workers"], pin_memory=True)
    val_dataloader = DataLoader(val_data, batch_size=config["batch_size"], num_workers=config["num_of_workers"], pin_memory=True)

    '''
    create tokenizer
    '''
    tokenizer = ByteLevelBPETokenizer(
        "./data/english_tokenizer-vocab.json",
        "./data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )


    '''
    Create model
    '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'],
                             config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)

    model.train()

    trainer = model.to(device)

    '''
    Create Optimizer
    '''
    loss_fun = nn.CrossEntropyLoss(ignore_index = config['src_pad_idx'])
    optimizer = optim.Adam(trainer.parameters(), lr = config["learning_rate"])
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10)

    writer = SummaryWriter()

    return config, train_dataloader, val_dataloader, trainer, loss_fun, optimizer, writer, device, scheduler, tokenizer
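
As a usage illustration, here is a minimal sketch of one training epoch driven by the objects returned above. It assumes the dataloader yields (complex, simple) batches already converted to token-id tensors of shape (seq_len, batch) and that TransformerModel.forward takes (src, trg) and returns logits of shape (trg_len, batch, vocab); none of this is shown in the snippet, so treat the names and shapes as illustrative only.

config, train_dataloader, val_dataloader, trainer, loss_fun, optimizer, \
    writer, device, scheduler, tokenizer = initialize_model()

for step, (src, trg) in enumerate(train_dataloader):
    src, trg = src.to(device), trg.to(device)

    # Teacher forcing: feed the target without its last token, predict the
    # target shifted by one position.
    output = trainer(src, trg[:-1, :])
    output = output.reshape(-1, output.shape[2])
    target = trg[1:, :].reshape(-1)

    optimizer.zero_grad()
    loss = loss_fun(output, target)
    loss.backward()
    optimizer.step()

    writer.add_scalar("train/loss", loss.item(), step)

# CosineAnnealingLR is typically stepped once per epoch.
scheduler.step()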
Example #2
def initialize_model():

    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print("device", device)
    '''Create tokenizer'''

    tokenizer = ByteLevelBPETokenizer(
        "data/english_tokenizer-vocab.json",
        "data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='[PAD]', length=config['max_len'])
    tokenizer.enable_truncation(max_length=config['max_len'])
    '''
    Create model
    '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'],
                             config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)
    checkpoint = torch.load(config['pretrained_model'], map_location=device)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    model = model.to(device)

    return config, model, tokenizer, device
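
A short sketch of how the objects from Example #2 could be used for inference. Because padding and truncation are enabled to config['max_len'], tokenizer.encode returns fixed-length ids. The decoding loop depends on TransformerModel's forward signature, which the snippet does not show, so the greedy loop and tensor shapes below are only illustrative.

config, model, tokenizer, device = initialize_model()

sentence = "The quick brown fox jumped over the lazy dog and then ran away."
enc = tokenizer.encode(sentence)                  # tokenizers Encoding (.ids, .tokens)
src = torch.tensor([enc.ids], device=device)      # shape (1, max_len)

bos_id = tokenizer.token_to_id("<s>")
eos_id = tokenizer.token_to_id("</s>")

# Hypothetical greedy decoding loop; adapt to the model's actual interface.
trg_ids = [bos_id]
with torch.no_grad():
    for _ in range(config['max_len']):
        trg = torch.tensor([trg_ids], device=device)
        logits = model(src.T, trg.T)              # assumes (seq_len, batch) inputs
        next_id = int(logits[-1, 0].argmax())
        trg_ids.append(next_id)
        if next_id == eos_id:
            break

print(tokenizer.decode(trg_ids))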
Example #3
    # NOTE: the opening of this snippet (its enclosing function signature and
    # the start of the training-split construction) was cut off. The names
    # dataset_gen_train and input_path_train are inferred from the parallel
    # validation block below and should be treated as hypothetical.
    dataset_gen_train = LineByLineTextDataset(
        tokenizer=bpe_tokenizer,
        file_path=input_path_train,
        block_size=block_size
    )
    dataset_gen_val = LineByLineTextDataset(
        tokenizer=bpe_tokenizer,
        file_path=input_path_val,
        block_size=block_size
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=bpe_tokenizer, mlm=True, mlm_probability=mlm_probability
    )

    # create model
    config = LongformerConfig(
        attention_window=attention_window,
        sep_token_id=bpe_tokenizer.get_vocab()["</s>"],
        pad_token_id=bpe_tokenizer.get_vocab()["<pad>"],
        bos_token_id=bpe_tokenizer.get_vocab()["<s>"], 
        eos_token_id=bpe_tokenizer.get_vocab()["</s>"],
        vocab_size=bpe_tokenizer.vocab_size,
        max_position_embeddings=max_len+10,
        num_attention_heads=num_attention_heads,
        num_hidden_layers=num_hidden_layers,
        type_vocab_size=1
    )
    
    model = LongformerForMaskedLM(config=config)

    _pretty_print(f"Number of model parameters : {model.num_parameters():,}")

    model_path = os.path.join(output_path, "lm")
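
The fragment stops right after the model and output path are set up. A typical continuation, not taken from the original code, wires the collator, the two LineByLineTextDataset splits (dataset_gen_train is the name inferred above) and the LongformerForMaskedLM model into a transformers Trainer; the argument values are placeholders.

    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir=model_path,
        overwrite_output_dir=True,
        num_train_epochs=3,               # placeholder values, tune as needed
        per_device_train_batch_size=8,
        save_steps=500,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset_gen_train,
        eval_dataset=dataset_gen_val,
    )

    trainer.train()
    trainer.save_model(model_path)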