Example #1
def initialize_model():

    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print("device", device)

    '''
    Create dataloaders
    '''
    train_dataset = SplitReshapeTrainDataset(config['complex_sentences_file'], config['simple_sentences_file'])
    train_data, val_data = torch.utils.data.random_split(train_dataset, [round(config["train_data_percentage"] * len(train_dataset)), round(config["val_data_percentage"] * len(train_dataset))])

    train_dataloader = DataLoader(train_data, batch_size=config["batch_size"], num_workers=config["num_of_workers"], pin_memory=True)
    val_dataloader = DataLoader(val_data, batch_size=config["batch_size"], num_workers=config["num_of_workers"], pin_memory=True)

    '''
    create tokenizer
    '''
    tokenizer = ByteLevelBPETokenizer(
        "./data/english_tokenizer-vocab.json",
        "./data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )


    '''
    Create model
    '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'],
           vocab_size,
           vocab_size,
           config['src_pad_idx'],
           config['num_heads'],
           config['num_encoder_layers'],
           config['num_decoder_layers'],
           config['forward_expansion'],
           config['dropout'],
           config['max_len'],
           device)

    model.train()

    trainer = model.to(device)

    '''
    Create Optimizer
    '''
    loss_fun = nn.CrossEntropyLoss(ignore_index = config['src_pad_idx'])
    optimizer = optim.Adam(trainer.parameters(), lr = config["learning_rate"])
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10)

    writer = SummaryWriter()

    return config, train_dataloader, val_dataloader, trainer, loss_fun, optimizer, writer, device, scheduler, tokenizer
Example #2
def _fit_tokenizer(
    path_to_text_file: Union[str, List[str]],
    tokenizer: ByteLevelBPETokenizer,
    vocabulary_size: int,
) -> None:
    tokenizer.train(
        path_to_text_file,
        vocabulary_size,
        special_tokens=[EOD_TOKEN, PAD_TOKEN, SOS_TOKEN, UNK_TOKEN],
    )
Example #3
def initialize_model():

    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print("device", device)
    '''create tokenizers'''

    tokenizer = ByteLevelBPETokenizer(
        "data/english_tokenizer-vocab.json",
        "data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='[PAD]', length=config['max_len'])
    tokenizer.enable_truncation(max_length=config['max_len'])
    '''
    Create model
    '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'],
                             config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)
    checkpoint = torch.load(config['pretrained_model'], map_location=device)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    model = model.to(device)

    return config, model, tokenizer, device
Example #4
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 args,
                 file_path: str,
                 block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info(" Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = [
                line for line in f.read().splitlines()
                if (len(line) > 0 and not line.isspace())
            ]

        tokenizer = ByteLevelBPETokenizer(
            f"{args['tokenizer_name']}/vocab.json",
            f"{args['tokenizer_name']}/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        tokenizer.enable_truncation(max_length=block_size)
        self.examples = [t.ids for t in tokenizer.encode_batch(lines)]
Example #5
def test_language_model_dataset_fit_tokenizer_should_call_the_train_method_of_bpe_tokenizer(
):
    # Given
    language_modeling_dataset = LanguageModelingDataset(1, 1)
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train = MagicMock()
    language_modeling_dataset.set_tokenizer(tokenizer)

    # When
    language_modeling_dataset._fit_tokenizer(FAKE_PATH_FOR_TEST, tokenizer,
                                             300)

    # Then
    tokenizer.train.assert_called_with(
        FAKE_PATH_FOR_TEST,
        300,
        special_tokens=[EOD_TOKEN, PAD_TOKEN, SOS_TOKEN, UNK_TOKEN],
    )
Example #6
def create_norwegian_tokenizer():
    tokenizer = ByteLevelBPETokenizer(
        "./models/KariBERTa-tiny/vocab.json",
        "./models/KariBERTa-tiny/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    tokenizer.enable_padding()
    return tokenizer
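
A minimal usage sketch for the tokenizer returned above, assuming the KariBERTa-tiny vocab.json and merges.txt actually exist at those paths; enable_padding() without a fixed length pads each batch to its longest sequence, and enable_truncation() caps everything at 512 tokens.

tokenizer = create_norwegian_tokenizer()

# encode_batch() pads every encoding in the batch to the longest member
# and wraps each one in <s> ... </s> via the BertProcessing post-processor.
encodings = tokenizer.encode_batch(["Jeg heter Kari.", "Dette er en litt lengre setning."])
for enc in encodings:
    print(enc.tokens)          # byte-level BPE tokens
    print(enc.ids)             # integer ids
    print(enc.attention_mask)  # 1 for real tokens, 0 for padding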
Example #7
    def pretrain_tokenization(self):
        paths = [str(x) for x in Path("handler/datadir/").glob("*-train.txt")]
        print(paths)
        tokenizer = ByteLevelBPETokenizer()

        tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

        tokenizer.save(".", "danbert-small")
Example #8
def main():
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=get_file(),
                    vocab_size=config.VOCAB_SIZE,
                    min_frequency=config.MIN_FREQUENCY,
                    special_tokens=config.SPECIAL_TOKENS)

    tokenizer.save_model(config.TOKENIZER_PATH)
Example #9
def create_token_masker(tokenizer: ByteLevelBPETokenizer):
    special_tokens = [
        START_TOKEN, PAD_TOKEN, STOP_TOKEN, UNKNOWN_TOKEN, MASK_TOKEN
    ]
    special_token_ids = {
        tokenizer.token_to_id(token)
        for token in special_tokens
    }

    def get_special_tokens_mask(token_ids: torch.Tensor):
        return [
            1 if token_id in special_token_ids else 0 for token_id in token_ids
        ]

    return get_special_tokens_mask
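
A hedged sketch of how the returned mask function might feed MLM masking (mirroring what DataCollatorForLanguageModeling does); the tokenizer paths and the sample sentence below are placeholders, and the tokenizer is assumed to have been trained with the special tokens listed above.

import torch
from tokenizers.implementations import ByteLevelBPETokenizer

# Hypothetical paths; any byte-level BPE tokenizer trained with the special tokens above will do.
tokenizer = ByteLevelBPETokenizer("vocab.json", "merges.txt")
get_special_tokens_mask = create_token_masker(tokenizer)

# Encode a sentence and mark the positions that must never be corrupted.
token_ids = tokenizer.encode("a short example sentence").ids  # plain list of ints
special_mask = torch.tensor(get_special_tokens_mask(token_ids), dtype=torch.bool)

# Sample ~15% of the non-special positions as MLM masking candidates.
probability_matrix = torch.full((len(token_ids),), 0.15)
probability_matrix.masked_fill_(special_mask, value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()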
Example #10
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./esperberto-vocab.json",
            './esperberto-merges.txt',
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        self.examples = []

        src_files = Path("./")
Example #11
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./roberta-lm/vocab.json",
            "./roberta-lm/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/montecristo/").glob("**/*.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
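
As the comment notes, the same vocab.json/merges.txt can be loaded through transformers instead; a rough equivalent with RobertaTokenizerFast, assuming those files live in ./roberta-lm/, might look like this:

from transformers import RobertaTokenizerFast

roberta_tokenizer = RobertaTokenizerFast.from_pretrained("./roberta-lm", model_max_length=512)

# RobertaTokenizerFast adds <s> ... </s> and truncates on its own.
batch = roberta_tokenizer(
    ["An example line from the corpus."],
    truncation=True,
)
print(batch["input_ids"])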
Example #12
    def __init__(self, file_path: str = None, tokenizer_path: str = None):
        tokenizer = ByteLevelBPETokenizer(
            tokenizer_path + "/vocab.json",
            tokenizer_path + "/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        self.examples = []

        with open(file_path, encoding="utf-8") as f:
            lines = f.readlines()
            lines = [
                line for line in lines
                if (len(line) > 0 and not line.isspace())
            ]
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
Example #13
def train_tok(txt_dir, tokenizer_dir):
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(files=txt_dir,
                    vocab_size=52_000,
                    min_frequency=2,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    tokenizer.save_model(tokenizer_dir)
Example #14
    def __init__(self, evaluate=False):
        tokenizer = ByteLevelBPETokenizer(
            "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-vocab.json",
            "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []
        if evaluate:
            src_files = ["/home/zheng/sde/data/valid.txt"]
        else:
            src_files = ["/home/zheng/sde/data/test.txt"]

        for src_file in src_files:
            print(src_file)
            with open(src_file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
Example #15
import os
from tokenizers.implementations import ByteLevelBPETokenizer
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import BertForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

PATH = os.getcwd()
SAVE_MODEL = os.getcwd()

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="kant.txt",
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.save_model(SAVE_MODEL)
tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)

tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("For it is in reality vain to profess"))

config = BertConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
Example #16
from tokenizers.implementations import ByteLevelBPETokenizer
from transformers import LineByLineTextDataset

# config = RobertaConfig(
#     vocab_size=52_000,
#     max_position_embeddings=514,
#     num_attention_heads=12,
#     num_hidden_layers=6,
#     type_vocab_size=1,
# )

tokenizer = ByteLevelBPETokenizer(
    "./models/nepali_BERT_tokenizer_L-vocab.json",
    "./models/nepali_BERT_tokenizer_L-merges.txt",
)

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=
    "../Data/nepali_corpus/sanitized_data/sentences/master_sentence_list.txt",  #this path is from step 1
    block_size=128,
)

#Need this util to collate batches and apply random MLM masking
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

#Set up training arguments
Example #17
                               input_path_val,
                               output_path,
                               vocab_size=30_000,
                               min_freq=2,
                               max_len=256,
                               block_size=64,
                               mlm_probability=0.15,
                               num_attention_heads=6,
                               num_hidden_layers=3,
                               epochs=5,
                               batch_size=30,
                               val_batch_size=60,
                               eval_steps=50,
                               **kwargs):
 # instantiate tokenizer
 bpe_tokenizer = ByteLevelBPETokenizer()
 # train tokenizer
 _pretty_print("Training tokenizer")
 bpe_tokenizer.train([input_path, input_path_val],
                     vocab_size=vocab_size,
                     min_frequency=min_freq,
                     special_tokens=[
                         "<s>",
                         "<pad>",
                         "</s>",
                         "<unk>",
                         "<mask>",
                     ])
 # save tokenizer
 tok_path = os.path.join(output_path, "tokenizer")
 os.makedirs(tok_path, exist_ok=True)
Example #18
from tokenizers.implementations import ByteLevelBPETokenizer

from language_model.tokenization.trainer import ByteLevelBPETokenizerTrainer

task = ByteLevelBPETokenizerTrainer(
    source_folder_path="data/ukr/data/wiki_oscar_data/",
    tokenizer=ByteLevelBPETokenizer(),
    vocab_size=52000,
    min_frequency=5,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
Example #19
    with open('data/start.txt', 'r') as myfile:
        data = myfile.read()

    segmenter.segment('I am Batman i live in gotham')

# =============================================================================
# Huggingface tokenizer
# =============================================================================

if False:
    from tokenizers.implementations import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing
    from pathlib import Path

    tokenizer = ByteLevelBPETokenizer(
        "data/german_old.json",
        "data/german_old.txt",
    )

    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    tokenizer.enable_truncation(max_length=512)

    #print(tokenizer.encode(sen_out[0]))
Example #20
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

import torch

tokenizer = ByteLevelBPETokenizer(
    "./EsperBERTo/vocab.json",
    "./EsperBERTo/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

from transformers_weighted_head.src.transformers import RobertaConfig

WEIGHT_HEADS = True

config = RobertaConfig(
    weight_heads=WEIGHT_HEADS,
    vocab_size=52_000,
    max_position_embeddings=514,
    hidden_size=480,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

from transformers_weighted_head.src.transformers import RobertaTokenizerFast
Example #21
# -*- coding:utf-8 -*-
import os
from argparse import ArgumentParser
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing, RobertaProcessing

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--token_path",
                        type=str,
                        nargs='?',
                        required=True,
                        help="")
    args = parser.parse_args()

    inputpath = args.token_path
    tokenizer = ByteLevelBPETokenizer(os.path.join(inputpath, "vocab.json"),
                                      os.path.join(inputpath, "merges.txt"),
                                      add_prefix_space=True,
                                      trim_offsets=True,
                                      lowercase=True,
                                      unicode_normalizer="nfkc")
    tokenizer._tokenizer.post_processor = RobertaProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
        trim_offsets=True,
        add_prefix_space=True)
    tokenizer.enable_truncation(max_length=512)
    tokens = tokenizer.encode("I am Julien\nI am from China.").tokens
    print([x.encode('utf-8') for x in tokens])
Example #22
                    default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--output-data-dir',
                    type=str,
                    default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--data-dir',
                    type=str,
                    default=os.environ['SM_CHANNEL_TRAINING'])

args = parser.parse_args()

paths = [str(x) for x in Path(args.data_dir).glob("**/*.txt")]
print("data files")
print(paths)

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Need to save it to model dir for inference
tokenizer.save(args.model_dir)

tokenizer = ByteLevelBPETokenizer(os.path.join(args.model_dir, "vocab.json"),
                                  os.path.join(args.model_dir, "merges.txt"))

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
Example #23
def main():
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("--path_to_train_data",
                                 type=str,
                                 required=True)
    argument_parser.add_argument("--path_to_eval_data",
                                 type=str,
                                 required=False,
                                 default=None)
    argument_parser.add_argument("--n_epochs",
                                 type=int,
                                 required=False,
                                 default=3)
    argument_parser.add_argument("--batch_size",
                                 type=int,
                                 required=False,
                                 default=32)
    argument_parser.add_argument("--bptt",
                                 type=int,
                                 required=False,
                                 default=64)
    argument_parser.add_argument("--lr",
                                 type=float,
                                 required=False,
                                 default=0.0001)
    argument_parser.add_argument("--vocabulary_size",
                                 type=int,
                                 required=False,
                                 default=20000)
    argument_parser.add_argument("--embedding_dimension",
                                 type=int,
                                 required=False,
                                 default=300)
    argument_parser.add_argument("--hidden_units_for_lstm",
                                 type=int,
                                 required=False,
                                 default=256)
    argument_parser.add_argument("--num_of_lstm_layer",
                                 type=int,
                                 required=False,
                                 default=1)
    argument_parser.add_argument("--n_decoder_blocks",
                                 type=int,
                                 required=False,
                                 default=5)

    arguments = argument_parser.parse_args()

    train_language_modeling_dataset = LanguageModelingDataset(
        arguments.batch_size, arguments.bptt)
    train_language_modeling_dataset.set_tokenizer(ByteLevelBPETokenizer())
    train_language_modeling_dataset.fit(
        arguments.path_to_train_data,
        vocabulary_size=arguments.vocabulary_size)

    train_language_modeling_dataloader = LanguageModelingDataLoader(
        arguments.bptt,
        train_language_modeling_dataset.transform(arguments.path_to_train_data,
                                                  return_target=True),
    )

    model = LSTMModel(
        arguments.vocabulary_size,
        arguments.embedding_dimension,
        arguments.hidden_units_for_lstm,
        arguments.n_decoder_blocks,
        arguments.num_of_lstm_layer,
    )

    logger = TensorboardLogger()
    trainer = Trainer(arguments.batch_size)
    trainer.set_logger(logger)

    if arguments.path_to_eval_data:
        eval_language_modeling_dataloader = LanguageModelingDataLoader(
            arguments.bptt,
            train_language_modeling_dataset.transform(
                arguments.path_to_eval_data, return_target=True),
        )

        trainer.train(
            model,
            train_language_modeling_dataloader,
            CrossEntropyLoss(),
            Adam(model.parameters(), arguments.lr),
            eval_language_modeling_dataloader,
            arguments.n_epochs,
        )

    else:
        trainer.train(
            model,
            train_language_modeling_dataloader,
            CrossEntropyLoss(),
            Adam(model.parameters(), arguments.lr),
            None,
            arguments.n_epochs,
        )

    logger.log_params(vars(arguments), trainer.losses)
    saver = Saver(logger.log_dir())
    saver.save_preprocessor_and_model(train_language_modeling_dataset, model)
Example #24
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./bert-tokenizer/vocab.json",
    "./bert-tokenizer/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

### Testing

tokenizer.encode('TOBB ETU NLP&IR team')
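
The call above returns an Encoding object; a quick way to inspect it with the same tokenizer:

encoding = tokenizer.encode('TOBB ETU NLP&IR team')
print(encoding.tokens)   # byte-level BPE tokens, wrapped in <s> ... </s> by BertProcessing
print(encoding.ids)      # integer token ids
print(encoding.offsets)  # (start, end) character offsets into the original string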

Example #25
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import os.path
import sys
ROOT_DIRECTORY = os.path.join(os.path.dirname(__file__), '..')
sys.path.append(ROOT_DIRECTORY)

tokenizer = ByteLevelBPETokenizer(
    os.path.join(ROOT_DIRECTORY, "models/en_cycl_tokenizer/vocab.json"),
    os.path.join(ROOT_DIRECTORY, "models/en_cycl_tokenizer/merges.txt"),
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
token_to_encode = "(() ((#$isa #$McCoyTyner-Musician #$Individual)))"
encoded_token = tokenizer.encode(token_to_encode)
print(encoded_token)
print(encoded_token.tokens)

token_to_encode = "Pair of scissors is marketed as office product."
encoded_token = tokenizer.encode(token_to_encode)
print(encoded_token)
print(encoded_token.tokens)
Example #26
        return example


# Check that PyTorch sees it
print("CUDA:", torch.cuda.is_available())
corpus_length = 6_993_330  # run `wc -l` on the corpus to get the line count
vocab_size = 150_000

# Dataset files
# --------------------------------------------------
paths = [str(x) for x in Path("./").glob("**/corpus.txt")]

# Byte Level Tokenize
# --------------------------------------------------
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()
# Customize training
tokenizer.train(files=paths, vocab_size=vocab_size, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
# Save files to disk
tokenizer.save_model("BR_BERTo")
# Test
tokenizer = ByteLevelBPETokenizer(
    "./BR_BERTo/vocab.json",
    "./BR_BERTo/merges.txt",
)
Example #27
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

path = "./model"

tokenizer = ByteLevelBPETokenizer(
    "{path}/vocab.json".format(path=path),
    "{path}/merges.txt".format(path=path),
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

tokenizer.enable_truncation(max_length=512)

tokenizer.encode("Dette er første testen.")

tokens = tokenizer.encode("Dette er første testen.").tokens

print(tokens)