Example No. 1
def _create_train_files_and_regenerate_vocab():
    print("pass")
    r = run("split -l1000000 train.txt --verbose")
    if r.ok:
        print("Train splits generated")
    if r.ok:
        try:
            shutil.rmtree("td")
        except FileNotFoundError:
            pass
        os.mkdir("td")
        r = run(
            "mv xaa td/xaa.txt && mv xab td/xab.txt && mv xac td/xac.txt && mv xad td/xad.txt && mv xae td/xae.txt && mv xaf td/xaf.txt"
        )
        if r.ok:
            paths = [str(x) for x in Path(".").glob("td/*.txt")]
            tokenizer = ByteLevelBPETokenizer()

            # Customize training
            tokenizer.train(
                files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
            try:
                shutil.rmtree("codeBERT")
            except FileNotFoundError:
                pass
            os.mkdir("codeBERT")
            tokenizer.save("codeBERT")
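In older tokenizers releases, tokenizer.save("codeBERT") writes vocab.json and merges.txt into that directory. A minimal reload sketch following the restore pattern used in later examples on this page (the BertProcessing post-processor and the sample string are assumptions, not part of the snippet above):

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Reload the trained BPE from the saved vocabulary and merge rules.
tokenizer = ByteLevelBPETokenizer(
    "codeBERT/vocab.json",
    "codeBERT/merges.txt",
)
# Add RoBERTa-style <s> ... </s> wrapping and cap the sequence length.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("def add(a, b): return a + b").tokens)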
Example No. 2
def create_tokenizer(args):

    # Directory for storing
    directory = args.store_files

    # Train the tokenizer
    # paths = [str(x) for x in Path("./eo_data/").glob("**/*.txt")]
    paths = [args.file]

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(files=paths, vocab_size=args.vocab_size, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    # Save files to disk
    tokenizer.save(args.store_files)

    tokenizer_config = {
        "max_len": 512
    }

    with open("{}/tokenizer_config.json".format(args.store_files), 'w') as fp:
        json.dump(tokenizer_config, fp)
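A hedged sketch of consuming the saved directory from transformers, assuming the old-style save produced vocab.json and merges.txt next to the tokenizer_config.json written above; RobertaTokenizerFast and the example path are assumptions, not part of the original script:

from transformers import RobertaTokenizerFast

# Load vocab.json, merges.txt and tokenizer_config.json from the store_files directory.
tokenizer = RobertaTokenizerFast.from_pretrained("path/to/store_files")
print(tokenizer("Hello world")["input_ids"])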
Example No. 3
def build_tokenizer(data_path, save_path):
    r"""
        Creates a tokenizer for the Bert Model based on the given data corpus

        Args:
            data_path (:obj:`str`):
            	Path to the data corpus
            save_path (:obj:`str`):
				Path where the custom tokenizer should be saved
    """

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()
    # Customize training
    tokenizer.train(files=data_path,
                    vocab_size=52000,
                    min_frequency=2,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])
    tokenizer.save(save_path)
Example No. 4
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = False,
) -> None:
    """
    Trains a tokenizer on the given text(s), wrapping the tokenizers package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    """

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."

    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size - len(added_tokens),
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
    )

    tokenizer.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = "the current directory" if save_path == "" else save_path
    if serialize:
        logger.info(f"Saving {PREFIX}.tokenizer.json to {save_path_str}. " +
                    "You will need this file to build the GPT2Tokenizer.")
        tokenizer.save(f"{PREFIX}.tokenizer.json")
    else:
        logger.info(
            f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. "
            + "You will need both files to build the GPT2Tokenizer.")
        tokenizer.save_model(save_path, PREFIX)
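As the docstring notes, the non-serialized branch leaves aitextgen-vocab.json and aitextgen-merges.txt behind, and those two files can be fed to a GPT-2 tokenizer. A sketch under that assumption (mirroring the commented-out GPT2TokenizerFast construction in a later example; the special-token mapping reuses the defaults above):

from transformers import GPT2TokenizerFast

# Rebuild a transformers tokenizer from the files written by save_model().
gpt2_tokenizer = GPT2TokenizerFast(
    vocab_file="aitextgen-vocab.json",
    merges_file="aitextgen-merges.txt",
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    unk_token="<|endoftext|>",
)
print(gpt2_tokenizer.encode("hello world"))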
Example No. 5
    def train_tokenizer(self,
                        train_files,
                        tokenizer_name=None,
                        output_dir=None,
                        use_trained_tokenizer=True):
        """
        Train a new tokenizer on `train_files`.

        Args:

        - train_files: List of files to be used when training the tokenizer.

        - tokenizer_name: Name of a pretrained tokenizer or a path to a directory containing a tokenizer.

        - output_dir (optional): The directory where model files will be saved. If not given, self.args['output_dir']
        will be used.

        - use_trained_tokenizer (optional): Load the trained tokenizer once training completes.

        Returns: None
        """

        if not isinstance(train_files, list):
            train_files = [train_files]

        if not output_dir:
            output_dir = self.args["output_dir"]

        tokenizer = ByteLevelBPETokenizer()

        tokenizer.train(
            files=train_files,
            vocab_size=self.args["vocab_size"],
            min_frequency=self.args["min_frequency"],
            special_tokens=self.args["special_tokens"],
        )

        os.makedirs(output_dir, exist_ok=True)

        tokenizer.save(output_dir)
        logger.info(" Training of {} tokenizer complete. Saved to {}.".format(
            tokenizer_name, output_dir))

        _, _, tokenizer_class = MODEL_CLASSES[self.args["model_type"]]
        tokenizer = tokenizer_class.from_pretrained(output_dir)

        if use_trained_tokenizer:
            self.tokenizer = tokenizer
            self.args["tokenizer_name"] = output_dir

            try:
                model_to_resize = self.model.module if hasattr(
                    self.model, "module") else self.model
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
            except AttributeError:
                pass
Example No. 6
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    prefix: str = "aitextgen",
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = True,
    trim_offsets: bool = True,
) -> None:
    """
    Trains a tokenizer on the given text(s), wrapping the tokenizers package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param prefix: File name prefix of the final tokenizer
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    """

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."

    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout,
                                      trim_offsets=trim_offsets)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token] + added_tokens,
    )

    if serialize:
        tokenizer.save(f"{prefix}.tokenizer.json")
    else:
        tokenizer.save_model(save_path, prefix)
Example No. 7
def save_sentense_piece_model():
    paths = [str(x) for x in Path("./data/").glob("**/*.txt")]
    print(paths)

    special_token = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=paths,
                    vocab_size=32000,
                    min_frequency=2,
                    special_tokens=special_token)
    tokenizer.save(".", "ko")
Example No. 8
File: spm.py Project: liuqiskan/nlp
def save_sentense_piece_model():
    ko_paths = ['./data/korean-english-park.dev.ko', './data/korean-english-park.train.ko']
    en_paths = ['./data/korean-english-park.dev.en', './data/korean-english-park.train.en']

    special_token = ["<pad>", "<bos>", "<eos>", "<unk>", "<mask>"]
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=ko_paths, vocab_size=32000, min_frequency=2, special_tokens=special_token)
    tokenizer.save("./create_spm", "ko")

    tokenizer.train(files=en_paths, vocab_size=32000, min_frequency=2, special_tokens=special_token)
    tokenizer.save("./create_spm", "en")
Example No. 9
def tokenize(filename, vocab_size):

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=filename,
                    vocab_size=vocab_size,
                    min_frequency=2,
                    special_tokens=['<|endoftext|>'])
    # '<bos>', '<eos>', '<unk>', '<pad>', '<mask>'])
    tokenizer.save(corpus)

    return tokenizer
Example No. 10
def main(args):
    paths = [path for path in args.input.split(":")]

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
    )

    # Save files to disk
    tokenizer.save("{}.json".format(args.name), pretty=True)

    tok_spec = json.loads(tokenizer.to_str())
    with open("{}-vocab.json".format(args.name), "w") as fp:
        json.dump(tok_spec["model"]["vocab"], fp, indent=4)
    with open("{}-merges.txt".format(args.name), "w") as fp:
        fp.write("\n".join(tok_spec["model"]["merges"]))
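The serialized "{name}.json" above can be restored directly with Tokenizer.from_file, the same call a later example uses. A minimal sketch, with "mybpe" standing in for args.name (an assumption):

from tokenizers import Tokenizer

# Reload the fully serialized tokenizer (model, pre-tokenizer, special tokens).
tok = Tokenizer.from_file("mybpe.json")
ids = tok.encode("Hello world").ids
print(ids, tok.decode(ids))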
Example No. 11
def get_french_vocab(model_name):
    root = Path(os.getcwd()).parent.parent.parent
    french_corpus = "Datasets/corpora/fr/text"
    fr_corpus_path = os.path.join(root, french_corpus)
    files = []
    for dir_ in os.listdir(fr_corpus_path):
        fr_corpus_dir = os.path.join(fr_corpus_path, dir_)
        for text_file in os.listdir(fr_corpus_dir):
            text_file = os.path.join(fr_corpus_dir, text_file)
            files.append(text_file)

    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files,
                    vocab_size=20000,
                    min_frequency=2,
                    show_progress=True,
                    special_tokens=["<sos>", "<pad>", "<eos>", "<unk>"])

    print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
    tokenizer.save(model_name)
Example No. 12
    else:
        print(files)

    gpt2_tok.train(
        files=files,
        vocab_size=args.vocab_size,
        show_progress=True,
        special_tokens=["<|endoftext|>", "<s>", "<pad>", "</s>"],
    )

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)


    gpt2_tok.save(
            os.path.join(args.output_dir,"tokenizer.json"), pretty=True
        )  # FIX Access is denied. (os error 5)
    gpt2_tok.save_model(args.output_dir, args.output_file_name)

    # tokenizer = GPT2TokenizerFast(
    #     vocab_file=os.path.join(args.output_dir, args.output_file_name) + "-vocab.json",
    #     merges_file=os.path.join(args.output_dir, args.output_file_name)
    #     + "-merges.txt",
    #     add_prefix_space=True,
    # )

    # tokenizer.add_special_tokens(
    #     {
    #         "eos_token": "<|endoftext|>",
    #         "bos_token": "<|endoftext|>",
    #         "unk_token": "<|endoftext|>",
Example No. 13
for file_path in tqdm.tqdm(
        glob.glob("../../data/txts/*.txt") +
        glob.glob("../../data/txts2/*.txt")):
    paper_name = file_path.split("/")[-1].replace(".pdf.txt", "")
    if os.path.exists(f"../../data/pre_abstract_txts/{paper_name}.txt"):
        continue

    with open(file_path) as file:
        text = file.read()

    abstract = None
    mention_count = len(abstract_re.findall(papers[paper_name]["title"]))
    if mention_count > 0:
        mentions = list(abstract_re.finditer(text))
        if len(mentions) >= mention_count:
            abstract = mentions[mention_count]
    else:
        abstract = abstract_re.search(text)

    if abstract is None:
        continue

    with open(f"../../data/pre_abstract_txts/{paper_name}.txt", "w") as file:
        file.write(text[:abstract.start()])

files = glob.glob("../../data/pre_abstract_txts/*.txt")
tokenizer = ByteLevelBPETokenizer(lowercase=True)
tokenizer.train(files, vocab_size=2500, special_tokens=["[PAD]"])
tokenizer.save("tokenizer")
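Since "[PAD]" is the only special token here, batched encodings can be aligned with the built-in padding support. A hedged sketch, assuming the tokenizer object trained above is still in scope:

# Look up the id that [PAD] received during training and enable padding with it.
pad_id = tokenizer.token_to_id("[PAD]")
tokenizer.enable_padding(pad_id=pad_id, pad_token="[PAD]")

batch = tokenizer.encode_batch(["first abstract line", "a second, longer abstract line"])
print([encoding.ids for encoding in batch])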
Example No. 14
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
from tokenizers import ByteLevelBPETokenizer

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

vocab_size = 50265
path = 'data/src-train.txt'

# Customize training
tokenizer.train(files=path,
                vocab_size=vocab_size,
                min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Save files to disk
directory = "models/roberta"

if not os.path.exists(directory):
    os.makedirs(directory)

tokenizer.save(directory)
Example No. 15
from tokenizers import ByteLevelBPETokenizer

path = "roberta_test/train.txt" #plwiki

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=path,
                vocab_size=50265,
                min_frequency=5,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])



tokenizer.save("roberta_test/tokenizer.json")

import json
config = {
	"architectures": [
		"RobertaForMaskedLM"
	],
	"attention_probs_dropout_prob": 0.1,
	"hidden_act": "gelu",
	"hidden_dropout_prob": 0.1,
	"hidden_size": 768,
	"initializer_range": 0.02,
	"intermediate_size": 3072,
	"layer_norm_eps": 1e-05,
	"max_position_embeddings": 514,
	"model_type": "roberta",
Example No. 16
    alphabet = pre_tokenizers.ByteLevel.alphabet()  # 256 chars

logger.info(
    'Initial alphabet for ByteLevel BPE as defined in pre_tokenizers.ByteLevel.alphabet(): %s',
    alphabet)
# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
)

# Save the files
tokenizer.save(args.out, args.name)

# Restoring model from learned vocab/merges
tokenizer = ByteLevelBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
    add_prefix_space=True,
)

# Test encoding
logger.info(
    'Tokens and their ids from ByteLevelBPETokenizer with GFP protein sequence: \n MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT'
)
encoded = tokenizer.encode('MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT',
                           pad_to_max_length=True)
logger.info(encoded.tokens)
Example No. 17
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

clean_wiki_text = "/home/rohola/codes/persian_transformer/clean_wiki_text_txt"
paths = [str(x) for x in Path(clean_wiki_text).glob("**/*")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# Save files to disk
tokenizer.save("models/faberto", "faberto")
Example No. 18
    parser.add_argument('--train_path', type=str)
    parser.add_argument('--n_files', type=int)
    parser.add_argument('--save_path', type=str)
    parser.add_argument('--vocab_size', type=int)
    parser.add_argument('--control_codes', nargs='+',
                        default=['<|endoftext|>'])

    args = parser.parse_args()

    if os.path.isdir(args.train_path):
        paths = glob.glob(os.path.join(args.train_path, '*'))
    else:
        paths = [args.train_path]

    paths = paths[:args.n_files]

    tok = ByteLevelBPETokenizer()

    tok.train(files=paths, vocab_size=args.vocab_size,
              special_tokens=args.control_codes)

    tok.save(args.save_path)

    tokenizer_config = {
        "max_len": 1024
    }

    with open(os.path.join(args.save_path, "tokenizer_config.json"), 'w') as fp:
        json.dump(tokenizer_config, fp)
Example No. 19
for (_, _, f) in walk(labeledDataFolder + "/legitimate_htmls"):
    files.extend(
        [labeledDataFolder + "/legitimate_htmls/" + file for file in f])
for (_, _, f) in walk(labeledDataFolder + "/phishing_htmls"):
    files.extend([labeledDataFolder + "/phishing_htmls/" + file for file in f])
print("Total number of html files: %d\n" % len(files))

# Writing data, one html file per line. This is the format the tokenizer expects
print("Writing html data into a single file...")
output = open("tokenizer/htmlCodePerLine.txt", "w")
count = 0
for file in files:
    count = count + 1
    print("Files processed: %d, Total files: %d" % (count, len(files)))
    fileData = io.open(file, "r", errors="ignore").readlines()
    fileData = ''.join(str(line) for line in fileData)
    fileData = fileData.replace("\n", " ")
    output.write(fileData + "\n")
output.close()

# Starting tokenization
print("\nStarting tokenization with BPE")
tokenizer = ByteLevelBPETokenizer()
tokenizer.train("tokenizer/htmlCodePerLine.txt",
                min_frequency=minFrequency,
                vocab_size=vocabSize)
print(
    "Vocabulary size is: %d\nNOTE: Sometimes, the vocab size might not be equal to the input 'vocab_size'\n"
    % (tokenizer.get_vocab_size()))
tokenizer.save("tokenizer", "tokenizer.tok")
print("Tokenizer files have been saved in 'tokenizer' directory...")
Example No. 20
from transformers import GPT2Tokenizer

if __name__ == '__main__':
    # # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    HOME = os.environ['HOME']
    data_file = HOME + '/data/wikitext-103-raw/wiki.train.raw'
    # data_file ='/tmp/wikitext-2-raw/wiki.train.raw'
    tokenizer.train(files=[data_file],
                    vocab_size=20_000,
                    min_frequency=2,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])
    tokenizer_name = "Tokenizer"
    os.makedirs(tokenizer_name, exist_ok=True)
    tokenizer.save(tokenizer_name)

    with open(data_file, encoding="utf-8") as f:
        text = f.read()

    tok = GPT2Tokenizer.from_pretrained('Tokenizer')
    x = tok.convert_tokens_to_ids(tok.tokenize(text[:100]))
    y = tok.build_inputs_with_special_tokens(x)
    print(x)
    print(y)
Example No. 21
    # train tokenizer
    _pretty_print("Training tokenizer")
    bpe_tokenizer.train(
        [input_path, input_path_val],
        vocab_size=vocab_size,
        min_frequency=min_freq,
        special_tokens=[
            "<s>", "<pad>", "</s>", "<unk>", "<mask>", "<sep>", "<cls>"
        ],
    )
    # save tokenizer
    tok_path = os.path.join(output_path, "tokenizer")
    tok_path_file = os.path.join(tok_path, "vocab.json")
    os.makedirs(tok_path, exist_ok=True)
    # bpe_tokenizer.save_model(tok_path)
    bpe_tokenizer.save(tok_path_file, True)

    # load tokenizer with Roberta configuration
    bpe_tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tok_path_file,
        max_length=max_len,
        lowercase=True,
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
        bos_token="<s>",
        eos_token="</s>",
    )
    # bpe_tokenizer = FunnelTokenizerFast(
Example No. 22
#! pip install tokenizers

from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path("./data/").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# Save files to disk
tokenizer.save("litberta")
Example No. 23
def architecture_search(process_id):
    os.makedirs(f"checkpoints/{process_id+1}")
    os.makedirs(f"tokenizer/{process_id+1}")

    files = glob.glob("../../data/pre_abstract_txts/*.txt")

    tok_sizes = list(range(100, 2000, 100))
    hidden_sizes = list(range(12, 300, 12))
    emb_sizes = list(range(10, 250, 10))
    cased = [True, False]

    batch_size = 1

    results = {}
    choices = list(itertools.product(tok_sizes, hidden_sizes, emb_sizes,
                                     cased))
    random.shuffle(choices)

    best_acc = -np.inf
    while len(choices) > 0:
        tok_size, hidden_size, emb_size, cased = choices.pop()
        print(tok_size, hidden_size, emb_size, cased)

        tokenizer = ByteLevelBPETokenizer(lowercase=cased)
        tokenizer.train(files, vocab_size=tok_size, special_tokens=["[PAD]"])

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        dataset = TextDataset(data_dir="../../data/pre_abstract_txts",
                              labels_dir="../../data/pre_abstract_labels",
                              device=device,
                              tokenizer=tokenizer,
                              batch_size=batch_size)
        test_dataset = TextDataset(
            data_dir="../../data/pre_abstract_txts",
            labels_dir="../../data/pre_abstract_labels_test",
            device=device,
            tokenizer=tokenizer,
            batch_size=batch_size)
        model = LSTMTagger(vocab_size=tokenizer.get_vocab_size(),
                           embedding_dim=emb_size,
                           lstm_dim=hidden_size,
                           dropout=0,
                           n_classes=len(dataset.classes)).to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters())
        # optimizer = torch.optim.SGD(model.parameters(), momentum=0.9, nesterov=True, lr=v)

        epoch = 0
        n = 3
        test_acc = -np.inf
        log_interval = 10  # all n batches
        weights = copy.deepcopy(model.state_dict())
        while True:
            dataset.shuffle()
            epoch += 1
            model.train()
            total_loss = 0.
            pbar = tqdm.tqdm(enumerate(dataset), desc=f"epoch {epoch}")
            for i, (x, y) in pbar:
                # reset gradients
                optimizer.zero_grad()
                # feed forward batch
                output = model(x)
                # calculate loss
                loss = criterion(output.transpose(1, 2), y)
                # back propagate loss
                loss.backward()
                # norm and clip gradients
                # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()

                pbar.set_description(
                    f'epoch {epoch} | batch {i + 1:d}/{len(dataset)} | loss {loss.item():.2f}'
                )

            model.eval()
            a, c = 0, 0
            with torch.no_grad():
                t_loss = 0
                for i, (x, y) in enumerate(test_dataset):
                    output = model(x)
                    loss = criterion(output.transpose(1, 2), y)
                    t_loss += loss.item()
                    for p, t in zip(torch.argmax(output, -1), y):
                        for pi, ti in zip(p, t):
                            a += 1
                            if pi == ti:
                                c += 1
                acc = c / a
                if acc <= test_acc and n > 0:
                    n -= 1
                    continue
                elif acc <= test_acc:
                    break
                print(t_loss, acc)
                weights = copy.deepcopy(model.state_dict())
                test_acc = acc
        results[(tok_size, hidden_size, emb_size, cased)] = acc
        print(
            list(
                sorted([(k, v) for k, v in results.items()],
                       key=lambda y: y[1],
                       reverse=True))[:10])
        print(best_acc, test_acc)
        if test_acc > best_acc:
            best_acc = test_acc
            dir_path = f"tokenizer/{process_id+1}/lstm-tagger-{best_acc:.6f}"
            if os.path.exists(dir_path):
                continue
            torch.save(
                weights,
                f"checkpoints/{process_id+1}/lstm-tagger-{best_acc:.6f}.pt")
            os.makedirs(dir_path)
            tokenizer.save(dir_path)
Example No. 24
def main():
    '''
    python -m ipdb run_gpt2.py      \
        --data-path /path/to/americanlit/     \
        --output-dir path/to/checkpoint/     \
        --eval-split valid     \
        --train-n-steps 20000     \
        --validate-every 1000     \
        --sequence-tune-rate 0.0     \
        --mode train \
        --model-name from_scratch \
        --batch-size 32 --seqlen 80 --gradient-accumulation-steps 4

    '''#with this bsz, seqlen, fits to bm gpus

    parser = argparse.ArgumentParser(description='openGPT-2 analysis')

    #debug menu
    parser.add_argument('--debug',
                        action='store_true',
                        help='use dbg1000.jsonl for faster programming')

    #training options
    #--> consider redefining FT...
    parser.add_argument('--mode',
                        choices=[
                            'train', 'FT', 'eval-singletoken',
                            'eval-completion', 'eval-both'
                        ],
                        default='eval-singletoken')
    parser.add_argument(
        '--input-mode',
        choices=['CLM', 'relFT'],
        default='CLM',
        help=
        'determine whether or not to put specials amongst sentences (CLM => do not  /  relFT => do)'
    )
    parser.add_argument('--data-path',
                        default='../jsonlpath/DBG',
                        help='path/to/jsonl/files')

    parser.add_argument('--eval-split', choices=['train', 'valid', 'test'])
    parser.add_argument(
        '--model-name',
        choices=['from_scratch', 'gpt2', 'gpt2-medium', 'gpt2-large'],
        default='gpt2')
    parser.add_argument('--model-load-dir', type=str, default=None)
    parser.add_argument('--seed', type=int, default=777)
    #parser.add_argument('--data-base', type=str)

    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument("--max-steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                            steps to perform. Override num_train_epochs.")
    parser.add_argument('--num-train-epochs', type=int, default=1)
    parser.add_argument('--gradient-accumulation-steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                            performing a backward/update pass.")
    parser.add_argument('--seqlen', type=int, default=120)
    parser.add_argument(
        '--tolerate_offset',
        type=int,
        default=20,
        help=
        'when training with TPLoss, length to be additionally tolerated to args.seqlen.'
    )
    #training is done upto this step. regardless of args.max_steps or args.num_train_epochs
    parser.add_argument('--train-n-steps', type=int, default=-1)  #10000)

    parser.add_argument('--seqlen-singletoken', type=int, default=1024)
    parser.add_argument('--seqlen-completion', type=int,
                        default=300)  # need to unify both and use only one
    parser.add_argument('--seqlen-train', type=int, default=300)

    parser.add_argument(
        "--output-dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    # eval-completion
    parser.add_argument('--prefix-length', type=int, default=50)
    parser.add_argument('--continuation-length', type=int, default=100)
    parser.add_argument('--top-k', type=int, default=1)
    parser.add_argument('--top-p', type=float, default=0.0)

    # custom training
    parser.add_argument('--sequence-tune-rate', type=float, default=0.5)

    parser.add_argument('--report-metrics-every', type=int, default=10)
    parser.add_argument('--save-every', type=int, default=1000)
    parser.add_argument('--sequence-ngram-n', type=int, default=4)

    parser.add_argument('--validate-every', type=int, default=10000)

    # training loop
    parser.add_argument("--adam-epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max-grad-norm', type=int, default=1)

    parser.add_argument('--learning-rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup-steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr-schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight-decay', type=float, default=0.01)
    parser.add_argument('--lm-coef', type=float, default=0.9)
    parser.add_argument('--num-workers', type=int, default=0)

    args = parser.parse_args()
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    ## file below prep'd by flatten.py using amerlit jsonl splits (which are all post processed)
    ## root / 'flattened_amerlit.txt'
    if args.mode == 'FT':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    elif args.mode == 'train':  # train tokenizer based on corpus
        d_root = Path(args.data_path)
        vocab_path = d_root / 'vocab.json'
        rawtxt_path = d_root / 'flattened_amerlit.txt'  # this is obtained by running "python 4_flatten4vocab.py @ dataroot"
        merge_path = d_root / 'merges.txt'

        if not (vocab_path.exists()
                and merge_path.exists()):  #check if vocab file exists
            vocabgenerator = ByteLevelBPETokenizer()
            vocabgenerator.train(str(rawtxt_path),
                                 vocab_size=50_000,
                                 min_frequency=2)
            vocabgenerator.save(
                str(d_root)
            )  # vocabgenerator is also tokenizer but not from transformers
            del vocabgenerator
        tokenizer = GPT2Tokenizer(vocab_path, merge_path, errors='replace')

    # add CLS to the vocab
    # see example here: https://huggingface.co/transformers/model_doc/gpt2.html#transformers.GPT2DoubleHeadsModel.forward
    tokenizer = init_special_tokens(tokenizer)

    dataset_paths = {
        'train': d_root / 'train.jsonl',
        'valid': d_root / 'val.jsonl',
        'test': d_root / 'test.jsonl',
    }  # keep this for later code compatibility albeit it looks crappy

    if args.model_load_dir:
        model = GPT2LMHeadModel.from_pretrained(args.model_load_dir)
    elif args.model_name == 'from_scratch':
        config = GPT2Config()
        config.architectures = ["GPT2LMHeadModel"]
        model = GPT2LMHeadModel(config)

        #mp = GPT2LMHeadModel.from_pretrained('gpt2')
        #pretrained config vs GPT2Config has only difference
        # "architectures": ['GPT2LMHeadModel']
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name)

    model.resize_token_embeddings(len(tokenizer))
    model.config.output_hidden_states = True  # make them return output hidden
    model.to(device)
    '''if args.mode == 'eval-singletoken' or args.mode == 'eval-both':
        eval_singletoken(model, args, dataset_paths)
    '''
    if args.mode == 'eval-completion' or args.mode == 'eval-both':
        datasets = get_datasets(dataset_paths, max_len=args.seqlen_completion)
        eval_sampler = SequentialSampler(datasets[args.eval_split])
        eval_dataloader = DataLoader(datasets[args.eval_split],
                                     sampler=eval_sampler,
                                     batch_size=1)

        model.eval()

        with torch.no_grad():
            all_text_completions = []

            bpe_ngram_metrics = Metrics(pad=-1)
            word_ngram_metrics = Metrics(pad=-1)

            for i, batch in tqdm(enumerate(eval_dataloader),
                                 desc="Evaluating",
                                 total=len(eval_dataloader)):
                input_sequence = batch[0].cuda()
                if input_sequence.size(1) < args.prefix_length:
                    continue

                # Predict the completions.
                batch = batch_input_sequence_by_prefix_length(
                    input_sequence, args.prefix_length)
                bpe_completions, _ = sample_sequence(model, batch,
                                                     args.prefix_length,
                                                     args.continuation_length,
                                                     args.top_k, args.top_p)
                bpe_completions = bpe_completions.tolist()

                # Extract continuations from the predicted completions.
                bpe_continuations = []
                text_continuations = []
                for bpe_completion in bpe_completions:
                    bpe_continuations.append(
                        bpe_completion[args.prefix_length:])
                    text_continuations.append(
                        get_text_continuation(bpe_completion, tokenizer, args))
                    all_text_completions.append(
                        tokenizer.decode(bpe_completion))

                # Only keep continuations with at least one 4-gram
                # (A short continuation may occur due to predicted whitespace, then tokenizing, despite being
                #  normal length in BPE tokens).
                text_continuations = [
                    c for c in text_continuations if len(c) > 3
                ]

                # Update metrics with this batch of continuations.
                bpe_ngram_metrics.update(bpe_continuations)
                word_ngram_metrics.update(text_continuations)

                # Save the (possibly intermediate) metrics.
                save_completion_metrics(bpe_metrics=bpe_ngram_metrics.report(
                    'bpe_%s' % args.eval_split),
                                        word_metrics=word_ngram_metrics.report(
                                            'word_%s' % args.eval_split),
                                        text_completions=all_text_completions,
                                        config=model.config.to_dict(),
                                        args=args)

    if args.mode == 'train':
        if not os.path.exists(os.path.join(args.output_dir, 'best')):
            os.makedirs(os.path.join(args.output_dir, 'best'))

        token_loss = mle_loss
        if args.debug:
            train_seq_dataloader = get_dataloaders(args,
                                                   tokenizer,
                                                   spl='dbg1000')
            #for batch in train_seq_dataloader:
            #print(batch.pre_tru.shape)
            #print(batch.pre_fals) # None
            #set_trace()
        else:  # debugging mode
            train_seq_dataloader = get_dataloaders(args,
                                                   tokenizer,
                                                   spl='train')

        # Setup optimizer

        # one of both need to be specified for training
        # args.num_train_epochs  /   args.max_steps
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (args.batch_size * len(
                train_seq_dataloader) // args.gradient_accumulation_steps) + 1

            #if performing gradient accumulation, steps won't update.
            #this means actual epochs training multiplied directly by "gradient_accumulation_steps"

        else:
            t_total = len(
                train_seq_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs

            #if not specified,

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    args.warmup_steps, t_total)

        total_steps = 0
        best_ppl = 1e20
        for _ in trange(args.num_train_epochs, desc="Epoch"):
            logging_outputs = []
            epoch_loss = 0
            epoch_steps = 0
            tqdm_bar = tqdm(train_seq_dataloader,
                            desc="Training",
                            total=t_total
                            if args.train_n_steps <= 1 else args.train_n_steps)
            for step, batch in enumerate(tqdm_bar):
                optimizer.zero_grad()

                # Sequence loss
                if torch.rand(1).item() < args.sequence_tune_rate:
                    if batch[0].size(1) < args.prefix_length:
                        continue
                    loss, batch_metrics = ul_seq(model, batch, args)

                # Token loss
                else:
                    loss, batch_metrics = token_loss(
                        model, batch, args)  # == mleloss(model, batch, args)

                loss.backward()
                optimizer.step()
                scheduler.step()
                epoch_loss += loss.item()
                epoch_steps += 1
                total_steps += 1
                tqdm_bar.desc = f"Training loss: {(epoch_loss/epoch_steps):.2f} lr: {scheduler.get_lr()[0]:.2f}"  # get_last_lr in pytorch 1.4.0
                #tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(epoch_loss/epoch_steps, scheduler.get_lr()[0]) # scheduler.get_last_lr() is for 1.4.0

                logging_outputs.append(batch_metrics)

                if epoch_steps % args.report_metrics_every == 0:
                    logging_average = CrossEntropyCriterionWCustomMetrics.aggregate_logging_outputs(
                        logging_outputs)
                    temp = SequencePenaltyCriterion.aggregate_logging_outputs(
                        logging_outputs)
                    for k, v in temp.items():
                        logging_average[k] = v
                    logging_average['ppl'] = 2**logging_average['loss']
                    print(logging_average)
                    logging_outputs = []

                if step == args.train_n_steps:
                    break  # here train_n_steps

                if epoch_steps % args.save_every == 0:
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_model_file = os.path.join(args.output_dir,
                                                     WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir,
                                                      CONFIG_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    tokenizer.save_vocabulary(args.output_dir)

                if total_steps % args.validate_every == 0:
                    print("Validating...")
                    validation_outputs = eval_singletoken(
                        model, args, dataset_paths, train_iter=total_steps)
                    if validation_outputs['ppl'] < best_ppl:
                        best_ppl = validation_outputs['ppl']
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            args.output_dir, 'best', WEIGHTS_NAME)
                        output_config_file = os.path.join(
                            args.output_dir, 'best', CONFIG_NAME)
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(
                            os.path.join(args.output_dir, 'best'))
                        save_singletoken_metrics(validation_outputs,
                                                 model.config.to_dict(),
                                                 args,
                                                 train_iter=total_steps,
                                                 best=True)
Example No. 25
def save_sentense_piece_model(cfg):
    paths = get_file_path_list(cfg)
    special_token = ["<pad>", "<bos>", "<eos>", "<sep>", "<unk>", "<mask>"]
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=paths, vocab_size=32000, min_frequency=2, special_tokens=special_token)
    tokenizer.save(cfg.path_sentence_piece, "ko")
Example No. 26
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """
    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if os.path.isfile(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if os.path.isfile(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.lower:
            warn_once(
                'Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary with using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).')
        if 'bpe_vocab' not in opt:
            raise ValueError(
                '--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError(
                '--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')

        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path,
                                               self.merge_path,
                                               self.add_prefix_space)

    def helper_encode(self, text: str) -> List[str]:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(self, tokens: List[str], token_ids: List[int],
                      delimiter: str) -> str:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param token_ids:
            list of token ids
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        text = self.tokenizer.decode(token_ids)
        return text

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.tokenizer.add_special_tokens(special_tokens)
        for i in range(self.tokenizer.get_vocab_size() - 4):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name:
            directory to save.
        :param file_name:
            file to save.
        """
        self.tokenizer.save(dir_name, file_name)
Example No. 27
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

#paths = ['data/train_sentences.txt']
paths = ['data/train/t5.txt']
#paths = [str(x) for x in Path("./data/").glob("train_subset_*.txt")]

tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=paths, vocab_size=25_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

tokenizer.save("models", "KariBERTa-small")
Example No. 28
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path("./data/eo/data/").glob("**/*.txt")]

tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer.save(".", "esperberto")
Example No. 29
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# Save files to disk
tokenizer.save(".", "rubinberto")

tokenizer = ByteLevelBPETokenizer(
    "rubinberto-vocab.json",
    "rubinberto-merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(
    tokenizer.encode(
        "А можно вспоминать не о событиях, а, например, о чувствах, испытываемых нами за «отчетный период»."
    ))
Example No. 30
# ... beginning of this snippet (an en_tokenizer.train call) truncated in the source
                       "<pad>",
                       "<SEP>",
                       "<UNK>",
                       "<MASK>",
                   ])
print('en completed')
# Customize training
ta_tokenizer.train(files=new_ta_path,
                   vocab_size=8300,
                   min_frequency=2,
                   special_tokens=[
                       "<CLS>",
                       "<pad>",
                       "<SEP>",
                       "<UNK>",
                       "<MASK>",
                   ])
print('ta completed')
en_tokenizer.save(en_tokenizer_path)
ta_tokenizer.save(ta_tokenizer_path)
en_tokenizer = Tokenizer.from_file(en_tokenizer_path)
ta_tokenizer = Tokenizer.from_file(ta_tokenizer_path)
tamil_text = 'அதனை நிரூபிப்பதுபோல் இருக்குமாம் படம்'
english_text = 'This movie will prove that'
id_1 = ta_tokenizer.encode(tamil_text)
assert (ta_tokenizer.decode(
    id_1.ids) == tamil_text), 'mismatch in tamil tokenizer encoding'
id_2 = en_tokenizer.encode(english_text)
assert (en_tokenizer.decode(
    id_2.ids) == english_text), 'mismatch in english tokenizer encoding'