Example #1
def train_bert():
    # https://huggingface.co/transformers/_modules/transformers/tokenization_bert.html

    files = [
        "Corpora/CS_V0_normalized_sent_per_line.txt",
        "Corpora/AsoSoft_Large_sent_per_line.txt",
        "Corpora/KTC_all_cleaned.txt", "Corpora/Lyrics_all_cleaned.txt",
        "Corpora/Tanztil_ku_normalized.txt"
    ]

    vocab_size = 50000
    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(clean_text=True,
                                       handle_chinese_chars=False,
                                       strip_accents=True,
                                       lowercase=False)

    # And then train
    tokenizer.train(
        files,
        vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )

    tokenizer.save('./', 'ckb-wordpiece_%s' % str(vocab_size))
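A quick way to sanity-check the result is to reload the saved vocab (a sketch; the `<name>-vocab.txt` filename follows the pattern this version of the tokenizers save API produces, and the sample sentence is a placeholder):

def check_trained_tokenizer():
    # Reload the vocab written by tokenizer.save('./', 'ckb-wordpiece_50000') above
    loaded = BertWordPieceTokenizer('./ckb-wordpiece_50000-vocab.txt',
                                    lowercase=False,
                                    strip_accents=True)
    encoding = loaded.encode("placeholder sentence from the training corpora")
    print(encoding.tokens)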
Example #2
def train_tokenizer(files: List[str],
                    tokenizer_name: str,
                    base_path: str,
                    vocab_size: int,
                    lowercase: bool = False,
                    strip_accents: bool = False):

    tokenizer = BertWordPieceTokenizer(lowercase=lowercase,
                                       strip_accents=strip_accents)

    tokenizer_path = os.path.join(base_path, tokenizer_name)
    os.makedirs(tokenizer_path, exist_ok=True)

    initial_alphabet = get_bert_initial_alphabet()

    tokenizer.train(files,
                    special_tokens=initial_alphabet,
                    vocab_size=vocab_size)

    tokenizer.save(tokenizer_path)

    # Creating a default config for the tokenizer
    config = {'do_lower_case': lowercase, 'strip_accents': strip_accents}
    config_file_path = os.path.join(tokenizer_path, 'tokenizer_config.json')

    with open(config_file_path, 'w+') as config_file:
        json.dump(config, config_file)
Example #3
def train_tokenizer(captions):
    print('Create training file...')
    train_tokenizer = [sample for samples in captions for sample in samples]
    with open('train_tokenizer.txt', 'a') as f:
        for sample in train_tokenizer:
            f.write(sample)
    # init
    bwpt = BertWordPieceTokenizer(vocab_file=None,
                                  unk_token='[UNK]',
                                  sep_token='[SEP]',
                                  cls_token='[CLS]',
                                  clean_text=True,
                                  handle_chinese_chars=True,
                                  strip_accents=True,
                                  lowercase=True,
                                  wordpieces_prefix='##')
    print('Tokenizer training...')
    bwpt.train(files=['train_tokenizer.txt'],
               vocab_size=30000,
               min_frequency=5,
               limit_alphabet=1000,
               special_tokens=['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]'])

    bwpt.save('.', 'captions')

    # initialization of a trained tokenizer
    tokenizer = BertWordPieceTokenizer('captions-vocab.txt')
    tokenizer.enable_truncation(max_length=16)
    print('Tokenizer is ready to use...')
    return tokenizer
Example #4
class BertWordPiece:

    def __init__(self, clean_text: bool, strip_accents: bool, lowercase: bool):
        self.clean = clean_text
        self.strip = strip_accents
        self.lower = lowercase

        self.tokenizer = BertWordPieceTokenizer(
            clean_text=self.clean,
            strip_accents=self.strip,
            lowercase=self.lower,
            handle_chinese_chars=True
        )

    def train(self, files, vocab_size, min_frequency, limit_alphabet):
        self.trainer = self.tokenizer.train(
            files,
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=True,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
            limit_alphabet=limit_alphabet,
            wordpieces_prefix="##",
        )

    def save(self, path, filename):
        self.tokenizer.save(path, filename)
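A minimal usage sketch of the wrapper class above (the corpus path and settings are hypothetical):

wp = BertWordPiece(clean_text=True, strip_accents=False, lowercase=True)
wp.train(files=["corpus.txt"], vocab_size=30000, min_frequency=2, limit_alphabet=1000)
wp.save("./", "my-wordpiece")  # writes ./my-wordpiece-vocab.txt with this API version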
Example #5
def create_vocab(file_path, output_path, least_freq=2):
    tokenizer = BertWordPieceTokenizer(clean_text=False,
                                       strip_accents=False,
                                       lowercase=True)
    files = [file_path]
    tokenizer.train(files,
                    vocab_size=1000,
                    min_frequency=least_freq,
                    show_progress=True,
                    special_tokens=['[PAD]', '[UNK]', '[SOS]', '[EOS]'],
                    limit_alphabet=1000,
                    wordpieces_prefix="##")
    tokenizer.save(output_path)
    print(f"Vocabulary created at location {output_path}")
Example #6
def train_tokenizer(
    corpus: Union[str, List[str]],
    vocab_size: int = 30519,
    overwrite: bool = True,
    lowercase: bool = True,
    save_vocab: bool = False,
    dst: Optional[str] = None,
    in_domain_vocab: str = VOCAB_CACHE_PREFIX,
) -> BertWordPieceTokenizer:
    """Train a WordPiece tokenizer from scratch.

    Arguments:
        corpus {Union[str, List[str]]} -- In-domain corpus / corpora

    Keyword Arguments:
        vocab_size {int} -- Size of trained vocabulary (default: 30519)
        lowercase {bool} -- If True, perform lowercasing (default: True)
        save_vocab {bool} -- If True, save vocab to `in_domain_vocab`
                             (default: False)
        in_domain_vocab {str} -- Path to save trained tokenizer vocabulary
                                 (default: {'in-domain-vocab.txt'})

    Returns:
        A BertWordPieceTokenizer trained on in-domain corpora.
    """
    if not isinstance(corpus, list):
        corpus = [corpus]

    # Load cached vocab if possible
    if not overwrite:
        cached_vocab = Path(dst or '.') / (VOCAB_CACHE_PREFIX + '-vocab.txt')

        if cached_vocab.exists():
            logger.info(f'Loading cached vocabulary at {cached_vocab}')
            return BertWordPieceTokenizer(str(cached_vocab))
        else:
            logger.info(f'Cached vocabulary not found at {cached_vocab}')

    # Train tokenizer
    logger.info('Training new WordPiece tokenizer on in-domain corpora')
    tokenizer = BertWordPieceTokenizer(lowercase=lowercase)
    tokenizer.train(corpus, vocab_size=vocab_size)

    if save_vocab:
        tokenizer.save('.' if dst is None else dst, in_domain_vocab)
        logger.info('Saved in-domain vocabulary to '
                    f'{Path(dst or ".") / (in_domain_vocab + "-vocab.txt")}')
    return tokenizer
Example #7
def train_bert_tokenizer(dataset_base_path: str,
                         target_path: str,
                         tokenizer_name: str,
                         files_pattern: str = '**/*',
                         vocab_size: int = 30000,
                         lower_case: bool = False):
    """
    Trains a BERT WordPiece Tokenizer based on data
    located in dataset_base_path.

    By default it reads all files in dataset_base_path. One can
    specify `files_pattern` for filtering.

    The files generated by the tokenizer will be saved under
    <target_path>/<tokenizer_name> namespace.
    """
    files = [
        str(f) for f in Path(dataset_base_path).glob(files_pattern)
        if os.path.isfile(f)
    ]

    logger.info(f'Found {len(files)} files to use for training.')
    logger.debug(f'Files are: {files}')

    tokenizer_args = {
        'lowercase': lower_case,
        'strip_accents': False,
    }

    wordpiece_tokenizer = BertWordPieceTokenizer(**tokenizer_args)
    wordpiece_tokenizer.train(files=files, vocab_size=vocab_size)

    save_out = wordpiece_tokenizer.save(target_path, tokenizer_name)

    logger.info(f'Training finished. Result is in {save_out}')
Example #8
def get_vocabulary(infile: Text, vocabsize: int, outfolder: Text):
    # get special token maps and config
    autotok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    autotok.save_pretrained(outfolder)
    os.remove(os.path.join(outfolder, "vocab.txt"))

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(strip_accents=False,
                                       lowercase=False,
                                       clean_text=False)

    # Then train it!
    tokenizer.train([infile],
                    vocab_size=vocabsize,
                    limit_alphabet=int(1e9))

    # And finally save it somewhere
    tokenizer.save(outfolder, "vocab")
    os.rename(os.path.join(outfolder, "vocab-vocab.txt"),
              os.path.join(outfolder, "vocab.txt"))
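Because the snippet above keeps the special-token map and config saved by AutoTokenizer and places the newly trained vocab.txt next to them, the output folder can then be loaded through transformers (a sketch; the path is a placeholder for the outfolder used above):

from transformers import BertTokenizerFast

new_tokenizer = BertTokenizerFast.from_pretrained("path/to/outfolder")
print(new_tokenizer.tokenize("an example sentence"))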
Example #9
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer
        Using the Hugging Face tokenization library for the same
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()

        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences
        """

        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # The Hugging Face trainer expects file paths, so write sentences to a temp file
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                for sentence in sentences:
                    f.write(sentence + "\n")
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(self.tokenizer.get_vocab_size()))

            # Removing the temp file
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        return self.tokenizer.decode_batch(encoded)
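Example usage of the Tokenizer class above (a sketch; it assumes MAX_LENGTH is defined elsewhere in the module and that a data/ directory exists, since __init__ uses os.mkdir):

tok = Tokenizer("en")
tok.train_tokenizer(["the quick brown fox", "jumps over the lazy dog"])
encoded = tok.encode("the quick brown fox")
print(encoded.tokens, encoded.ids)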
Example #10
def tokenize(inputPath, outputPath):
    paths = [str(x) for x in Path(inputPath).glob("*.ns")]
    print(paths)
    # Initialize a tokenizer

    tokenizer = BertWordPieceTokenizer(vocab_file=None,
                                       clean_text=True,
                                       handle_chinese_chars=True,
                                       strip_accents=False,
                                       lowercase=False,
                                       wordpieces_prefix="##")

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=50000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )

    tokenizer.save(outputPath)
Example #11
def train_tokenizer(filename, params):
    """
    Train a BertWordPieceTokenizer with the specified params and save it
    """
    # Get tokenization params
    save_location = params["tokenizer_path"]
    max_length = params["max_length"]
    min_freq = params["min_freq"]
    vocabsize = params["vocab_size"]

    tokenizer = BertWordPieceTokenizer(lowercase=False)
    special_tokens = ["[S]", "[PAD]", "[/S]", "[UNK]", "[MASK]", "[SEP]", "[CLS]"]
    tokenizer.train(files=[filename], vocab_size=vocabsize,
                    min_frequency=min_freq, special_tokens=special_tokens)

    tokenizer._tokenizer.post_processor = BertProcessing(("[SEP]", tokenizer.token_to_id("[SEP]")), ("[CLS]", tokenizer.token_to_id("[CLS]")),)
    tokenizer.enable_truncation(max_length=max_length)

    print("Saving tokenizer ...")
    if not os.path.exists(save_location):
        os.makedirs(save_location)
    tokenizer.save(save_location)
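Calling the function above might look like this (the corpus path and parameter values are hypothetical):

params = {
    "tokenizer_path": "./tokenizer",
    "max_length": 128,
    "min_freq": 2,
    "vocab_size": 30000,
}
train_tokenizer("corpus.txt", params)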
Example #12
def main(language):
    # Initialize an empty BERT tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=False,
    )

    cleaned_dir = BASE_DIR / "data/wikiextracted" / language / "cleaned"

    # prepare text files to train vocab on them
    # use only one subdir
    # files = [str(file_path) for file_path in cleaned_dir.glob("AA/wiki_*")]
    # use all wiki articles (in the given language)
    files = [str(file_path) for file_path in cleaned_dir.glob("**/wiki_*")]

    # train BERT tokenizer
    tokenizer.train(
        files,
        # vocab_size=100, # default value is 30000
        min_frequency=MIN_FREQ,
        show_progress=True,
        special_tokens=SPEC_TOKENS,
        limit_alphabet=SIZE_OF_ALPHABET, # default value is 1000
        wordpieces_prefix="##"
    )

    # save the vocab
    os.makedirs(str(BASE_DIR / "data/tokenizer" / language), exist_ok=True)
    tokenizer.save(str(BASE_DIR / "data/tokenizer" / language / "vocab"))

    # save the alphabet
    vocab = json.loads(read_vocab(language))['model']['vocab']
    alphabet = prepare_alphabet(vocab)
    write_alphabet_to_file(alphabet, language)
Example #13
    def train_tokenizer(self, train_files, tokenizer_name=None, output_dir=None, use_trained_tokenizer=True):
        """
        Train a new tokenizer on `train_files`.

        Args:

        - train_files: List of files to be used when training the tokenizer.

        - tokenizer_name: Name of a pretrained tokenizer or a path to a directory containing a tokenizer.

        - output_dir (optional): The directory where model files will be saved. If not given, self.args['output_dir']
        will be used.

        - use_trained_tokenizer (optional): Load the trained tokenizer once training completes.

        Returns: None
        """

        if not self.args["vocab_size"]:
            raise AttributeError(
                "Cannot train a new tokenizer as vocab_size is not specified in args dict. "
                "Either provide a tokenizer or specify vocab_size."
            )

        if not isinstance(train_files, list):
            train_files = [train_files]

        if not output_dir:
            output_dir = self.args["output_dir"]

        if self.args["model_type"] in ["bert", "electra"]:
            tokenizer = BertWordPieceTokenizer()
            self.args["special_tokens"] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
            self.args["wordpieces_prefix"] = "##"

            tokenizer.train(
                files=train_files,
                vocab_size=self.args["vocab_size"],
                min_frequency=self.args["min_frequency"],
                special_tokens=self.args["special_tokens"],
                wordpieces_prefix="##",
            )
        else:
            tokenizer = ByteLevelBPETokenizer()

            tokenizer.train(
                files=train_files,
                vocab_size=self.args["vocab_size"],
                min_frequency=self.args["min_frequency"],
                special_tokens=self.args["special_tokens"],
            )

        os.makedirs(output_dir, exist_ok=True)

        tokenizer.save(output_dir)
        logger.info(" Training of {} tokenizer complete. Saved to {}.".format(tokenizer_name, output_dir))

        _, _, tokenizer_class = MODEL_CLASSES[self.args["model_type"]]
        tokenizer = tokenizer_class.from_pretrained(output_dir)

        if use_trained_tokenizer:
            self.tokenizer = tokenizer
            self.args["tokenizer_name"] = output_dir
            try:
                if self.args["model_type"] == "electra":
                    model_to_resize = (
                        self.model.generator_model.module
                        if hasattr(self.model.generator_model, "module")
                        else self.model.generator_model
                    )
                    model_to_resize.resize_token_embeddings(len(self.tokenizer))

                    model_to_resize = (
                        self.model.discriminator_model.module
                        if hasattr(self.model.discriminator_model, "module")
                        else self.model.discriminator_model
                    )
                    model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
            except AttributeError:
                pass
Example #14
def train(args, rep):
    # Set random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Rename output dir based on arguments
    if args.output_dir == "":
        cwd = os.getcwd()
        base = args.model_name_or_path.split("/")[-1]
        model_type = "_example" if args.example else "_linear"
        data_path = '_' + '_'.join(
            args.train_data_path.split("/")[-2:]).replace(".csv", "")
        mlm_on = "_mlmtrain" if args.mlm_data_path == "" or args.mlm_data_path == args.train_data_path else "_mlmfull"
        mlm_pre = "_mlmpre" if args.mlm_pre else ""
        mlm_dur = "_mlmdur" if args.mlm_during else ""
        observer = "_observer" if args.use_observers else ""
        name = base + model_type + data_path + mlm_on + mlm_pre + mlm_dur + observer + "_v{}".format(
            rep)
        args.output_dir = os.path.join(cwd, "checkpoints", name)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    elif args.num_epochs == 0:
        pass
    else:
        raise Exception("Directory {} already exists".format(args.output_dir))
        #pass

    json.dump(args.__dict__,
              open(os.path.join(args.output_dir, 'args.json'), "w+"))

    # Save args
    torch.save(args, os.path.join(args.output_dir, "run_args"))

    # Configure tensorboard writer
    tb_writer = SummaryWriter(log_dir=args.output_dir)

    # Configure tokenizer
    token_vocab_name = os.path.basename(args.token_vocab_path).replace(
        ".txt", "")
    tokenizer = BertWordPieceTokenizer(args.token_vocab_path,
                                       lowercase=args.do_lowercase)
    tokenizer.enable_padding(max_length=args.max_seq_length)
    tokenizer.save(args.output_dir + "/tokenizer")

    # Data readers
    if args.task == "intent":
        dataset_initializer = IntentDataset
    elif args.task == "slot":
        if 'taskmaster' in args.train_data_path:
            dataset_initializer = TMSlotDataset
        else:
            dataset_initializer = SlotDataset
    elif args.task == "response":
        dataset_initializer = ResponseSelectionDataset
    elif args.task == "dst":
        dataset_initializer = StateTrackingDataset
    elif args.task == "top":
        dataset_initializer = TOPDataset
    else:
        raise ValueError("Not a valid task type: {}".format(args.task))

    train_dataset = dataset_initializer(args.train_data_path, tokenizer,
                                        args.max_seq_length, token_vocab_name)

    if args.mlm_data_path != '':
        mlm_dataset = dataset_initializer(args.mlm_data_path, tokenizer,
                                          args.max_seq_length,
                                          token_vocab_name)
    else:
        mlm_dataset = train_dataset

    val_dataset = dataset_initializer(
        args.val_data_path, tokenizer, 512,
        token_vocab_name) if args.val_data_path else None

    test_dataset = dataset_initializer(args.test_data_path, tokenizer, 512,
                                       token_vocab_name)

    # Data loaders
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.train_batch_size,
                                  shuffle=True,
                                  pin_memory=True)

    mlm_dataloader = DataLoader(dataset=mlm_dataset,
                                batch_size=args.train_batch_size,
                                shuffle=True,
                                pin_memory=True)

    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=1,
                                pin_memory=True) if val_dataset else None

    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=1,
                                 pin_memory=True)

    # Load model
    if args.task == "intent":
        if args.example:
            model = ExampleIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                use_observers=args.use_observers)
        else:
            model = IntentBertModel(args.model_name_or_path,
                                    dropout=args.dropout,
                                    num_intent_labels=len(
                                        train_dataset.intent_label_to_idx),
                                    use_observers=args.use_observers)
    elif args.task == "slot":
        if args.example:
            model = ExampleSlotBertModel(args.model_name_or_path,
                                         dropout=args.dropout,
                                         num_slot_labels=len(
                                             train_dataset.slot_label_to_idx),
                                         use_observers=args.use_observers)
        else:
            model = SlotBertModel(args.model_name_or_path,
                                  dropout=args.dropout,
                                  num_slot_labels=len(
                                      train_dataset.slot_label_to_idx),
                                  use_observers=args.use_observers)
    elif args.task == "response":
        model = ResponseSelectionBertModel(args.model_name_or_path,
                                           dropout=args.dropout)
    elif args.task == "dst":
        model = StateTrackingBertModel(
            args.model_name_or_path,
            dropout=args.dropout,
            num_slot_labels=train_dataset.slot_lengths)
    elif args.task == "top":
        if args.example:
            model = ExampleJointSlotIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                num_slot_labels=len(train_dataset.slot_label_to_idx))
        else:
            model = JointSlotIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                num_slot_labels=len(train_dataset.slot_label_to_idx))
    else:
        raise ValueError("Cannot instantiate model for task: {}".format(
            args.task))

    if torch.cuda.is_available():
        model.to(args.device)

    if args.mlm_pre or args.mlm_during:
        pre_model = BertPretrain(args.model_name_or_path)
        mlm_optimizer = AdamW(pre_model.parameters(),
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        if torch.cuda.is_available():
            pre_model.to(args.device)

    # MLM Pre-train
    if args.mlm_pre and args.num_epochs > 0:
        # Maintain most recent score per label.
        for epoch in trange(3, desc="Pre-train Epochs"):
            pre_model.train()
            epoch_loss = 0
            num_batches = 0
            for batch in tqdm(mlm_dataloader):
                num_batches += 1

                # Train model
                if "input_ids" in batch:
                    inputs, labels = mask_tokens(batch["input_ids"].cuda(),
                                                 tokenizer)
                else:
                    inputs, labels = mask_tokens(batch["ctx_input_ids"].cuda(),
                                                 tokenizer)

                loss = pre_model(inputs, labels)
                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

                if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                    if args.max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(pre_model.parameters(),
                                                       args.max_grad_norm)

                    mlm_optimizer.step()
                    pre_model.zero_grad()

            LOGGER.info("Epoch loss: {}".format(epoch_loss / num_batches))

        # Transfer BERT weights
        model.bert_model = pre_model.bert_model.bert

    # Train
    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    global_step = 0
    metrics_to_log = {}
    best_score = -1
    patience = 0
    for epoch in trange(args.num_epochs, desc="Epoch"):
        model.train()
        epoch_loss = 0
        num_batches = 0

        if args.task == "top" and args.example:
            # Pre-fill cache but don't return anything
            retrieve_examples(train_dataset, None, None, task="top")

        for batch in tqdm(train_dataloader):
            num_batches += 1
            global_step += 1

            # Transfer to gpu
            if torch.cuda.is_available():
                for key, val in batch.items():
                    if type(batch[key]) is list:
                        continue

                    batch[key] = batch[key].to(args.device)

            # Train model
            if args.task == "intent":
                if args.example:
                    examples = retrieve_examples(train_dataset,
                                                 batch["intent_label"],
                                                 batch["ind"],
                                                 task="intent")

                    _, intent_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        intent_label=batch["intent_label"],
                        example_input=examples["input_ids"],
                        example_mask=examples["attention_mask"],
                        example_token_types=examples["token_type_ids"],
                        example_intents=examples["intent_label"])
                else:
                    _, intent_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        intent_label=batch["intent_label"])
                if args.grad_accum > 1:
                    intent_loss = intent_loss / args.grad_accum
                intent_loss.backward()
                epoch_loss += intent_loss.item()
            elif args.task == "slot":
                if args.example:
                    examples = retrieve_examples(train_dataset,
                                                 batch["slot_labels"],
                                                 batch["ind"],
                                                 task="slot",
                                                 num=64)

                    _, slot_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        slot_labels=batch["slot_labels"],
                        example_word_inds=examples["word_ind"],
                        example_input=examples["input_ids"],
                        example_mask=examples["attention_mask"],
                        example_token_types=examples["token_type_ids"],
                        example_slots=examples["slot_labels"])
                else:
                    _, slot_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        slot_labels=batch["slot_labels"])
                if args.grad_accum > 1:
                    slot_loss = slot_loss / args.grad_accum
                slot_loss.backward()
                epoch_loss += slot_loss.item()
            elif args.task == "response":
                resp_loss = model(
                    ctx_input_ids=batch["ctx_input_ids"],
                    ctx_attention_mask=batch["ctx_attention_mask"],
                    ctx_token_type_ids=batch["ctx_token_type_ids"],
                    rsp_input_ids=batch["rsp_input_ids"],
                    rsp_attention_mask=batch["rsp_attention_mask"],
                    rsp_token_type_ids=batch["rsp_token_type_ids"])
                resp_loss.backward()
                epoch_loss += resp_loss.item()
            elif args.task == "dst":
                _, state_loss = model(input_ids=batch["input_ids"],
                                      attention_mask=batch["attention_mask"],
                                      token_type_ids=batch["token_type_ids"],
                                      state_label=batch["state_label"])
                state_loss.backward()
                epoch_loss += state_loss.item()
            elif args.task == "top":
                if args.example:
                    # Get intent examples
                    intent_examples = retrieve_examples(train_dataset,
                                                        batch["intent_label"],
                                                        batch["ind"],
                                                        task="intent",
                                                        num=32)

                    # Get slot examples
                    slot_examples = retrieve_examples(train_dataset,
                                                      batch["slot_labels"],
                                                      batch["ind"],
                                                      task="slot",
                                                      num=32)

                    loss = model(input_ids=batch["input_ids"],
                                 attention_mask=batch["attention_mask"],
                                 token_type_ids=batch["token_type_ids"],
                                 intent_label=batch["intent_label"],
                                 slot_labels=batch["slot_labels"],
                                 intent_examples=intent_examples,
                                 slot_examples=slot_examples)
                else:
                    _, _, loss = model(input_ids=batch["input_ids"],
                                       attention_mask=batch["attention_mask"],
                                       token_type_ids=batch["token_type_ids"],
                                       intent_label=batch["intent_label"],
                                       slot_labels=batch["slot_labels"])
                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

            if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                model.zero_grad()

        LOGGER.info("Epoch loss: {}".format(epoch_loss / num_batches))

        # Evaluate and save checkpoint
        score = evaluate(model,
                         val_dataloader,
                         train_dataloader,
                         tokenizer,
                         task=args.task,
                         example=args.example,
                         device=args.device)
        metrics_to_log["eval_score"] = score
        LOGGER.info("Task: {}, score: {}---".format(args.task, score))

        if score < best_score:
            patience += 1
        else:
            patience = 0

        if score > best_score:
            LOGGER.info("New best results found for {}! Score: {}".format(
                args.task, score))
            torch.save(model.state_dict(),
                       os.path.join(args.output_dir, "model.pt"))
            torch.save(optimizer.state_dict(),
                       os.path.join(args.output_dir, "optimizer.pt"))
            best_score = score

        for name, val in metrics_to_log.items():
            tb_writer.add_scalar(name, val, global_step)

        if patience >= args.patience:
            LOGGER.info("Stopping early due to patience")
            break

        # Run MLM during training
        if args.mlm_during:
            pre_model.train()
            epoch_loss = 0
            num_batches = 0
            for batch in tqdm(mlm_dataloader):
                num_batches += 1

                # Train model
                if "input_ids" in batch:
                    inputs, labels = mask_tokens(batch["input_ids"].cuda(),
                                                 tokenizer)
                else:
                    inputs, labels = mask_tokens(batch["ctx_input_ids"].cuda(),
                                                 tokenizer)

                loss = pre_model(inputs, labels)

                if args.grad_accum > 1:
                    loss = loss / args.grad_accum

                loss.backward()
                epoch_loss += loss.item()

                if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                    if args.max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(pre_model.parameters(),
                                                       args.max_grad_norm)

                    mlm_optimizer.step()
                    pre_model.zero_grad()

            LOGGER.info("MLMloss: {}".format(epoch_loss / num_batches))

    # Evaluate on test set
    LOGGER.info("Loading up best model for test evaluation...")
    model.load_state_dict(torch.load(os.path.join(args.output_dir,
                                                  "model.pt")))
    score = evaluate(model,
                     test_dataloader,
                     train_dataloader,
                     tokenizer,
                     task=args.task,
                     example=args.example,
                     device=args.device)
    print("Best result for {}: Score: {}".format(args.task, score))
    tb_writer.add_scalar("final_test_score", score, global_step)
    tb_writer.close()
    return score
Example #15
        vocab[special_token] = len(vocab)
    # Add other words - if not already present.
    for w in words:
        if w not in vocab:
            vocab[w] = len(vocab)
    print(vocab)

    # New tokenizer.
    init_tokenizer = BertWordPieceTokenizer(vocab=vocab) 
    init_tokenizer.normalizer = Sequence([Replace("(", " ( "), Replace(")", " ) "), BertNormalizer()])
    init_tokenizer.pre_tokenizer = Whitespace()
    #init_tokenizer.pad_token_id = vocab["[PAD]"]
    #print("Created tokenizer: ", init_tokenizer)

    # Save the created tokenizer.
    init_tokenizer.save(decoder_tokenizer_path)
    print("Tokenizer saved to: ", decoder_tokenizer_path)

# Load from tokenizer file.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=decoder_tokenizer_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'cls_token': '[CLS]', 'sep_token': '[SEP]',
    'unk_token': '[UNK]', 'mask_token': '[MASK]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'
    })

print(f"\nLoaded tokenizer vocabulary ({len(tokenizer.get_vocab())}):\n" + "-"*50)
for k, v in tokenizer.get_vocab().items():
    print(k, ": ", v)

goals = "has_anything(robot),on_surface(blue_block, tabletop),stacked(blue_block, red_block),on_surface(yellow_block, tabletop)"
values = [False, True, True, False]
input = process_goals(goals, values, return_string=True)
Example #16
from pathlib import Path

from tokenizers import BertWordPieceTokenizer

#paths = [str(x) for x in Path("./eo_data/").glob("**/*.txt")]
paths = ['../../data/jw300.en-tw.tw','../../data/asante_twi_bible.txt']

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Customize training
# And then train
tokenizer.train(
    paths,
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save files to disk
tokenizer.save("abena-base-v2-akuapem-twi-cased")
Example #17
from tokenizers import BertWordPieceTokenizer
import tqdm

tokenizer = BertWordPieceTokenizer()
tokenizer.train(["all.raw"], vocab_size=10000)
# with open("all.raw") as f:
#     for line in tqdm.tqdm(f.readlines()):
#         tokenizer.add_tokens(line.strip().split(" "))
tokenizer.save(".", "tokenizer")

t = BertWordPieceTokenizer("tokenizer-vocab.txt", add_special_tokens=False)
Example #18
file_list = os.listdir(dir_path)  # corpus files inside the directory path
wordpiece_train_file = "./ch-{}-wpm-{}-wiki".format(
    args.limit_alphabet, args.vocab_size)  # WordPiece training output file
vocab_file = args.vocab_file  # vocab file to generate

# corpus file list
corpus_files = []
for file_name in file_list:
    if '.txt' in file_name:  # only .txt files
        corpus_files.append(f'{dir_path}/{file_name}')

tokenizer = BertWordPieceTokenizer(
    vocab_file=None,
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,  # Must be False if cased model
    lowercase=False,
    wordpieces_prefix=args.wordpieces_prefix)

tokenizer.train(files=corpus_files,
                limit_alphabet=args.limit_alphabet,
                vocab_size=args.vocab_size,
                wordpieces_prefix=args.wordpieces_prefix)
tokenizer.save(wordpiece_train_file, True)

with open(wordpiece_train_file) as json_file, \
        open(vocab_file, 'w', encoding='utf-8') as f:
    json_data = json.load(json_file)
    for item in json_data["model"]["vocab"].keys():
        f.write(item + '\n')
Example #19
import argparse
from tokenizers import BertWordPieceTokenizer

parser = argparse.ArgumentParser()

parser.add_argument("--corpus_file", type=str, default="../data/namuwiki.txt")
parser.add_argument("--vocab_size", type=int, default=22000)
parser.add_argument("--limit_alphabet", type=int, default=6000)

args = parser.parse_args()

tokenizer = BertWordPieceTokenizer(
    vocab=None,
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,  # Must be False if cased model
    lowercase=False,
    wordpieces_prefix="##")
tokenizer.train(files=[args.corpus_file],
                limit_alphabet=args.limit_alphabet,
                vocab_size=args.vocab_size)

tokenizer.save(
    "./ch-{}-wpm-{}-pretty".format(args.limit_alphabet, args.vocab_size), True)
Example #20
    # prepare text files to train vocab on them
    files = ['data/merged_CC.txt', 'data/merged_wiki.txt']
    
    # train BERT tokenizer
    tokenizer.train(
      files,
      vocab_size=50000,
      min_frequency=2,
      show_progress=True,
      special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
      limit_alphabet=1000,
      wordpieces_prefix="##"
    )
    
    # save the vocab
    tokenizer.save('.', 'bert_german')    
    
    # Takes about 3 minutes on 4 cores for 800 MB

# =============================================================================
#     Runtime approx. 30 min for 1.4 GB TXT Data on 24 Cores
#     Copied from https://huggingface.co/blog/how-to-train
# =============================================================================
 
if False:    
    folder_path = "/media/philipp/F25225165224E0D94/tmp/BERT_DATA"
    
    paths = [str(x) for x in Path(folder_path).glob("**/*.txt")]
    
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()
Example #21
                    type=str,
                    help="The name of the output vocab files")
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"File does not exist: {args.files}")
    exit(1)

# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)

# And then train
trainer = tokenizer.train(
    files,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save the files
tokenizer.save(args.out, args.name)
Example #22
from tokenizers import BertWordPieceTokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=False,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=False,
)

from glob import glob

files = glob('../bert/dumping-*.txt')
files = [
    i for i in files
    if 'twitter' not in i and 'instagram' not in i and 'combined' not in i
] + ['dumping-commmon-crawl.txt']
files

trainer = tokenizer.train(
    files,
    vocab_size=32000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

tokenizer.save('./', 'bahasa-standard')
Example #23
# dev_corpus_file = './mimicdata/bio-mimic3/dev_50.csv'
# test_corpus_file = './mimicdata/bio-mimic3/test_50.csv'

train_corpus_file = './mimicdata/mimic3/train_full.csv'
dev_corpus_file = './mimicdata/mimic3/dev_full.csv'
test_corpus_file = './mimicdata/mimic3/test_full.csv'

limit_alphabet = 100
vocab_size = 100000

tokenizer = BertWordPieceTokenizer(
    vocab_file=None,
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,  # Must be False if cased model
    lowercase=True,
    wordpieces_prefix="##",
)

tokenizer.train(
    files=[train_corpus_file, dev_corpus_file, test_corpus_file],
    limit_alphabet=limit_alphabet,
    vocab_size=vocab_size,
    min_frequency=1,
)

# tokenizer.save("./tokenizers", "bert-tiny-mimic3-50-{}-limit-{}".format(limit_alphabet, vocab_size))
tokenizer.save(
    "./tokenizers",
    "bert-tiny-mimic3-full-{}-limit-{}".format(limit_alphabet, vocab_size))
Example #24
}

special_id2word = {i: w for w, i in special_word2id.items()}

tokenizer = BertWordPieceTokenizer(unk_token=UNK_TOKEN,
                                   sep_token=SEP_TOKEN,
                                   cls_token=CLS_TOKEN,
                                   pad_token=PAD_TOKEN,
                                   mask_token=MASK_TOKEN,
                                   strip_accents=False,
                                   lowercase=False)

# make sure that the vocab is large enough to cover special indices
assert params.vocab_size > max(CLS_INDEX, SEP_INDEX, UNK_INDEX, PAD_INDEX,
                               MASK_INDEX)
tokenizer.train(params.input, vocab_size=params.vocab_size, min_frequency=5)
tokenizer.save(params.out_dir, params.lg)

# insert special words to the correct position
vocab_file = os.path.join(params.out_dir, params.lg + '-vocab.txt')
with open(vocab_file, 'r') as f:
    words = [w.rstrip() for w in f]
    new_words = words[len(special_word2id):]
    for i in sorted(special_id2word.keys()):
        new_words.insert(i, special_id2word[i])

# overwrite the vocab file
with open(vocab_file, 'w') as f:
    for w in new_words:
        f.write(w + '\n')
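A small verification sketch for the rewritten vocab file, reusing the token and index constants assumed by the snippet above:

check = BertWordPieceTokenizer(vocab_file)
assert check.token_to_id(CLS_TOKEN) == CLS_INDEX
assert check.token_to_id(SEP_TOKEN) == SEP_INDEX
assert check.token_to_id(PAD_TOKEN) == PAD_INDEX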
Example #25
from tokenizers import BertWordPieceTokenizer

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Then train it!
tokenizer.train(["./sample.csv"])

# Now, let's use it:
encoded = tokenizer.encode(
    "미국에서는 여전히, 연준은 물론 정부와 의회 역시 신용경색 해소를 위해 다방면의 노력을 하고 있다. 하지만 그것은, 미 금융시스템의 붕괴는 모면케 해 줄 수 있을지언정, 순환적 경기침체까지 피해가게 만들 수는 없을 것 같다."
)
print("WPM --------------")
print(encoded.tokens)

from konlpy.tag import Mecab
print("Mecab --------------")
mecab = Mecab()
print(
    mecab.morphs(
        "미국에서는 여전히, 연준은 물론 정부와 의회 역시 신용경색 해소를 위해 다방면의 노력을 하고 있다. 하지만 그것은, 미 금융시스템의 붕 괴는 모면케 해 줄 수 있을지언정, 순환적 경기침체까지 피해가게 만들 수는 없을 것 같다."
    ))

# And finally save it somewhere
tokenizer.save(".", name="WPM")
Example #26
def train(args, rep):
    # Set random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Rename output dir based on arguments
    if args.output_dir == "":
        cwd = os.getcwd()
        base = args.model_name_or_path.split("/")[-1]
        data_path = "_" + "_".join(
            args.train_data_path.split("/")[-2:]).replace(".csv", "")
        mlm_pre = "_mlmpre" if args.mlm_pre else ""
        mlm_dur = "_mlmdur" if args.mlm_during else ""
        name = base + data_path + mlm_pre + mlm_dur + "_v{}".format(rep)
        args.output_dir = os.path.join(cwd, "checkpoints", name)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    elif args.num_epochs == 0:
        # This means we're evaluating. Don't create the directory.
        pass
    else:
        raise Exception("Directory {} already exists".format(args.output_dir))

    # Dump arguments to the checkpoint directory, to ensure reproducability.
    if args.num_epochs > 0:
        json.dump(args.__dict__,
                  open(os.path.join(args.output_dir, "args.json"), "w+"))
        torch.save(args, os.path.join(args.output_dir, "run_args"))

    # Configure tensorboard writer
    tb_writer = SummaryWriter(log_dir=args.output_dir)

    # Configure tokenizer
    token_vocab_name = os.path.basename(args.token_vocab_path).replace(
        ".txt", "")
    tokenizer = BertWordPieceTokenizer(args.token_vocab_path,
                                       lowercase=args.do_lowercase)
    # tokenizer.enable_padding(max_length=args.max_seq_length)

    if args.num_epochs > 0:
        tokenizer.save(args.output_dir + "/tokenizer.bin")

    # Data readers
    dataset_initializer = DATASET_MAPPER[args.task]
    mlm_pre_dataset_initializer = DATASET_MAPPER[args.mlm_pre_task]

    train_dataset = dataset_initializer(args.train_data_path, tokenizer,
                                        args.max_seq_length, token_vocab_name)

    mlm_pre_dataset = (mlm_pre_dataset_initializer(
        args.mlm_pre_data_path, tokenizer, args.max_seq_length,
        token_vocab_name) if args.mlm_pre_data_path else train_dataset)

    mlm_during_dataset = (dataset_initializer(
        args.mlm_during_data_path, tokenizer, args.max_seq_length,
        token_vocab_name) if args.mlm_during_data_path else train_dataset)

    val_dataset = (dataset_initializer(args.val_data_path, tokenizer, 512,
                                       token_vocab_name)
                   if args.val_data_path else None)

    test_dataset = dataset_initializer(args.test_data_path, tokenizer, 512,
                                       token_vocab_name)

    # Data loaders
    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        pin_memory=True,
    )

    mlm_pre_dataloader = DataLoader(
        dataset=mlm_pre_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        pin_memory=True,
    )

    mlm_during_dataloader = DataLoader(
        dataset=mlm_during_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        pin_memory=True,
    )

    val_dataloader = (DataLoader(dataset=val_dataset,
                                 batch_size=1,
                                 pin_memory=True) if val_dataset else None)

    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=1,
                                 pin_memory=True)

    # Load model
    if args.task == "intent":
        model = IntentBertModel(
            args.model_name_or_path,
            dropout=args.dropout,
            num_intent_labels=len(train_dataset.intent_label_to_idx),
        )
    elif args.task == "slot":
        model = SlotBertModel(
            args.model_name_or_path,
            dropout=args.dropout,
            num_slot_labels=len(train_dataset.slot_label_to_idx),
        )
    elif args.task == "top":
        model = JointSlotIntentBertModel(
            args.model_name_or_path,
            dropout=args.dropout,
            num_intent_labels=len(train_dataset.intent_label_to_idx),
            num_slot_labels=len(train_dataset.slot_label_to_idx),
        )
    else:
        raise ValueError("Cannot instantiate model for task: {}".format(
            args.task))

    if torch.cuda.is_available():
        model.to(args.device)

    # Initialize MLM model
    if args.mlm_pre or args.mlm_during:
        pre_model = BertPretrain(args.model_name_or_path)
        mlm_optimizer = AdamW(pre_model.parameters(),
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        if torch.cuda.is_available():
            pre_model.to(args.device)

    # MLM Pre-train
    if args.mlm_pre and args.num_epochs > 0:
        # Maintain most recent score per label.
        for epoch in trange(3, desc="Pre-train Epochs"):
            pre_model.train()
            epoch_loss = 0
            num_batches = 0
            for batch in tqdm(mlm_pre_dataloader, leave=False):
                num_batches += 1

                # Train model
                if "input_ids" in batch:
                    inputs, labels = mask_tokens(batch["input_ids"].cuda(),
                                                 tokenizer)
                else:
                    inputs, labels = mask_tokens(batch["ctx_input_ids"].cuda(),
                                                 tokenizer)

                loss = pre_model(inputs, labels)
                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

                if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                    if args.max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(pre_model.parameters(),
                                                       args.max_grad_norm)

                    mlm_optimizer.step()
                    pre_model.zero_grad()

            LOGGER.info("Epoch loss: {}".format(epoch_loss / num_batches))

        # Transfer BERT weights
        model.bert_model = pre_model.bert_model.bert

    # Train
    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    global_step = 0
    metrics_to_log = {}
    best_score = -1
    patience = 0
    for epoch in trange(args.num_epochs, desc="Epoch"):
        model.train()
        epoch_loss = 0
        num_batches = 0

        for batch in tqdm(train_dataloader, leave=False):
            num_batches += 1
            global_step += 1

            # Transfer to gpu
            if torch.cuda.is_available():
                for key, val in batch.items():
                    if type(batch[key]) is list:
                        continue

                    batch[key] = batch[key].to(args.device)

            # Train model
            if args.task == "intent":
                _, intent_loss = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    token_type_ids=batch["token_type_ids"],
                    intent_label=batch["intent_label"],
                )

                if args.grad_accum > 1:
                    intent_loss = intent_loss / args.grad_accum

                intent_loss.backward()
                epoch_loss += intent_loss.item()
            elif args.task == "slot":
                _, slot_loss = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    token_type_ids=batch["token_type_ids"],
                    slot_labels=batch["slot_labels"],
                )

                if args.grad_accum > 1:
                    slot_loss = slot_loss / args.grad_accum
                slot_loss.backward()
                epoch_loss += slot_loss.item()
            elif args.task == "top":
                _, _, loss = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    token_type_ids=batch["token_type_ids"],
                    intent_label=batch["intent_label"],
                    slot_labels=batch["slot_labels"],
                )

                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

            if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                model.zero_grad()

        LOGGER.info("Epoch loss: {}".format(epoch_loss / num_batches))

        # Evaluate and save checkpoint
        score = evaluate(
            model,
            val_dataloader,
            tokenizer,
            task=args.task,
            device=args.device,
            args=args,
        )
        metrics_to_log["eval_score"] = score
        LOGGER.info("Task: {}, score: {}---".format(args.task, score))

        if score < best_score:
            patience += 1
        else:
            patience = 0

        if score > best_score:
            LOGGER.info("New best results found for {}! Score: {}".format(
                args.task, score))
            torch.save(model.state_dict(),
                       os.path.join(args.output_dir, "model.pt"))
            torch.save(optimizer.state_dict(),
                       os.path.join(args.output_dir, "optimizer.pt"))
            best_score = score

        for name, val in metrics_to_log.items():
            tb_writer.add_scalar(name, val, global_step)

        if patience >= args.patience:
            LOGGER.info("Stopping early due to patience")
            break

        # Run MLM during training
        if args.mlm_during:
            pre_model.train()
            epoch_loss = 0
            num_batches = 0
            for batch in tqdm(mlm_during_dataloader, leave=False):
                num_batches += 1

                # Train model
                if "input_ids" in batch:
                    inputs, labels = mask_tokens(batch["input_ids"].cuda(),
                                                 tokenizer)
                else:
                    inputs, labels = mask_tokens(batch["ctx_input_ids"].cuda(),
                                                 tokenizer)

                loss = pre_model(inputs, labels)

                if args.grad_accum > 1:
                    loss = loss / args.grad_accum

                loss.backward()
                epoch_loss += loss.item()

                if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                    if args.max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(pre_model.parameters(),
                                                       args.max_grad_norm)

                    mlm_optimizer.step()
                    pre_model.zero_grad()

            LOGGER.info("MLMloss: {}".format(epoch_loss / num_batches))

    # Evaluate on test set
    LOGGER.info("Loading up best model for test evaluation...")
    model.load_state_dict(torch.load(os.path.join(args.output_dir,
                                                  "model.pt")))
    score = evaluate(model,
                     test_dataloader,
                     tokenizer,
                     task=args.task,
                     device=args.device,
                     args=args)
    print("Best result for {}: Score: {}".format(args.task, score))
    tb_writer.add_scalar("final_test_score", score, global_step)
    tb_writer.close()
    return score
Example #27
    STORAGE_BUCKET = "gs://sbt0"

    for prefix in prefixes:
        input_dir_gs = os.path.join(
            STORAGE_BUCKET,
            "data/corpus/%s_lower/zhwiki-latest-pages-articles_%s_lower.txt" %
            (prefix, prefix))
        input_dir_local = "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix
        tf.gfile.Copy(input_dir_gs, input_dir_local, overwrite=True)

    for vocab_size in vocab_sizes:
        for prefix in prefixes:
            try:
                tokenizer_name = prefix + "_" + str(vocab_size)
                tokenizer = BertWordPieceTokenizer(handle_chinese_chars=False,
                                                   clean_text=True)

                tokenizer.train(
                    [
                        "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix
                        # "./zhwiki-latest-pages-articles_lower.txt"
                    ],
                    vocab_size=vocab_size,
                    show_progress=True,
                    min_frequency=1,
                )
                tokenizer.save("data_proc/tokenizers/wordpiece",
                               tokenizer_name)

            except Exception as e:
                print(e)
Example #28
                    type=int,
                    help='How big to make vocab',
                    required=True)
parser.add_argument('--output-dir', type=str, help='Output dir', required=True)
parser.add_argument('--min-frequency',
                    type=int,
                    help='Min frequency to merge',
                    default=5)
parser.add_argument('--limit-alphabet',
                    type=int,
                    help='Alphabet max size',
                    default=1000)

args = parser.parse_args()

tokenizer = BertWordPieceTokenizer(
    clean_text=False,
    handle_chinese_chars=True,
    strip_accents=False,
    lowercase=False,
)

tokenizer.train(
    args.corpus,
    vocab_size=args.vocab_size,
    min_frequency=args.min_frequency,
    limit_alphabet=args.limit_alphabet,
)

tokenizer.save(args.output_dir, f"{args.limit_alphabet}-{args.vocab_size}")
Example #29
# SentencePieceBPETokenizer: A BPE implementation compatible with the one used by SentencePiece
# BertWordPieceTokenizer: The famous Bert tokenizer, using WordPiece

DATAFILE = '../data/pg16457.txt'
MODELDIR = 'models'

input_text = 'This is a test'

# Training the tokenizers

print("========= CharBPETokenizer ==========")
# CharBPETokenizer
tokenizer = CharBPETokenizer()
tokenizer.train([DATAFILE], vocab_size=500)

tokenizer.save(MODELDIR, 'char_bpe')

output = tokenizer.encode(input_text)
print(output.tokens)  # ['T', 'his</w>', 'is</w>', 'a</w>', 't', 'est</w>']

print("========= ByteLevelBPETokenizer ==========")
# ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train([DATAFILE], vocab_size=500)

tokenizer.save(MODELDIR, 'byte_bpe')
output = tokenizer.encode(input_text)
print(output.tokens)  # ['T', 'h', 'is', 'Ġis', 'Ġa', 'Ġt', 'est']

print("========= SentencePieceBPETokenizer ==========")
# SentencePieceBPETokenizer
Example #30
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)

trainer = tokenizer.train(
    "/bachelor_project/data/sentences.txt",
    vocab_size=100000,
    min_frequency=2,
    show_progress=True,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
    limit_alphabet=1000,
    wordpieces_prefix="##")

tokenizer.save("./", "cased-100k")

# In[ ]:

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,  # We need to investigate that further (stripping helps?)
    lowercase=True,
)

trainer = tokenizer.train(
    "/bachelor_project/data/sentences.txt",
    vocab_size=100000,
    min_frequency=2,