Example #1
    def __init__(self,
                 pretrained_path,
                 n_labels,
                 hidden_size=768,
                 dropout_p=0.2,
                 label_ignore_idx=0,
                 head_init_range=0.04,
                 device='cuda'):
        super().__init__()

        self.n_labels = n_labels

        self.linear_1 = nn.Linear(hidden_size, hidden_size)
        self.classification_head = nn.Linear(hidden_size, n_labels)

        self.label_ignore_idx = label_ignore_idx
        self.tokenizer = PreTrainedTokenizerFast(
            tokenizer_file=os.path.join(pretrained_path, "tokenizer.json"))
        self.model = AutoModel.from_pretrained(pretrained_path)

        self.dropout = nn.Dropout(dropout_p)

        self.device = device

        # initializing classification head
        self.classification_head.weight.data.normal_(mean=0.0,
                                                     std=head_init_range)
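The example stops at the constructor. A forward pass wiring these layers together might look like the sketch below; this is an assumption, not part of the original snippet, and it presumes the same torch/nn imports as above (token-level classification, with label_ignore_idx excluded from the loss).

    # Hedged sketch (not in the original example): a plausible forward pass
    # for the layers defined in __init__.
    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        hidden = torch.relu(self.linear_1(self.dropout(outputs.last_hidden_state)))
        logits = self.classification_head(self.dropout(hidden))
        if labels is not None:
            loss = nn.functional.cross_entropy(
                logits.view(-1, self.n_labels),
                labels.view(-1),
                ignore_index=self.label_ignore_idx)
            return loss, logits
        return logits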
Example #2
def load_tokenizer(tknzr_file,
                   flag_tknzr_fast,
                   pad_token=None,
                   mask_token=None):
    """
    Interestingly, HuggingFace does not allow the base tokenizer to be called.
    This is a bizarre choice, but accordingly we have to look for something else
    , which is why I use the PreTrainedTokenizerFast to wrap the base tokenizer.
    Written in Rust, it's faster than the base tokenizer class, but also lets
    you call the tokenizer as tknzr('text to be tokenized').

    Input
        tknzr_file (str) : .json file of the tokenizer trained previously
        *_tokens (str)  : tokens that are to be used in the corresponding context
                            Some of them are not implemented yet...
    Output
        tknzr     : tokenizer as PreTrainedTokenizerFast class to be passed on
    """
    if flag_tknzr_fast:
        tknzr = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        tknzr = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tknzr.pad_token = pad_token
    tknzr.mask_token = mask_token

    return tknzr
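A minimal usage sketch for the helper above, assuming a previously trained tokenizer file exists at the placeholder path:

tknzr = load_tokenizer('tokenizers/BPE_wiki.json',
                       flag_tknzr_fast=True,
                       pad_token='<pad>',
                       mask_token='<mask>')
print(tknzr('text to be tokenized')['input_ids'])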
Example #3
def load_tokenizer(folder="."):
    folder = Path(folder)
    return PreTrainedTokenizerFast(
        WhitespaceTokenizer(str(folder / vocab_file)),
        pad_token="<pad>",
        mask_token="<mask>",
    )
Example #4
File: utils.py Project: willbsoon/KoBART
def get_kobart_tokenizer(cachedir='~/kobart/'):
    """Get KoGPT2 Tokenizer file path after downloading
    """
    global tokenizer
    model_info = tokenizer
    file_path, is_cached = download(model_info['url'],
                                    model_info['fname'],
                                    model_info['chksum'],
                                    cachedir=cachedir)
    cachedir_full = os.path.expanduser(cachedir)
    if not os.path.exists(os.path.join(cachedir_full,
                                       'emji_tokenizer')) or not is_cached:
        if not is_cached:
            shutil.rmtree(os.path.join(cachedir_full, 'emji_tokenizer'),
                          ignore_errors=True)
        zipf = ZipFile(os.path.expanduser(file_path))
        zipf.extractall(path=cachedir_full)
    tok_path = os.path.join(cachedir_full, 'emji_tokenizer/model.json')
    tokenizer_obj = PreTrainedTokenizerFast(tokenizer_file=tok_path,
                                            bos_token='<s>',
                                            eos_token='</s>',
                                            unk_token='<unk>',
                                            pad_token='<pad>',
                                            mask_token='<mask>')
    return tokenizer_obj
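A usage sketch, assuming the download and extraction above succeed (the sample sentence is only an illustration):

kobart_tokenizer = get_kobart_tokenizer()
print(kobart_tokenizer.tokenize("안녕하세요. 한국어 BART 입니다."))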
Example #5
    def __init__(self,
                 equations=None,
                 images=None,
                 tokenizer=None,
                 shuffle=True,
                 batchsize=16,
                 max_dimensions=(1024, 512),
                 pad=False,
                 keep_smaller_batches=False,
                 test=False):
        """Generates a torch dataset from pairs of `equations` and `images`.

        Args:
            equations (str, optional): Path to equations. Defaults to None.
            images (str, optional): Directory where images are saved. Defaults to None.
            tokenizer (str, optional): Path to saved tokenizer. Defaults to None.
            shuffle (bool, optional): Defaults to True.
            batchsize (int, optional): Defaults to 16.
            max_dimensions (tuple(int, int), optional): Maximal dimensions the model can handle
            pad (bool): Pad the images to `max_dimensions`. Defaults to False.
            keep_smaller_batches (bool): Whether to also return batches with smaller size than `batchsize`. Defaults to False.
            test (bool): Whether to use the test transformation or not. Defaults to False.
        """

        if images is not None and equations is not None:
            assert tokenizer is not None
            self.images = [
                path.replace('\\', '/')
                for path in glob.glob(join(images, '*.png'))
            ]
            self.sample_size = len(self.images)
            eqs = open(equations, 'r').read().split('\n')
            self.indices = [
                int(os.path.basename(img).split('.')[0]) for img in self.images
            ]
            self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer)
            self.shuffle = shuffle
            self.batchsize = batchsize
            self.max_dimensions = max_dimensions
            self.pad = pad
            self.keep_smaller_batches = keep_smaller_batches
            self.test = test
            self.data = defaultdict(lambda: [])
            # check the image dimension for every image and group them together
            try:
                for i, im in tqdm(enumerate(self.images),
                                  total=len(self.images)):
                    width, height = imagesize.get(im)
                    if width <= max_dimensions[0] and height <= max_dimensions[
                            1]:
                        self.data[(width, height)].append(
                            (eqs[self.indices[i]], im))
            except KeyboardInterrupt:
                pass
            self.data = dict(self.data)
            self._get_size()

            iter(self)
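A hypothetical instantiation of the dataset above; the class name Im2LatexDataset and every path are placeholders, since the snippet only shows __init__:

dataset = Im2LatexDataset(equations='data/math.txt',
                          images='data/images',
                          tokenizer='dataset/tokenizer.json',
                          batchsize=16,
                          max_dimensions=(1024, 512))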
 def __init__(self, hparams, **kwargs):
     super(KoBARTConditionalGeneration, self).__init__(hparams, **kwargs)
     self.model = BartForConditionalGeneration.from_pretrained(self.hparams.model_path)
     self.model.train()
     self.bos_token = '<s>'
     self.eos_token = '</s>'
     self.tokenizer = PreTrainedTokenizerFast(
         tokenizer_file=os.path.join(self.hparams.tokenizer_path, 'model.json'),
         bos_token=self.bos_token, eos_token=self.eos_token, unk_token='<unk>', pad_token='<pad>', mask_token='<mask>')
 def __init__(self, filepath, tok_vocab, max_seq_len=128) -> None:
     self.filepath = filepath
     self.data = pd.read_csv(self.filepath) #encoding='cp949'
     self.bos_token = '<s>'
     self.eos_token = '</s>'
     self.max_seq_len = max_seq_len
     self.tokenizer = PreTrainedTokenizerFast(
         tokenizer_file=tok_vocab,
         bos_token=self.bos_token, eos_token=self.eos_token, unk_token='<unk>', pad_token='<pad>', mask_token='<mask>')
Example #8
 def load_custom_tokenizer(self, path):
     tokenizer = ByteLevelBPETokenizer(path + "-vocab.json",
                                       path + "-merges.txt")
     # Add preprocessing tokens like Roberta
     tokenizer._tokenizer.post_processor = BertProcessing(
         ("</s>", tokenizer.token_to_id("</s>")),
         ("<s>", tokenizer.token_to_id("<s>")),
     )
     return PreTrainedTokenizerFast(tokenizer,
                                    pad_token="<pad>",
                                    mask_token="<mask>",
                                    unk_token="<unk>",
                                    bos_token="<s>",
                                    eos_token="</s>")
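With the BertProcessing post-processor attached, every encoding is framed by <s> and </s>. A quick check could look like the sketch below, where obj stands for whatever object defines load_custom_tokenizer and the path prefix is a placeholder:

tok = obj.load_custom_tokenizer("tokenizers/roberta")
ids = tok("Hello there!")["input_ids"]
print(tok.convert_ids_to_tokens(ids))  # expected to start with <s> and end with </s>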
Example #9
def main(args):
    data = np.load(args.data, allow_pickle=True)
    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
    )

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    dataset = PhoneDatasetMLM(data, tokenizer)

    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
Example #10
def preprocess(texts, tokenizer_path, max_len=32):

    input_ids, input_masks = [], []

    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
    tokenizer.mask_token = '[MASK]'
    tokenizer.pad_token = "[PAD]"
    tokenizer.sep_token = "[SEP]"
    tokenizer.cls_token = "[CLS]"
    tokenizer.unk_token = "[UNK]"

    for text in tqdm(texts):
        encoded = tokenizer.encode_plus(text,
                                        max_length=max_len,
                                        pad_to_max_length=True,
                                        truncation=True)
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])

    return [np.array(input_ids), np.array(input_masks)]
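A usage sketch for preprocess(); the texts and the tokenizer path are placeholders:

texts = ["first example sentence", "second example sentence"]
input_ids, input_masks = preprocess(texts, "tokenizer/tokenizer.json", max_len=32)
print(input_ids.shape, input_masks.shape)  # both (2, 32) after padding to max_len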
Example #11
def initialize(arguments=None):
    if arguments is None:
        arguments = Munch({
            'config': 'settings/config.yaml',
            'checkpoint': 'checkpoints/weights.pth',
            'no_cuda': True,
            'no_resize': False
        })
    logging.getLogger().setLevel(logging.FATAL)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    with open(arguments.config, 'r') as f:
        params = yaml.load(f, Loader=yaml.FullLoader)
    args = parse_args(Munch(params))
    args.update(**vars(arguments))
    args.wandb = False
    args.device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'

    model = get_model(args)
    model.load_state_dict(torch.load(args.checkpoint,
                                     map_location=args.device))

    if 'image_resizer.pth' in os.listdir(os.path.dirname(
            args.checkpoint)) and not arguments.no_resize:
        image_resizer = ResNetV2(layers=[2, 3, 3],
                                 num_classes=max(args.max_dimensions) // 32,
                                 global_pool='avg',
                                 in_chans=1,
                                 drop_rate=.05,
                                 preact=True,
                                 stem_type='same',
                                 conv_layer=StdConv2dSame).to(args.device)
        image_resizer.load_state_dict(
            torch.load(os.path.join(os.path.dirname(args.checkpoint),
                                    'image_resizer.pth'),
                       map_location=args.device))
        image_resizer.eval()
    else:
        image_resizer = None
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=args.tokenizer)
    return args, model, image_resizer, tokenizer
Example #12
def main(args):
    test_x = np.load(os.path.join(args.test_dir, "test_x.npy"),
                     allow_pickle=True)
    test_y = np.load(os.path.join(args.test_dir, "test_y.npy"),
                     allow_pickle=True)
    num_classes1 = len(np.unique(test_y))

    if args.test2_dir is not None:
        test_x2 = np.load(os.path.join(args.test2_dir, "test_x.npy"),
                          allow_pickle=True)
        test_y2 = np.load(os.path.join(args.test2_dir, "test_y.npy"),
                          allow_pickle=True)
        test_y2 += num_classes1
        test_x = np.concatenate((test_x, test_x2), axis=0)
        test_y = np.concatenate((test_y, test_y2), axis=0)

    num_classes = len(np.unique(test_y))

    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    test_dataset = PhoneRobertaDataset(test_x, test_y, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=args.heads,  # default 12
        num_hidden_layers=args.num_layers,  # default 6
        type_vocab_size=1,
        num_labels=num_classes)
    model = RobertaForSequenceClassification(config)
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(args.model))
    preds_all, labels_all = evaluate(model, device, test_loader)

    if args.test2_dir is not None:
        print("Evaluate on separate validation using the best model")
        evaluate_separate(preds_all, labels_all, num_classes1)
Example #13
def initialize(arguments):
    filename = join(dirname(__file__), arguments.config)
    with open(filename, 'r') as f:
        params = yaml.load(f, Loader=yaml.FullLoader)
    args = Munch(params)
    args.update(**vars(arguments))
    args.wandb = False
    args.device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'

    model = get_model(args)
    model.load_state_dict(torch.load(args.checkpoint, map_location=args.device))

    if 'image_resizer.pth' in os.listdir(os.path.dirname(args.checkpoint)) and not arguments.no_resize:
        image_resizer = ResNetV2(layers=[2, 3, 3], num_classes=22, global_pool='avg', in_chans=1, drop_rate=.05,
                                 preact=True, stem_type='same', conv_layer=StdConv2dSame).to(args.device)
        image_resizer.load_state_dict(torch.load(os.path.join(os.path.dirname(args.checkpoint), 'image_resizer.pth'), map_location=args.device))
        image_resizer.eval()
    else:
        image_resizer = None
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=args.tokenizer)
    return args, model, image_resizer, tokenizer
Example #14
    def __train_pytorch(self, output_path, simulate):
        # Check for GPU.
        if torch.cuda.is_available():
            logger.info("Found a GPU.")
        else:
            logger.warning("Did not find a GPU.")

        # Create tokenizer.
        if not os.path.exists(self.config.tokenizer_path):
            raise Exception(
                f"No tokenizer found at {self.config.tokenizer_path}")
        tokenizer = Tokenizer.from_file(self.config.tokenizer_path)
        pretrained_tokenizer = PreTrainedTokenizerFast(
            tokenizer_file=self.config.tokenizer_path)
        pretrained_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        # Create the model.
        model_config = GPT2Config(
            vocab_size=tokenizer.get_vocab_size(),
            #bos_token_id=tokenizer.token_to_id("PIECE_START"),
            #eos_token_id=tokenizer.token_to_id("PIECE_END"),
            pad_token_id=tokenizer.token_to_id("[PAD]"),
            n_head=self.config.n_head,
            n_layer=self.config.n_layer,
            n_embd=self.config.n_embd,
            n_positions=self.config.n_positions,
            n_ctx=self.config.n_ctx)
        logger.info(model_config)
        model = GPT2LMHeadModel(model_config)

        # Prepare the training dataset.
        print("Preparing training dataset...")
        dataset_train = TokenSequenceDataset(
            tokenizer=pretrained_tokenizer,
            dataset_paths=self.config.dataset_train_files,
            block_size=self.config.pad_length,
            simulate=simulate)
        logger.info("Training dataset prepared.")

        # Prepare the validation dataset.
        print("Preparing validate dataset...")
        dataset_valid = TokenSequenceDataset(
            tokenizer=pretrained_tokenizer,
            dataset_paths=self.config.dataset_validate_files,
            block_size=self.config.pad_length,
            simulate=simulate)
        logger.info("Validation dataset prepared.")

        # Prepare data collator.
        data_collator = DataCollatorWithPadding(
            tokenizer=pretrained_tokenizer,
            padding="max_length",
            max_length=self.config.pad_length)

        # Create the trainer.
        print("Creating trainer...")
        training_args = TrainingArguments(
            output_dir=os.path.join(output_path),
            overwrite_output_dir=True,
            evaluation_strategy="steps",
            num_train_epochs=self.config.epochs,
            per_gpu_train_batch_size=self.config.batch_size,
            save_steps=1_000,
            save_total_limit=2,
            prediction_loss_only=False,
            logging_strategy="steps",
            logging_dir=os.path.join(output_path, "logs"),
            load_best_model_at_end=True,
            save_strategy="steps")
        trainer = Trainer(model=model,
                          args=training_args,
                          data_collator=data_collator,
                          train_dataset=dataset_train,
                          eval_dataset=dataset_valid)

        # Train the model.
        logger.info("Training the model...")
        trainer.train()

        # Save the model.
        model_path = os.path.join(output_path, "best_model")
        trainer.save_model(model_path)
        logger.info(f"Model saved to {model_path}.")
Example #15
import json

from transformers.tokenization_utils import PreTrainedTokenizer
import utils

from transformers import PreTrainedTokenizerFast

# This will tokenize and add special tokens
# Todo

ast_tok = "<ast>"

tokenizer = PreTrainedTokenizerFast(tokenizer_file = "tokenizer/code-tokenizer.json")

with open("output/new_ast_raw.json", "r") as fin, open("output/converted_train.txt", "w") as fout:
    for line in utils.file_tqdm(fin):
        json_line = json.loads(line)
        json_tokens = json_line["nodes"]
        is_ext = json_line["ext"]
        if not is_ext:
            encoded = tokenizer.encode(ast_tok + " " + " ".join(json_tokens))
        else:
            encoded = tokenizer.encode(" ".join(json_tokens))
        fout.write(" ".join(str(e) for e in encoded) + " \n")
Example #16
def train_custom_tokenizer(dataset,
                           token_model,
                           tknzr_file,
                           vocab_size,
                           vocab=None,
                           pretrain_fast=False,
                           max_input_chars_per_word=None,
                           eos_token=None,
                           bos_token=None,
                           pad_token=None,
                           mask_token=None,
                           unk_token=None):
    """
    Building a Tokenizer using HuggingFace library. The pipeline seems to be:

        - Model           : algorithm that tokenizes, it is a mandatory
                            component. There are only 4 models implemented
                            (BPE, Unigram, WordLevel, WordPiece)
        - Normalizer      : some preprocessing that could happen before, but
                            doesn't necessarily have to
        - Pre-Tokenizer   : splitting the input according to some rules
        - Post-Processing : needing to add some tokens/input after (mostly seems
                            to be eos, bos tokens)
        - Decoder         : certain previous pipeline steps need to be reversed
                            for proper decoding
        - Trainer         : The corresponding training algorithm for the model

    Note : Some pre-processing might need to happen beforehand in previous
            functions (might be easier using pandas before)

    Input
        token_model (str)        : algorithm to use for tokenization
        dataset (class)          : a python iterator that goes through the data
                                    to be used for training
        tknzr_file (str)         : path of the .json file where the trained
                                    tokenizer is saved; overwrites previously
                                    saved files with the same name
        vocab_size (int)         : size of the vocabulary to use
        vocab (list of str)      : models other than BPE can take an optional
                                    vocab as input
        max_input_chars_per_word : used for WordPiece

    Output
        tokenizer                : HuggingFace Tokenizer object, our fully
                                    trained tokenizer

    """
    special_token_lst = [
        pad_token, bos_token, eos_token, mask_token, unk_token
    ]

    # NFKC
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token,
                          vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        error_msg = f'Error: token_model ({token_model}) not an algorithm ' \
                    f'in {VALID_TOKENIZATIONS}'
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size,
                          show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence(
        [fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        #  pair=bos_token+" $A "+eos_token" $B:1 "+eos_token+":1",
    )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in np.arange(0, len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print(f"Warning : overwriting previously save tokenizer with\
                        same filename ( {tknzr_file} ).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        tokenizer = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer
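The docstring above describes the tokenizers pipeline in the abstract. As a compact illustration of the same steps (model, pre-tokenizer, trainer, post-processing, decoder, then the PreTrainedTokenizerFast wrapper), here is a self-contained sketch trained on a toy in-memory corpus; all filenames and special tokens are placeholders:

from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast

corpus = ["a tiny toy corpus", "just enough text to train on"]

# Model + pre-tokenizer + decoder
tok = Tokenizer(BPE(unk_token="<unk>"))
tok.pre_tokenizer = pre_tokenizers.ByteLevel()
tok.decoder = decoders.ByteLevel()

# Trainer with the special tokens we want in the vocabulary
trainer = BpeTrainer(vocab_size=200,
                     special_tokens=["<pad>", "<s>", "</s>", "<mask>", "<unk>"])
tok.train_from_iterator(corpus, trainer=trainer, length=len(corpus))

# Post-processing: frame every sequence with bos/eos
tok.post_processor = processors.TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[("<s>", tok.token_to_id("<s>")),
                    ("</s>", tok.token_to_id("</s>"))])

tok.save("toy_tokenizer.json")
fast = PreTrainedTokenizerFast(tokenizer_file="toy_tokenizer.json",
                               pad_token="<pad>", mask_token="<mask>")
print(fast("a toy sentence")["input_ids"])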
Example #17
# Add special tokens - for decoder only!
add_special_tokens = False

# Paths.
data_path = "/home/tkornuta/data/local-leonardo-sierra5k"
sierra_path = os.path.join(data_path, "leonardo_sierra")
decoder_tokenizer_path = os.path.join(data_path, tokenizer_name)

# Load the original BERT tokenizer.
encoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load decoder operating on the Sierra PDDL language.
decoder_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=decoder_tokenizer_path,
    unk_token='[UNK]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    cls_token='[CLS]',
    mask_token='[MASK]',
)
decoder_tokenizer.add_special_tokens({
    'bos_token': '[BOS]',
    'eos_token': '[EOS]'
})
#print(f"\Decoder tokenizer vocabulary ({len(decoder_tokenizer.get_vocab())}):\n" + "-"*50)
#for k, v in decoder_tokenizer.get_vocab().items():
#    print(k, ": ", v)
# decoder_tokenizer.model_max_length=512 ??

# Create dataset/dataloader.
sierra_ds = SierraDataset(data_path=data_path, goals_sep=goals_sep)
sierra_dl = DataLoader(sierra_ds, batch_size=256, shuffle=True, num_workers=2)
Example #18
    def __init__(
        self,
        file_path: str = None,
        vocab_file: str = os.path.join(STATIC_PATH, "gpt2_vocab.json"),
        merges_file: str = os.path.join(STATIC_PATH, "gpt2_merges.txt"),
        tokenizer: GPT2TokenizerFast = None,
        tokenizer_file: str = None,
        texts: List[str] = None,
        line_by_line: bool = False,
        from_cache: bool = False,
        header: bool = True,
        save_cache: bool = False,
        cache_destination: str = "dataset_cache.tar.gz",
        compress: bool = True,
        block_size: int = 1024,
        tokenized_texts: bool = False,
        text_delim: str = "\n",
        bos_token: str = "<|endoftext|>",
        eos_token: str = "<|endoftext|>",
        unk_token: str = "<|endoftext|>",
        pad_token: str = "<|endoftext|>",
        progress_bar_refresh_rate: int = 20,
        **kwargs,
    ) -> None:

        self.line_by_line = False

        # Special case; load tokenized texts immediately
        if tokenized_texts:
            self.tokens = tokenized_texts
            self.num_subsets = self.tokens.shape[0] - block_size
            self.block_size = block_size
            self.file_path = "merged TokenDataset"
            self.str_suffix = "by merging TokenDatasets."
            return

        assert any([texts, file_path]), "texts or file_path must be specified."

        if not tokenizer:
            if tokenizer_file:
                # load the custom tokenizer from a serialized tokenizer
                tokenizer = PreTrainedTokenizerFast(
                    tokenizer_file=tokenizer_file,
                    bos_token=bos_token,
                    eos_token=eos_token,
                    unk_token=unk_token,
                    pad_token=pad_token,
                )
            else:
                tokenizer = GPT2TokenizerFast(
                    vocab_file=vocab_file,
                    merges_file=merges_file,
                    bos_token=bos_token,
                    eos_token=eos_token,
                    unk_token=unk_token,
                    pad_token=pad_token,
                    verbose=False,
                )
                # https://github.com/huggingface/transformers/issues/10202
                tokenizer.add_special_tokens(
                    {"additional_special_tokens": ["<|endoftext|>"]}
                )

        # If a cache path is provided, load it.
        if from_cache:
            open_func = gzip.open if file_path.endswith(".gz") else open

            with open_func(file_path, "rb") as f:
                self.tokens = np.load(f)
            self.num_subsets = self.tokens.shape[0] - block_size
            self.block_size = block_size
            self.line_by_line = line_by_line
            self.str_suffix = "via cache."

            logger.info(
                f"TokenDataset containing {self.num_subsets:,} subsets loaded {self.str_suffix}"
            )
            return

        # if texts are present, just tokenize them.
        elif texts:
            self.str_suffix = "via application."

        # if a file is specified, and it's line-delimited,
        # the text must be processed line-by-line into a single bulk file
        elif line_by_line:
            assert os.path.isfile(
                file_path
            ), f"{file_path} is not present in the current directory."

            text_delim = None
            self.line_by_line = True
            self.file_path = file_path
            self.str_suffix = f"from line-by-line file at {file_path}."

        # if a file is specified, and it's not line-delimited,
        # the texts must be parsed as a single bulk file.
        else:
            assert os.path.isfile(
                file_path
            ), f"{file_path} is not present in the current directory."
            if file_path.endswith(".csv"):
                logger.warning(
                    "You are tokenizing a CSV file, but you did not "
                    + "set line_by_line=True. Please change if unintended."
                )

            eos_token = ""
            header = False
            self.file_path = file_path
            self.str_suffix = f"from file at {file_path}."

        # Encode tokens in a batched manner to ensure constant memory usage
        if texts:
            self.tokens = encode_tokens_from_list(
                texts, eos_token, tokenizer, progress_bar_refresh_rate
            )
        else:
            self.tokens = encode_tokens_from_file(
                file_path,
                eos_token,
                tokenizer,
                text_delim,
                header,
                progress_bar_refresh_rate,
            )

        assert (
            self.tokens.shape[0] >= block_size
        ), f"There are fewer than {block_size} encoded tokens."
        self.num_subsets = self.tokens.shape[0] - block_size
        self.block_size = block_size

        if save_cache:
            self.save(cache_destination, compress=compress)
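A hypothetical usage sketch for the dataset above (the class name TokenDataset comes from its own log messages; the file and tokenizer paths are placeholders):

dataset = TokenDataset(file_path="data/corpus.txt",
                       tokenizer_file="tokenizer/tokenizer.json",
                       line_by_line=True,
                       block_size=1024)
print(dataset.num_subsets, "subsets of", dataset.block_size, "tokens")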
Example #19
text_tokenizer.load_vocab(dataset_path / 'vocab.json')

# Create transformers compatible tokenizer
tokenizer = Tokenizer(WordLevel(text_tokenizer.vocab))
tokenizer.pre_tokenizer = CharDelimiterSplit(' ')
tokenizer.model.unk_token = '<unk>'

tokenizer_path = dataset_path / 'tokenizer1'
tokenizer_path.mkdir(parents=True, exist_ok=True)
tokenizer.save(str(tokenizer_path / "tokenizer.json"))

# Re-create as roberta compatible tokenizer
tokenizer_path = dataset_path / 'tokenizer1'
print(tokenizer_path)

tokenizer2 = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_path / "tokenizer.json"))
tokenizer2._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer2._tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer2._tokenizer.token_to_id("<s>")),
)
tokenizer2._tokenizer.enable_truncation(max_length=128)  # 512
tokenizer2.mask_token = "<mask>"
tokenizer2.pad_token = "<pad>"

# 3. Train a language model
config = RobertaConfig(
    vocab_size=tokenizer2._tokenizer.get_vocab_size(),
    hidden_size=240,
    intermediate_size=2048,
    max_position_embeddings=514,
    num_attention_heads=12,
Example #20
    def __init__(
        self,
        model: str = None,
        model_folder: str = None,
        config: Union[str, GPT2Config] = None,
        vocab_file: str = None,
        merges_file: str = None,
        tokenizer_file: str = None,
        schema_tokens: List[str] = None,
        schema_return: List[str] = None,
        cache_dir: str = "aitextgen",
        tf_gpt2: str = None,
        to_gpu: bool = False,
        to_fp16: bool = False,
        verbose: bool = False,
        gradient_checkpointing: bool = False,
        bos_token: str = None,
        eos_token: str = None,
        unk_token: str = None,
        lightning_processing: str = 'dp',
        **kwargs,
    ) -> None:

        if model:
            assert not os.path.isfile(model), (
                "As of aitextgen 0.5.0, you must " +
                "use `model_folder` to load an existing model.")

        if not verbose:
            for module in [
                    "transformers.file_utils",
                    "transformers.configuration_utils",
                    "transformers.tokenization_utils",
                    "filelock",
                    "transformers.modeling_gpt2",
            ]:
                logging.getLogger(module).setLevel(logging.WARN)
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.ERROR)

        if tf_gpt2:
            self.openai_tf_gpt2 = tf_gpt2

            # Download + convert the TF weights if a PyTorch model has not been created
            if not os.path.isfile(
                    os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")):
                assert tf_gpt2 in [
                    "124M",
                    "355M",
                    "774M",
                    "1558M",
                ], "Invalid TensorFlow GPT-2 model size."

                logger.info(
                    f"Downloading the {tf_gpt2} GPT-2 TensorFlow weights/config "
                    + "from Google's servers")

                download_gpt2(cache_dir, tf_gpt2)

                logger.info(
                    f"Converting the {tf_gpt2} GPT-2 TensorFlow weights to PyTorch."
                )

                config_path = os.path.join(cache_dir, tf_gpt2, "hparams.json")

                convert_gpt2_checkpoint_to_pytorch(
                    os.path.join(cache_dir, tf_gpt2),
                    config_path,
                    cache_dir,
                )

                os.rename(
                    os.path.join(cache_dir, "pytorch_model.bin"),
                    os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin"),
                )

                os.rename(
                    os.path.join(cache_dir, "config.json"),
                    os.path.join(cache_dir, f"config_{tf_gpt2}.json"),
                )

            logger.info(f"Loading {tf_gpt2} GPT-2 model from /{cache_dir}.")
            model = os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")
            config = os.path.join(cache_dir, f"config_{tf_gpt2}.json")

            self.model = GPT2LMHeadModel.from_pretrained(model, config=config)

        elif model_folder:
            # A folder is provided containing pytorch_model.bin and config.json
            assert os.path.exists(
                os.path.join(model_folder, "pytorch_model.bin")
            ), f"There is no pytorch_model.bin in /{model_folder}."
            assert os.path.exists(os.path.join(
                model_folder,
                "config.json")), f"There is no config.json in /{model_folder}."

            logger.info(
                f"Loading model from provided weights and config in /{model_folder}."
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                model_folder, local_files_only=True)
        elif config:
            # Manually construct a model from scratch
            logger.info("Constructing model from provided config.")
            if isinstance(config, str):
                config = AutoConfig.from_pretrained(config)
            self.model = AutoModelForCausalLM.from_config(config=config)
        else:
            # Download and cache model from Huggingface
            if os.path.isdir(cache_dir) and len(os.listdir(cache_dir)) > 0:
                logger.info(
                    f"Loading {model or 'gpt2'} model from /{cache_dir}.")
            else:
                logger.info(
                    f"Downloading {model or 'gpt2'} model to /{cache_dir}.")
            self.model = AutoModelForCausalLM.from_pretrained(
                model or "gpt2", cache_dir=cache_dir)
            if model and "gpt2" not in model:
                logger.info(f"Using the tokenizer for {model}.")
                self.tokenizer = AutoTokenizer.from_pretrained(
                    model,
                    cache_dir=cache_dir,
                )

        logger.info(self)

        if gradient_checkpointing or tf_gpt2 in ["355M", "774M", "1558M"]:
            logger.info("Gradient checkpointing enabled for model training.")
            setattr(self.model.config, "gradient_checkpointing", True)
            setattr(self.model.config, "use_cache", False)

        if schema_tokens:
            setattr(self.model.config, "schema_tokens", schema_tokens)

        if schema_return:
            setattr(self.model.config, "schema_return", schema_return)

        if self.tokenizer is None:
            # Update tokenizer settings (if not set already)
            args = locals()
            custom_tokenizer = False
            for attr in [
                    "vocab_file",
                    "merges_file",
                    "tokenizer_file",
                    "bos_token",
                    "eos_token",
                    "unk_token",
            ]:
                if args[attr] is not None:
                    custom_tokenizer = True
                    setattr(self, attr, args[attr])

            if custom_tokenizer:
                logger.info("Using a custom tokenizer.")
            else:
                logger.info("Using the default GPT-2 Tokenizer.")

            if tokenizer_file:
                # load the custom GPT-2 tokenizer from a serialized tokenizer.
                # GPT-Neo uses the GPT-2 tokenizer.
                self.tokenizer = PreTrainedTokenizerFast(
                    tokenizer_file=tokenizer_file,
                    bos_token=self.bos_token,
                    eos_token=self.eos_token,
                    unk_token=self.unk_token,
                    pad_token=self.pad_token,
                )
            else:
                self.tokenizer = GPT2TokenizerFast(
                    vocab_file=self.vocab_file,
                    merges_file=self.merges_file,
                    bos_token=self.bos_token,
                    eos_token=self.eos_token,
                    unk_token=self.unk_token,
                    pad_token=self.pad_token,
                    verbose=False,
                )
                if not custom_tokenizer:
                    # https://github.com/huggingface/transformers/issues/10202
                    self.tokenizer.add_special_tokens(
                        {"additional_special_tokens": ["<|endoftext|>"]})

        self.tokenizer.padding_side = "left"

        if to_gpu:
            if to_fp16:
                logger.warn(
                    "Currently, FP16 text generation results in random output. "
                    +
                    "You may want to avoid using to_fp16 for the time being.")
                self.to_fp16()
            self.to_gpu()
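A hedged usage sketch for the constructor above; the class name aitextgen is inferred from its own assertion message, and the folder and tokenizer paths are placeholders:

ai = aitextgen(model_folder="trained_model",
               tokenizer_file="aitextgen.tokenizer.json",
               to_gpu=False)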
 def test_instantiation_from_tokenizers(self):
     bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
     PreTrainedTokenizerFast(tokenizer_object=bert_tokenizer)
Example #22
                          PreTrainedTokenizerFast, Trainer, TrainingArguments)


DATA_PATH = 'data/item_name.txt'

parser = argparse.ArgumentParser(description='Training language model')
parser.add_argument('--config_path', type=str, default='src/configs/train_lm1.yaml',
                    help='path to config file')
args = parser.parse_args()

config = OmegaConf.load(args.config_path)
print(OmegaConf.to_yaml(config))

os.environ['WANDB_DISABLED'] = 'true'

tokenizer = PreTrainedTokenizerFast(tokenizer_file=config.tokenizer_path)
tokenizer.mask_token = '[MASK]'
tokenizer.pad_token = "[PAD]"
tokenizer.sep_token = "[SEP]"
tokenizer.cls_token = "[CLS]"
tokenizer.unk_token = "[UNK]"

distilbert_config = DistilBertConfig(vocab_size=config.vocab_size,
                                     n_heads=8, dim=512, hidden_dim=2048)
model = DistilBertForMaskedLM(distilbert_config)

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=DATA_PATH,
    block_size=64)
data_collator = DataCollatorForLanguageModeling(
Example #23
                 tokenizer.decode(output, clean_up_tokenization_spaces=True)))
            print()
    return


if __name__ == '__main__':
    style = 'WordLevel'
    dataset = 'wikitext-2'
    tpath = default_tpath(dataset, style)
    tokenizer, vocab = train_tokenizer_vocab(dataset,
                                             style=style,
                                             force_retrain=True)
    #tokenizer_examples(tokenizer, raw_tokenizer=True, title='default_raw')

    from transformers import PreTrainedTokenizerFast
    fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file=tpath)
    tokenizer_examples(fast_tokenizer,
                       raw_tokenizer=False,
                       title='default_notraw')
"""
    flag_retrain = False
    use_arxiv = False

    if use_arxiv:
        tpath = DIR_TOKENIZERS + os.sep + 'BPE_arxiv.json'
    else:
        tpath = DIR_TOKENIZERS + os.sep + 'BPE_wiki.json'

    if flag_retrain:
        tokenizer = train_BPE(use_arxiv=use_arxiv, outpath=tpath)
    else:
Example #24
def main(args):
    train_x, train_y, valid_x, valid_y = load_xy(args.data_dir)
    num_classes1 = len(np.unique(train_y))
    if args.data2_dir is not None:
        train_x2, train_y2, valid_x2, valid_y2 = load_xy(args.data2_dir)
        train_y2 += num_classes1
        valid_y2 += num_classes1
        train_x = np.concatenate((train_x, train_x2), axis=0)
        train_y = np.concatenate((train_y, train_y2), axis=0)
        valid_x = np.concatenate((valid_x, valid_x2), axis=0)
        valid_y = np.concatenate((valid_y, valid_y2), axis=0)
    num_classes = len(np.unique(train_y))
    
    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path, max_len=512, mask_token="<mask>", pad_token="<pad>")
    train_dataset = PhoneRobertaDataset(train_x, train_y, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    valid_dataset = PhoneRobertaDataset(valid_x, valid_y, tokenizer)
    valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False)
    

    
    lr = args.lr
    num_epochs = args.epochs
    verbose = args.verbose
    
    if args.pretrained:
        model = RobertaForSequenceClassification.from_pretrained(args.pretrained, num_labels=num_classes)
    else:
        config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=args.heads, # default 12
        num_hidden_layers=args.num_layers, # default 6
        type_vocab_size=1,
        num_labels=num_classes
         )
        model = RobertaForSequenceClassification(config)
        
    model.to(device)
    print(model)
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=3, 
                                    verbose=verbose)
    
#     best_model_dict = None
    best_acc = 0
    best_preds = None
    acc_logs = []
    
    for epoch in range(1, num_epochs + 1):
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, verbose)
        y_preds, y_true, valid_loss, valid_acc = valid_epoch(model, valid_loader, verbose)
        acc_logs.append(valid_acc)
        if verbose:
            print("Epoch {} finished.".format(epoch))
            print('='*20)
        if args.scheduler:
            scheduler.step(valid_loss)
        
        if valid_acc > best_acc:
            torch.save(model.state_dict(), args.save_model_path)
            best_acc = valid_acc
            best_preds = y_preds
            
#     if best_model_dict and args.save_model_path:
#         torch.save(best_model_dict, args.save_model_path)
    

    print("Evaluate on aggreagate validation using the best model")
    evaluate(best_preds, y_true)
    if args.data2_dir is not None:
        print("Evaluate on separate validation using the best model")
        evaluate_separate(best_preds, y_true, num_classes1)
    print("Best validation accuracy: ", best_acc, "%")
    if args.log_acc:
        np.save("roberta/logs/log_acc.npy", np.array(acc_logs))
 def test_instantiation_from_tokenizers_json_file(self):
     bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
     with tempfile.TemporaryDirectory() as tmpdirname:
         bert_tokenizer.save(os.path.join(tmpdirname, "tokenizer.json"))
         PreTrainedTokenizerFast(
             tokenizer_file=os.path.join(tmpdirname, "tokenizer.json"))
Example #26
File: test.py Project: dagisky/tokenizers
# tok.save("THE_TEST.tokenizer.json", pretty=True)
# print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens)
#
# tok = Tokenizer.from_file("THE_TEST.tokenizer.json")
# # with open("THE_TEST.tokenizer.json", "r") as f:
# #     t = f.read()
# #     tok = Tokenizer.from_str(t)
# print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens)

from tokenizers import Tokenizer
from tokenizers.implementations import BaseTokenizer
from transformers import PreTrainedTokenizerFast, LineByLineTextDataset

# tokenizer = Tokenizer(
#     BPE("../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt")
# )
tokenizer = Tokenizer.from_file("../../data/roberta-tok.tokenizer")
print(tokenizer.encode("Hello there!").tokens)

tok_transformers = PreTrainedTokenizerFast(BaseTokenizer(tokenizer))
print(tok_transformers.tokenize("Hello there!"))

dataset = LineByLineTextDataset(tokenizer=tok_transformers,
                                file_path="../../data/botchan.txt",
                                block_size=12)

# tokenizer = ByteLevelBPETokenizer.from_files(
#     "../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt"
# )
# print(tokenizer.encode("Hello there!").tokens)
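Note that wrapping the raw tokenizer positionally via BaseTokenizer reflects an older API; current transformers releases accept the underlying tokenizers.Tokenizer directly through the tokenizer_object keyword, roughly:

tok_transformers = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
print(tok_transformers.tokenize("Hello there!"))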
Example #27
from transformers import  BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel
from transformers import BertTokenizer, PreTrainedTokenizerFast
from transformers import BertConfig

from sierra_dataset import SierraDataset


data_path = "/home/tkornuta/data/local-leonardo-sierra5k"
sierra_path = os.path.join(data_path, "leonardo_sierra")
decoder_tokenizer_path = os.path.join(data_path, "leonardo_sierra.plan_decoder_tokenizer.json")

# Load the original BERT tokenizer.
encoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load decoder operating on the Sierra PDDL language.
decoder_tokenizer = PreTrainedTokenizerFast(tokenizer_file=decoder_tokenizer_path)
decoder_tokenizer.add_special_tokens({'unk_token': '[UNK]'})
decoder_tokenizer.add_special_tokens({'sep_token': '[SEP]'})
decoder_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
decoder_tokenizer.add_special_tokens({'cls_token': '[CLS]'})
decoder_tokenizer.add_special_tokens({'mask_token': '[MASK]'})
decoder_tokenizer.add_special_tokens({'bos_token': '[BOS]'})
decoder_tokenizer.add_special_tokens({'eos_token': '[EOS]'})
#print(f"\Decoder tokenizer vocabulary ({len(decoder_tokenizer.get_vocab())}):\n" + "-"*50)
#for k, v in decoder_tokenizer.get_vocab().items():
#    print(k, ": ", v)
# decoder_tokenizer.model_max_length=512 ??

# Create dataset/dataloader.
sierra_ds = SierraDataset(data_path=data_path)
sierra_dl = DataLoader(sierra_ds, batch_size=64, shuffle=True, num_workers=2)
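The imports at the top of this example suggest the two tokenizers feed a BERT-based encoder-decoder. A minimal construction sketch consistent with those imports is shown below; in practice the decoder configuration would have to match the decoder_tokenizer vocabulary, so the pretrained names and token ids here are assumptions:

encoder = BertGenerationEncoder.from_pretrained("bert-base-uncased",
                                                bos_token_id=101,
                                                eos_token_id=102)
decoder = BertGenerationDecoder.from_pretrained("bert-base-uncased",
                                                add_cross_attention=True,
                                                is_decoder=True,
                                                bos_token_id=101,
                                                eos_token_id=102)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)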
Example #28
        ],
    )
    # save tokenizer
    tok_path = os.path.join(output_path, "tokenizer")
    tok_path_file = os.path.join(tok_path, "vocab.json")
    os.makedirs(tok_path, exist_ok=True)
    # bpe_tokenizer.save_model(tok_path)
    bpe_tokenizer.save(tok_path_file, True)

    # load tokenizer with Roberta configuration
    bpe_tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tok_path_file,
        max_length=max_len,
        lowercase=True,
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
        bos_token="<s>",
        eos_token="</s>",
    )
    # bpe_tokenizer = FunnelTokenizerFast(
    #     vocab_file=tok_path,
    #     max_length=max_len,
    #     lowercase=True,
    #     sep_token="<sep>",
    #     pad_token="<pad>",
    #     cls_token="<cls>",
    #     mask_token="<mask>",
    #     bos_token="<s>",
    #     eos_token="</s>",