Example #1
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerFast


class ChatDataset(Dataset):
    def __init__(self, filepath, tok_vocab, max_seq_len=128) -> None:
        self.filepath = filepath
        self.data = pd.read_csv(self.filepath)
        self.bos_token = '<s>'
        self.eos_token = '</s>'
        self.max_seq_len = max_seq_len
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tok_vocab,
                                                 bos_token=self.bos_token,
                                                 eos_token=self.eos_token,
                                                 unk_token='<unk>',
                                                 pad_token='<pad>',
                                                 mask_token='<mask>')

    def __len__(self):
        return len(self.data)

    def make_input_id_mask(self, tokens, index):
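        # pad token ids and the attention mask up to max_seq_len, or truncate and re-append the </s> id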
        input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        if len(input_id) < self.max_seq_len:
            while len(input_id) < self.max_seq_len:
                input_id += [self.tokenizer.pad_token_id]
                attention_mask += [0]
        else:
            # logging.warning(f'exceed max_seq_len for given article : {index}')
            input_id = input_id[:self.max_seq_len - 1] + [self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return input_id, attention_mask

    def __getitem__(self, index):
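        # encoder inputs come from the question, decoder inputs from the answer;
        # labels are the answer token ids with <s> dropped, padded with -100 for loss masking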
        record = self.data.iloc[index]
        q, a = record['Q'], record['A']
        q_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(q) + [self.eos_token]
        a_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(a) + [self.eos_token]
        encoder_input_id, encoder_attention_mask = self.make_input_id_mask(
            q_tokens, index)
        decoder_input_id, decoder_attention_mask = self.make_input_id_mask(
            a_tokens, index)
        labels = self.tokenizer.convert_tokens_to_ids(
            a_tokens[1:(self.max_seq_len + 1)])
        if len(labels) < self.max_seq_len:
            while len(labels) < self.max_seq_len:
                # for cross entropy loss masking
                labels += [-100]
        return {
            'input_ids': np.array(encoder_input_id, dtype=np.int_),
            'attention_mask': np.array(encoder_attention_mask, dtype=np.float_),
            'decoder_input_ids': np.array(decoder_input_id, dtype=np.int_),
            'decoder_attention_mask': np.array(decoder_attention_mask, dtype=np.float_),
            'labels': np.array(labels, dtype=np.int_)
        }
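
A minimal usage sketch for the dataset above, assuming a CSV with 'Q' and 'A' columns and a tokenizers JSON file (both paths are hypothetical, not part of the original example):

from torch.utils.data import DataLoader

# hypothetical paths; the CSV must contain 'Q' (question) and 'A' (answer) columns
dataset = ChatDataset('chat_data.csv', tok_vocab='tokenizer.json', max_seq_len=128)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

batch = next(iter(loader))           # default collate converts the numpy arrays to tensors
print(batch['input_ids'].shape)      # torch.Size([8, 128])
print(batch['labels'].shape)         # torch.Size([8, 128])
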
Example #2
# tok.save("THE_TEST.tokenizer.json", pretty=True)
# print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens)
#
# tok = Tokenizer.from_file("THE_TEST.tokenizer.json")
# # with open("THE_TEST.tokenizer.json", "r") as f:
# #     t = f.read()
# #     tok = Tokenizer.from_str(t)
# print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens)

from tokenizers import Tokenizer
from tokenizers.implementations import BaseTokenizer
from transformers import PreTrainedTokenizerFast, LineByLineTextDataset

# tokenizer = Tokenizer(
#     BPE("../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt")
# )
tokenizer = Tokenizer.from_file("../../data/roberta-tok.tokenizer")
print(tokenizer.encode("Hello there!").tokens)

tok_transformers = PreTrainedTokenizerFast(BaseTokenizer(tokenizer))
print(tok_transformers.tokenize("Hello there!"))

dataset = LineByLineTextDataset(tokenizer=tok_transformers,
                                file_path="../../data/botchan.txt",
                                block_size=12)

# tokenizer = ByteLevelBPETokenizer.from_files(
#     "../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt"
# )
# print(tokenizer.encode("Hello there!").tokens)
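
The positional BaseTokenizer wrapper above matches older transformers releases; on recent versions the equivalent wrapping goes through the tokenizer_object keyword (same assumed file path as in the example), and LineByLineTextDataset is deprecated in favour of the datasets library:

from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

tokenizer = Tokenizer.from_file("../../data/roberta-tok.tokenizer")
tok_transformers = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
print(tok_transformers.tokenize("Hello there!"))
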
Example #3
def convert_examples_to_features(
        model_meta: ModelMeta,
        input_examples: List[InputExample],
        class_list: List[str],
        tokenizer: PreTrainedTokenizerFast,
        use_iob2_format: bool = False) -> List[InputFeatures]:
    max_allowed_tokens = (model_meta.max_seq_length -
                          tokenizer.num_special_tokens_to_add() +
                          int(model_meta.sep_token_extra))
    class_map = {label: i for i, label in enumerate(class_list)}
    features = []
    for ex_index, example in tqdm(enumerate(input_examples),
                                  total=len(input_examples),
                                  desc='generating features'):
        # loop over sentences in an example
        context_subsets = []
        current_subset = []
        last_count = 0
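        # greedily pack consecutive sentences into context windows that fit within max_allowed_tokens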
        for sentence_meta in example.sentence_meta_list:
            tokens = []
            label_ids = []
            for word, label in zip(sentence_meta.words, sentence_meta.labels):
                word_tokens = tokenizer.tokenize(word)
                if len(word_tokens) > 0:
                    tokens.extend(word_tokens)
                    # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                    temp_pad_id = model_meta.pad_token_label_id
                    label_id = class_map[label]
                    if label != 'O':
                        if use_iob2_format:
                            temp_pad_id = class_map['I-' + label[2:]]
                        else:
                            temp_pad_id = class_map[label]
                    label_ids.extend([label_id] + [temp_pad_id] *
                                     (len(word_tokens) - 1))
            assert len(tokens) == len(label_ids)
            num_tokens = len(tokens)
            sent_feature = SentenceMeta(tokens, label_ids)
            if last_count + num_tokens >= max_allowed_tokens:
                if len(current_subset) == 0:
                    # a single sentence on its own exceeds the model's max_seq_len
                    context_subsets.append([sent_feature])
                else:
                    context_subsets.append(current_subset)
                    current_subset = [sent_feature]
                    last_count = num_tokens
            else:
                last_count += num_tokens
                current_subset.append(sent_feature)
        if len(current_subset) > 0:
            context_subsets.append(current_subset)
        # finally, create concatenated features from the collected sentence features
        for context_subset in context_subsets:
            feature_tokens = []
            feature_labels = []
            for sent_feature in context_subset:
                feature_tokens.extend(sent_feature.words)
                feature_labels.extend(sent_feature.labels)
            input_features = convert_tokens_to_features(
                tokenizer=tokenizer,
                model_meta=model_meta,
                feature_tokens=feature_tokens,
                feature_labels=feature_labels)
            features.append(input_features)
    return features
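
The heart of the function is the first-subword labelling rule: only the first subword of a word keeps the real label id, continuation subwords get either the matching I- tag (IOB2 mode) or a repeat of the label, and continuations of 'O' words get the ignore id. A standalone sketch of just that rule, using hypothetical names (the bert-base-cased checkpoint and toy class list are assumptions, not part of the original project):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
class_list = ["O", "B-PER", "I-PER"]
class_map = {label: i for i, label in enumerate(class_list)}
pad_token_label_id = -100  # ignored by PyTorch cross-entropy loss
use_iob2_format = True

words = ["Johanson", "visited", "Paris"]
labels = ["B-PER", "O", "O"]

tokens, label_ids = [], []
for word, label in zip(words, labels):
    word_tokens = tokenizer.tokenize(word)
    if not word_tokens:
        continue
    temp_pad_id = pad_token_label_id
    if label != "O":
        # continuation subwords get the matching I- tag in IOB2 mode, else the same label
        temp_pad_id = class_map["I-" + label[2:]] if use_iob2_format else class_map[label]
    tokens.extend(word_tokens)
    label_ids.extend([class_map[label]] + [temp_pad_id] * (len(word_tokens) - 1))

print(list(zip(tokens, label_ids)))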