import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerFast


class ChatDataset(Dataset):
    def __init__(self, filepath, tok_vocab, max_seq_len=128) -> None:
        self.filepath = filepath
        self.data = pd.read_csv(self.filepath)
        self.bos_token = '<s>'
        self.eos_token = '</s>'
        self.max_seq_len = max_seq_len
        self.tokenizer = PreTrainedTokenizerFast(
            tokenizer_file=tok_vocab,
            bos_token=self.bos_token,
            eos_token=self.eos_token,
            unk_token='<unk>',
            pad_token='<pad>',
            mask_token='<mask>')

    def __len__(self):
        return len(self.data)

    def make_input_id_mask(self, tokens, index):
        input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        if len(input_id) < self.max_seq_len:
            # Pad short sequences up to max_seq_len.
            while len(input_id) < self.max_seq_len:
                input_id += [self.tokenizer.pad_token_id]
                attention_mask += [0]
        else:
            # Truncate long sequences, keeping EOS as the final token.
            # logging.warning(f'exceed max_seq_len for given article : {index}')
            input_id = input_id[:self.max_seq_len - 1] + [self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return input_id, attention_mask

    def __getitem__(self, index):
        record = self.data.iloc[index]
        q, a = record['Q'], record['A']
        q_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(q) + [self.eos_token]
        a_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(a) + [self.eos_token]
        encoder_input_id, encoder_attention_mask = self.make_input_id_mask(
            q_tokens, index)
        decoder_input_id, decoder_attention_mask = self.make_input_id_mask(
            a_tokens, index)
        # Labels are the answer tokens shifted left by one (the BOS is dropped).
        labels = self.tokenizer.convert_tokens_to_ids(
            a_tokens[1:(self.max_seq_len + 1)])
        while len(labels) < self.max_seq_len:
            # -100 positions are ignored by the cross-entropy loss
            labels += [-100]
        # Explicit dtypes; np.float_ was removed in NumPy 2.0.
        return {
            'input_ids': np.array(encoder_input_id, dtype=np.int64),
            'attention_mask': np.array(encoder_attention_mask, dtype=np.float64),
            'decoder_input_ids': np.array(decoder_input_id, dtype=np.int64),
            'decoder_attention_mask': np.array(decoder_attention_mask, dtype=np.float64),
            'labels': np.array(labels, dtype=np.int64)
        }
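
# --- Usage sketch (not part of the original source) ---
# A minimal example of feeding ChatDataset into a PyTorch DataLoader. The
# file paths are hypothetical; the CSV is assumed to have the 'Q' and 'A'
# columns that __getitem__ reads above.
if __name__ == '__main__':
    from torch.utils.data import DataLoader

    dataset = ChatDataset('chat_data.csv',    # hypothetical CSV with 'Q'/'A' columns
                          'tokenizer.json',   # hypothetical tokenizers-format vocab file
                          max_seq_len=128)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    batch = next(iter(loader))
    # The default collate stacks each field to shape (batch_size, max_seq_len).
    print(batch['input_ids'].shape, batch['labels'].shape)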
# tok.save("THE_TEST.tokenizer.json", pretty=True)
# print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens)
# # tok = Tokenizer.from_file("THE_TEST.tokenizer.json")
# # with open("THE_TEST.tokenizer.json", "r") as f:
# #     t = f.read()
# # tok = Tokenizer.from_str(t)
# print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens)

from tokenizers import Tokenizer
from transformers import LineByLineTextDataset, PreTrainedTokenizerFast

# tokenizer = Tokenizer(
#     BPE("../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt")
# )
tokenizer = Tokenizer.from_file("../../data/roberta-tok.tokenizer")
print(tokenizer.encode("Hello there!").tokens)

# Wrap the fast tokenizer for use with transformers; recent versions take the
# `tokenizer_object` keyword instead of a BaseTokenizer wrapper.
tok_transformers = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
print(tok_transformers.tokenize("Hello there!"))

dataset = LineByLineTextDataset(tokenizer=tok_transformers,
                                file_path="../../data/botchan.txt",
                                block_size=12)

# tokenizer = ByteLevelBPETokenizer.from_files(
#     "../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt"
# )
# print(tokenizer.encode("Hello there!").tokens)
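
# --- Quick inspection sketch (not part of the original script) ---
# Depending on the transformers version, each dataset item is either a dict
# with an 'input_ids' tensor or a bare tensor of token ids; each line of
# botchan.txt is tokenized and truncated to block_size tokens.
print(len(dataset))
print(dataset[0])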
from typing import List

from tqdm import tqdm
from transformers import PreTrainedTokenizerFast

# ModelMeta, InputExample, InputFeatures, SentenceMeta and
# convert_tokens_to_features are project-local definitions.


def convert_examples_to_features(
        model_meta: ModelMeta,
        input_examples: List[InputExample],
        class_list: List[str],
        tokenizer: PreTrainedTokenizerFast,
        use_iob2_format: bool = False) -> List[InputFeatures]:
    # Token budget once special tokens are reserved; the extra SEP used by
    # RoBERTa-style models consumes one more slot, so it is subtracted too.
    max_allowed_tokens = (model_meta.max_seq_length
                          - tokenizer.num_special_tokens_to_add()
                          - int(model_meta.sep_token_extra))
    class_map = {label: i for i, label in enumerate(class_list)}

    features = []
    for ex_index, example in tqdm(enumerate(input_examples),
                                  total=len(input_examples),
                                  desc='generating features'):
        # Loop over the sentences in an example, packing consecutive
        # sentences into subsets that fit within max_allowed_tokens.
        context_subsets = []
        current_subset = []
        last_count = 0
        for sentence_meta in example.sentence_meta_list:
            tokens = []
            label_ids = []
            for word, label in zip(sentence_meta.words, sentence_meta.labels):
                word_tokens = tokenizer.tokenize(word)
                if len(word_tokens) > 0:
                    tokens.extend(word_tokens)
                    # Use the real label id for the first token of the word,
                    # and padding ids for the remaining sub-tokens.
                    temp_pad_id = model_meta.pad_token_label_id
                    label_id = class_map[label]
                    if label != 'O':
                        if use_iob2_format:
                            temp_pad_id = class_map['I-' + label[2:]]
                        else:
                            temp_pad_id = class_map[label]
                    label_ids.extend([label_id] +
                                     [temp_pad_id] * (len(word_tokens) - 1))
            assert len(tokens) == len(label_ids)

            num_tokens = len(tokens)
            sent_feature = SentenceMeta(tokens, label_ids)
            if last_count + num_tokens >= max_allowed_tokens:
                if len(current_subset) == 0:
                    # A single sentence exceeds the model's max_seq_len on its own.
                    context_subsets.append([sent_feature])
                else:
                    # Close the current subset and start a new one with this sentence.
                    context_subsets.append(current_subset)
                    current_subset = [sent_feature]
                    last_count = num_tokens
            else:
                last_count += num_tokens
                current_subset.append(sent_feature)
        if len(current_subset) > 0:
            context_subsets.append(current_subset)

        # Finally, create concatenated features from the sentence features.
        for context_subset in context_subsets:
            feature_tokens = []
            feature_labels = []
            for sent_feature in context_subset:
                feature_tokens.extend(sent_feature.words)
                feature_labels.extend(sent_feature.labels)
            input_features = convert_tokens_to_features(
                tokenizer=tokenizer,
                model_meta=model_meta,
                feature_tokens=feature_tokens,
                feature_labels=feature_labels)
            features.append(input_features)
    return features
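
# --- Illustration (not from the original source) ---
# How the sub-token label expansion above behaves, sketched with a
# hypothetical label set and tokenizer split. With use_iob2_format=True a
# 'B-LOC' word split into ['Wash', '##ington'] yields [B-LOC, I-LOC];
# without it, the word's label is repeated on every sub-token; 'O' words
# fall back to pad_token_label_id (commonly -100, ignored by the loss).
class_map_demo = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}  # hypothetical class list
word_tokens_demo = ['Wash', '##ington']            # hypothetical tokenizer output
label_demo = 'B-LOC'
pad_token_label_id_demo = -100

first_id = class_map_demo[label_demo]
sub_id = (class_map_demo['I-' + label_demo[2:]]
          if label_demo != 'O' else pad_token_label_id_demo)
print([first_id] + [sub_id] * (len(word_tokens_demo) - 1))  # -> [1, 2]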