def tokenize_character(X_text_list_train, X_text_list_test, max_sent_len=800):
    """
    Encodes every word of each sentence as a sequence of character ids.

    :param X_text_list_train: list of word-tokenized training sentences
    :param X_text_list_test: list of word-tokenized test sentences
    :param max_sent_len: sentences are truncated/padded to this many words
    :return: x_char_encoder, x_char_padded_train, x_char_padded_test, max_word_length
    """
    # truncate or pad every sentence to exactly max_sent_len words
    X_text_list_train = [
        lst[:max_sent_len] + (max_sent_len - len(lst)) * ["<end>"]
        for lst in X_text_list_train
    ]
    X_text_list_test = [
        lst[:max_sent_len] + (max_sent_len - len(lst)) * ["<end>"]
        for lst in X_text_list_test
    ]

    # build the character vocabulary from the training sentences only
    x_char_encoder = CharacterEncoder(
        sample=[" ".join(sent) for sent in X_text_list_train],
        append_eos=False,
    )

    # encode each word of each sentence as a tensor of character ids
    x_char_encoded_train = [
        [x_char_encoder.encode(word) for word in sent] for sent in X_text_list_train
    ]
    x_char_encoded_test = [
        [x_char_encoder.encode(word) for word in sent] for sent in X_text_list_test
    ]

    # longest word (in characters) observed in the training set
    max_word_length = max(
        max(word.shape[0] for word in sent) for sent in x_char_encoded_train
    )

    # zero-pad (and, for the test set, truncate) every word to max_word_length characters
    outer_list = []
    for lst in x_char_encoded_train:
        inner_list = []
        for ten in lst:
            res = torch.zeros(max_word_length, dtype=torch.long)
            res[: ten.shape[0]] = ten[:max_word_length]
            inner_list.append(res)
        outer_list.append(inner_list)
    x_char_padded_train = torch.stack([torch.stack(lst) for lst in outer_list])

    outer_list = []
    for lst in x_char_encoded_test:
        inner_list = []
        for ten in lst:
            res = torch.zeros(max_word_length, dtype=torch.long)
            res[: ten.shape[0]] = ten[:max_word_length]
            inner_list.append(res)
        outer_list.append(inner_list)
    x_char_padded_test = torch.stack([torch.stack(lst) for lst in outer_list])

    return x_char_encoder, x_char_padded_train, x_char_padded_test, max_word_length
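# Minimal usage sketch for tokenize_character above; the toy sentences and the
# shortened max_sent_len are illustrative assumptions. Assumes torch and
# pytorch-nlp's CharacterEncoder are importable, as in the snippet itself.
import torch
from torchnlp.encoders.text import CharacterEncoder

train_sents = [["hello", "world"], ["character", "level", "features"]]
test_sents = [["hello", "features"]]

char_enc, char_train, char_test, max_word_len = tokenize_character(
    train_sents, test_sents, max_sent_len=5
)
print(char_train.shape)  # (2, 5, max_word_len) -> sentences x words x characters
print(char_test.shape)   # (1, 5, max_word_len)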
def tokenize_character(X_text_list_train, x_padded_train, x_padded_test, x_encoder):
    # character vocabulary is built from the raw training sentences
    x_char_encoder = CharacterEncoder(
        sample=X_text_list_train,
        append_eos=False,
    )

    # map every word id back to its token and encode it character by character
    x_char_encoded_train = [
        [x_char_encoder.encode(x_encoder.index_to_token[word.item()]) for word in text]
        for text in x_padded_train
    ]

    # longest word (in characters) observed in the training set
    MAX_WORD_LENGTH = max(
        max(internal.shape[0] for internal in external)
        for external in x_char_encoded_train
    )

    outer_list = []
    for lst in x_char_encoded_train:
        inner_list = []
        for ten in lst:
            res = torch.zeros(MAX_WORD_LENGTH, dtype=torch.long)
            res[: ten.shape[0]] = ten
            inner_list.append(res)
        outer_list.append(inner_list)
    x_char_padded_train = torch.stack([torch.stack(lst) for lst in outer_list])

    x_char_encoded_test = [
        [x_char_encoder.encode(x_encoder.index_to_token[word.item()]) for word in text]
        for text in x_padded_test
    ]

    outer_list = []
    for lst in x_char_encoded_test:
        inner_list = []
        for ten in lst:
            res = torch.zeros(MAX_WORD_LENGTH, dtype=torch.long)
            # truncate test words longer than anything seen during training
            res[: min(ten.shape[0], MAX_WORD_LENGTH)] = ten[:MAX_WORD_LENGTH]
            inner_list.append(res)
        outer_list.append(inner_list)
    x_char_padded_test = torch.stack([torch.stack(lst) for lst in outer_list])

    return x_char_encoder, x_char_padded_train, x_char_padded_test
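# Hypothetical preparation of the inputs this variant expects: x_padded_train /
# x_padded_test are word-id matrices and x_encoder is the word-level encoder whose
# index_to_token list is used to recover the words. WhitespaceEncoder and
# stack_and_pad_tensors come from pytorch-nlp; the toy texts are assumptions.
import torch
from torchnlp.encoders.text import WhitespaceEncoder, stack_and_pad_tensors

train_texts = ["hello world", "character level features"]
test_texts = ["hello features"]

x_encoder = WhitespaceEncoder(train_texts)
x_padded_train, _ = stack_and_pad_tensors([x_encoder.encode(t) for t in train_texts])
x_padded_test, _ = stack_and_pad_tensors([x_encoder.encode(t) for t in test_texts])

x_char_encoder, x_char_padded_train, x_char_padded_test = tokenize_character(
    train_texts, x_padded_train, x_padded_test, x_encoder
)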
class CharTokenizerEncoder(TokenizerEncoder):
    def __init__(self):
        self.vocab_size = None
        self.padding_index = None
        # how many sequences we take to build the vocabulary
        self._tokenizer_max_seq = 3 * 10 ** 5
        # pickled tokenizer file name
        self.tokenizer_name = "char_corrector_tokenizer"

    def train(
        self, pair_dataset, append_eos=True, append_sos=True, min_occurrences=1000
    ):
        """Train the tokenizer."""
        # create a generator over a slice of the data (3*10^5 sentence pairs)
        dataset_example_gen = (
            ex["correct"] + " " + ex["incorrect"]
            for ex in itr.islice(pair_dataset, self._tokenizer_max_seq)
        )
        self.tokenizer = CharacterEncoder(
            dataset_example_gen,
            append_eos=append_eos,
            append_sos=append_sos,
            min_occurrences=min_occurrences,
        )
        # after training, expose the vocabulary parameters
        self.vocab_size = self.tokenizer.vocab_size
        self.padding_index = self.tokenizer.padding_index  # = 0

    def encode(self, text):
        pass

    def encode_batch(self, samples):
        """
        Encode a list of strings.

        Args:
            samples: list of strings
        """
        # batch_encode is compatible with the pyTorch-NLP encoders
        tokens, lengths = self.tokenizer.batch_encode(samples)
        return tokens, lengths

    def decode(self, text):
        pass
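# Hypothetical smoke test for CharTokenizerEncoder; the tiny pair_dataset and
# min_occurrences=1 are assumptions (the real training slice is 3*10^5 pairs with
# min_occurrences=1000). Assumes the class's module imports itertools as itr and
# that the TokenizerEncoder base class can be instantiated like this.
pair_dataset = [
    {"correct": "abc", "incorrect": "acb"},
    {"correct": "hello", "incorrect": "helol"},
]

char_tok = CharTokenizerEncoder()
char_tok.train(pair_dataset, min_occurrences=1)  # low threshold for the toy data
tokens, lengths = char_tok.encode_batch(["abc", "acb"])
print(char_tok.vocab_size, char_tok.padding_index)  # padding index is 0
print(tokens.shape, lengths)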
def __init__(
    self,
    markdown_lines: List[str],
    tokenizer,
    seq_len=128,
):
    self.intent_dict = {}
    self.entity_dict = {}
    self.entity_dict["O"] = 0  # using BIO tagging
    self.dataset = []
    self.seq_len = seq_len

    intent_value_list = []
    entity_type_list = []

    current_intent_focus = ""
    text_list = []

    # first pass: collect intent names, entity types and raw utterance texts
    for line in tqdm(
        markdown_lines,
        desc="Organizing Intent & Entity dictionary in NLU markdown file ...",
    ):
        if len(line.strip()) < 2:
            current_intent_focus = ""
            continue

        if "## " in line:
            if "intent:" in line:
                intent_value_list.append(line.split(":")[1].strip())
                current_intent_focus = line.split(":")[1].strip()
            else:
                current_intent_focus = ""
        else:
            if current_intent_focus != "":
                text = line[2:].strip().lower()

                for type_str in re.finditer(r"\([a-zA-Z_1-2]+\)", text):
                    entity_type = (
                        text[type_str.start() + 1 : type_str.end() - 1]
                        .replace("(", "")
                        .replace(")", "")
                    )
                    entity_type_list.append(entity_type)

                text = re.sub(r"\([a-zA-Z_1-2]+\)", "", text)  # remove (...) str
                text = text.replace("[", "").replace("]", "")  # remove '[',']' special chars

                if len(text) > 0:
                    text_list.append(text.strip())

    # dataset tokenizer setting
    if "ElectraTokenizer" in str(type(tokenizer)):
        self.tokenizer = tokenizer
        self.pad_token_id = 0
        self.unk_token_id = 1
        self.eos_token_id = 3  # [SEP] token
        self.bos_token_id = 2  # [CLS] token
    else:
        if tokenizer == "char":
            self.tokenizer = CharacterEncoder(text_list)
            # torchnlp base special token indices
            self.pad_token_id = 0
            self.unk_token_id = 1
            self.eos_token_id = 2
            self.bos_token_id = 3
        elif tokenizer == "space":
            self.tokenizer = WhitespaceEncoder(text_list)
            # torchnlp base special token indices
            self.pad_token_id = 0
            self.unk_token_id = 1
            self.eos_token_id = 2
            self.bos_token_id = 3
        elif tokenizer == "kobert":
            self.tokenizer = kobert_tokenizer()
            self.pad_token_id = 1
            self.unk_token_id = 0
            self.eos_token_id = 3  # [SEP] token
            self.bos_token_id = 2  # [CLS] token
        else:
            raise ValueError("not supported tokenizer type")

    intent_value_list = sorted(intent_value_list)
    for intent_value in intent_value_list:
        if intent_value not in self.intent_dict.keys():
            self.intent_dict[intent_value] = len(self.intent_dict)

    entity_type_list = sorted(entity_type_list)
    for entity_type in entity_type_list:
        if entity_type + "_B" not in self.entity_dict.keys():
            self.entity_dict[str(entity_type) + "_B"] = len(self.entity_dict)
        if entity_type + "_I" not in self.entity_dict.keys():
            self.entity_dict[str(entity_type) + "_I"] = len(self.entity_dict)

    current_intent_focus = ""

    # second pass: build dataset entries with intent index and BIO entity labels
    for line in tqdm(
        markdown_lines,
        desc="Extracting Intent & Entity in NLU markdown files...",
    ):
        if len(line.strip()) < 2:
            current_intent_focus = ""
            continue

        if "## " in line:
            if "intent:" in line:
                current_intent_focus = line.split(":")[1].strip()
            else:
                current_intent_focus = ""
        else:
            if current_intent_focus != "":  # intent & entity sentence case
                text = line[2:].strip().lower()

                entity_value_list = []
                for value in re.finditer(r"\[(.*?)\]", text):
                    entity_value_list.append(
                        text[value.start() + 1 : value.end() - 1]
                        .replace("[", "")
                        .replace("]", "")
                    )

                entity_type_list = []
                for type_str in re.finditer(r"\([a-zA-Z_1-2]+\)", text):
                    entity_type = (
                        text[type_str.start() + 1 : type_str.end() - 1]
                        .replace("(", "")
                        .replace(")", "")
                    )
                    entity_type_list.append(entity_type)

                text = re.sub(r"\([a-zA-Z_1-2]+\)", "", text)  # remove (...) str
                text = text.replace("[", "").replace("]", "")  # remove '[',']' special chars

                if len(text) > 0:
                    each_data_dict = {}
                    each_data_dict["text"] = text.strip()
                    each_data_dict["intent"] = current_intent_focus
                    each_data_dict["intent_idx"] = self.intent_dict[current_intent_focus]
                    each_data_dict["entities"] = []

                    for value, type_str in zip(entity_value_list, entity_type_list):
                        for entity in re.finditer(value, text):
                            entity_tokens = self.tokenize(value)

                            for i, entity_token in enumerate(entity_tokens):
                                if i == 0:
                                    BIO_type_str = type_str + "_B"
                                else:
                                    BIO_type_str = type_str + "_I"

                                each_data_dict["entities"].append(
                                    {
                                        "start": text.find(
                                            entity_token, entity.start(), entity.end()
                                        ),
                                        "end": text.find(
                                            entity_token, entity.start(), entity.end()
                                        )
                                        + len(entity_token),
                                        "entity": type_str,
                                        "value": entity_token,
                                        "entity_idx": self.entity_dict[BIO_type_str],
                                    }
                                )

                    self.dataset.append(each_data_dict)

    print(f"Intents: {self.intent_dict}")
    print(f"Entities: {self.entity_dict}")
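# Illustrative (made-up) nlu.md lines and the dictionaries the two passes above
# would build from them; the second pass additionally relies on the class's
# tokenize() method (not shown here) to split entity values into BIO-tagged tokens.
markdown_lines = [
    "## intent:greet",
    "- hello there",
    "- my name is [tom](person_name)",
]
# After __init__(markdown_lines, tokenizer="char"):
#   self.intent_dict -> {"greet": 0}
#   self.entity_dict -> {"O": 0, "person_name_B": 1, "person_name_I": 2}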
class ABCSec2SeqDataModule(pl.LightningDataModule):
    '''Simple pytorch-lightning data module which generates artificial data:
    a simple translation task from a permuted alphabet to the normal alphabet,
    returned as a list of dicts
        [
            {"correct": "...", "incorrect": "..."},
            {"correct": "...", "incorrect": "..."},
        ]
    '''

    def __init__(
        self, batch_size=4, N_random_samples=1000, N_valid_size=200, num_workers=1
    ):
        super().__init__()
        self.batch_size = batch_size
        self.vocab_size = -1
        self.padding_index = -1
        assert N_random_samples > N_valid_size
        self.N_random_samples = N_random_samples
        self.N_valid_size = N_valid_size
        self.num_workers = num_workers

    def prepare_data(self):
        # done once at the very beginning of training,
        # before any distributed training starts
        # https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html#prepare-data
        pass

    def _setup_task1(self, N_random_samples):
        """Generate a simple translation task from a permuted alphabet to the
        normal alphabet. Fixed length, only permuted characters.
        """
        # If you want to play with it:
        # - you can try a shorter vocab for faster training, e.g. 'abcdefghij' (only 10 chars)
        init_string = "abcdefghijklmnopqrstuwxyz"
        dataset = []
        for i in range(N_random_samples):
            l = list(init_string)
            random.shuffle(l)
            dataset.append(
                {"correct": f"{i}-{init_string}", "incorrect": f'{i}-{"".join(l)}'}
            )
        return dataset

    def _setup_task2(self, N_random_samples):
        """Generate a simple translation task: random length, random letters."""
        init_string = "abcdefghijklmnoprstuwxyz"
        str_len = len(init_string)
        dataset = []
        for i in range(N_random_samples):
            # random sequence length in [3, str_len]
            rnd_len = random.randint(3, str_len)
            # random characters chosen with replacement
            t = random.choices(init_string, k=rnd_len)
            t_sort = sorted(t)
            dataset.append({"correct": "".join(t_sort), "incorrect": "".join(t)})
        return dataset

    def _bucket_train_sort_func(self, i):
        """Sort key for BucketBatchSampler; defined as a method (not a lambda)
        because in distributed mode a lambda can't be pickled.
        """
        return -len(self.train_ds[i]["incorrect"])

    def _bucket_val_sort_func(self, i):
        """Sort key for BucketBatchSampler; defined as a method (not a lambda)
        because in distributed mode a lambda can't be pickled.
        """
        return -len(self.valid_ds[i]["incorrect"])

    def _sampler_sort_func(self, x):
        """Sort key for SortedSampler: sort in reverse order by incorrect-sequence
        length, with a random offset added to perturb the ordering in the
        distributed scenario, so each epoch gets a slightly different set of
        sentences. Sequences aren't sorted exactly, but it does not matter much.
        """
        return -len(x["incorrect"]) + random.randint(0, 4)

    def setup(self, stage):
        # https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html#setup
        N_valid_size = self.N_valid_size
        # dataset = self._setup_task1(self.N_random_samples)
        dataset = self._setup_task2(self.N_random_samples)  # list of dicts
        self.train_ds = dataset[0:-N_valid_size]
        self.valid_ds = dataset[-N_valid_size:]

        # load dataset, build vocab and numericalize
        # todo: change it, bad design! only for prototyping and learning
        dataset_example_gen = (ex["correct"] + " " + ex["incorrect"] for ex in dataset)
        self.tokenizer = CharacterEncoder(
            dataset_example_gen, append_eos=True, append_sos=True
        )
        with open("./abc_data_character_encoder.p", "wb") as f:
            pickle.dump(self.tokenizer, f)

        self.train_sampler = SortedSampler(
            self.train_ds, sort_key=self._sampler_sort_func
        )
        self.val_sampler = SortedSampler(
            self.valid_ds, sort_key=self._sampler_sort_func
        )

        # # samplers from torchnlp, did not work with DistributedDataParallel
        # self.train_sampler = BucketBatchSampler(
        #     sampler=SequentialSampler(self.train_ds),
        #     # bucket_size_multiplier=1000,
        #     batch_size=self.batch_size,
        #     drop_last=True,
        #     sort_key=self._bucket_train_sort_func,
        #     # sort_key=lambda i: -len(self.train_ds[i]["incorrect"]),
        # )
        # self.val_sampler = BucketBatchSampler(
        #     sampler=SequentialSampler(self.valid_ds),
        #     batch_size=self.batch_size,
        #     drop_last=True,
        #     sort_key=self._bucket_val_sort_func,
        #     # sort_key=lambda i: -len(self.valid_ds[i]["incorrect"]),
        # )

        # samplers from catalyst: DynamicLenBatchSampler, DistributedSamplerWrapper
        # https://github.com/catalyst-team/catalyst/blob/master/catalyst/data/sampler.py
        # train_sampler = RandomSampler(self.train_ds)
        # train_sampler = DynamicLenBatchSampler(train_sampler, self.batch_size, drop_last=True)
        # self.train_sampler = train_sampler
        # self.train_sampler = DistributedSamplerWrapper(train_sampler)
        # valid_sampler = RandomSampler(self.valid_ds)
        # valid_sampler = DynamicLenBatchSampler(valid_sampler, self.batch_size, drop_last=True)
        # self.val_sampler = valid_sampler
        # self.valid_sampler = DistributedSamplerWrapper(valid_sampler)

        ### todo: to be replaced
        self.vocab_size = self.tokenizer.vocab_size
        self.padding_index = self.tokenizer.padding_index  # = 0

    def __collate_fn(self, sample: list, prepare_target=True):
        """
        torch.utils.data.DataLoader collate_fn: changes the layout of the data
        from a list of dicts to a dict of tensors, e.g.

            [
                {text: 'a', label: '0'},
                {text: 'b', label: '1'},
                {text: 'c', label: '2'},
            ]
        to
            {text: ['a', 'b', 'c'], label: [0, 1, 2]}

        and encodes tokens to their vocabulary ids, including 0-padding.
        """
        # sort in reverse order, needed for packed sequences
        sorted_sample = sorted(sample, key=lambda x: -len(x["incorrect"]))

        collate_sample = collate_tensors(
            sorted_sample, stack_tensors=stack_and_pad_tensors
        )

        ### todo: to be replaced
        src_tokens, src_lengths = self.tokenizer.batch_encode(
            collate_sample["incorrect"]
        )
        # can't change the layout here, because with a distributed dataloader
        # (multi-gpu) the first dim gets divided by the number of gpus
        # change from [batch, seq_len] -> to [seq_len, batch]
        # src_tokens = src_tokens.transpose(0, 1)
        inputs = {"src_ids": src_tokens, "src_lengths": src_lengths}

        ### todo: to be replaced
        ### encode tokens based on vocab
        trg_tokens, trg_lengths = self.tokenizer.batch_encode(collate_sample["correct"])
        # change from [batch, seq_len] -> to [seq_len, batch]
        # trg_tokens = trg_tokens.transpose(0, 1)
        targets = {"trg_ids": trg_tokens, "trg_lengths": trg_lengths}

        return inputs, targets

    def train_dataloader(self):
        # dataloader with a custom sampler; for distributed training the trainer
        # should have replace_sampler_ddp=False
        self._train_dl = DataLoader(
            dataset=self.train_ds,
            num_workers=self.num_workers,
            shuffle=False,
            sampler=self.train_sampler,
            collate_fn=self.__collate_fn,
            batch_size=self.batch_size,
        )
        return self._train_dl

    def val_dataloader(self):
        # with the normal sampler
        self._val_dl = DataLoader(
            dataset=self.valid_ds,
            collate_fn=self.__collate_fn,
            num_workers=self.num_workers,
            sampler=self.val_sampler,
            batch_size=self.batch_size,
            shuffle=False,
        )
        return self._val_dl
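# Minimal sketch of wiring the data module into a Lightning Trainer. Seq2SeqModel
# is a placeholder name for whatever LightningModule consumes the
# src_ids/src_lengths and trg_ids/trg_lengths batches produced by __collate_fn.
import pytorch_lightning as pl

dm = ABCSec2SeqDataModule(batch_size=8, N_random_samples=1000, N_valid_size=200)
dm.setup(stage="fit")
print(dm.vocab_size, dm.padding_index)  # vocabulary built by the CharacterEncoder

# replace_sampler_ddp=False keeps the SortedSampler configured in setup()
trainer = pl.Trainer(max_epochs=1, replace_sampler_ddp=False)
# trainer.fit(Seq2SeqModel(vocab_size=dm.vocab_size, padding_index=dm.padding_index), dm)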
class RasaIntentEntityDataset(torch.utils.data.Dataset):
    """
    Custom Dataset class based on the lines of a RASA NLU markdown file.

    Dataset example in nlu.md:

    ## intent:intent_데이터_자동_선물하기_멀티턴                  <- intent name
    - T끼리 데이터 주기적으로 보내기                              <- utterance without entity
    - 인터넷 데이터 [달마다](Every_Month)마다 보내줄 수 있어?      <- utterance with entity
    """

    def __init__(
        self,
        markdown_lines: List[str],
        seq_len=128,
        pad_token_id=0,
        unk_token_id=1,
        eos_token_id=2,
        bos_token_id=3,
        tokenizer=None,
    ):
        self.intent_dict = {}
        self.entity_dict = {}
        # based on XO tagging (each entity type is assigned to a single class)
        self.entity_dict["O"] = 0
        self.dataset = []
        self.seq_len = seq_len

        # following the torchnlp encoder preset
        self.pad_token_id = pad_token_id
        self.unk_token_id = unk_token_id
        self.eos_token_id = eos_token_id
        self.bos_token_id = bos_token_id

        current_intent_focus = ""

        for line in tqdm(
            markdown_lines,
            desc="Extracting Intent & Entity in NLU markdown files...",
        ):
            if len(line.strip()) < 2:
                continue

            if "## " in line:
                if "intent:" in line:
                    current_intent_focus = line.split(":")[1].strip()
                    if current_intent_focus not in self.intent_dict.keys():
                        self.intent_dict[current_intent_focus] = len(
                            self.intent_dict.keys()
                        )
                else:
                    current_intent_focus = ""
            else:
                if current_intent_focus != "":  # intent & entity sentence case
                    text = line[2:]

                    entity_value_list = []
                    for value in re.finditer(r"\[[^)]*\]", text):
                        entity_value_list.append(
                            text[value.start() + 1 : value.end() - 1]
                            .replace("[", "")
                            .replace("]", "")
                        )

                    entity_type_list = []
                    for type_str in re.finditer(r"\([^)]*\)", text):
                        entity_type = (
                            text[type_str.start() + 1 : type_str.end() - 1]
                            .replace("(", "")
                            .replace(")", "")
                        )
                        entity_type_list.append(entity_type)
                        if entity_type not in self.entity_dict.keys():
                            self.entity_dict[entity_type] = len(self.entity_dict.keys())

                    text = re.sub(r"\([^)]*\)", "", text)
                    text = text.replace("[", "").replace("]", "")

                    each_data_dict = {}
                    each_data_dict["text"] = text.strip()
                    each_data_dict["intent"] = current_intent_focus
                    each_data_dict["intent_idx"] = self.intent_dict[current_intent_focus]
                    each_data_dict["entities"] = []

                    for value, type_str in zip(entity_value_list, entity_type_list):
                        try:
                            for entity in re.finditer(value, text):
                                each_data_dict["entities"].append(
                                    {
                                        "start": entity.start(),
                                        "end": entity.end(),
                                        "entity": type_str,
                                        "entity_idx": self.entity_dict[type_str],
                                    }
                                )
                        except Exception as ex:
                            print(f"error occurred: {ex}")
                            print(f"value: {value}")
                            print(f"text: {text}")

                    self.dataset.append(each_data_dict)

        # encoder (tokenizer) definition
        self.encoder = CharacterEncoder([data["text"] for data in self.dataset])
        self.tokenizer = tokenizer

    def tokenize(self, text: str, padding: bool = True, return_tensor: bool = True):
        # bos_token=3, eos_token=2, unk_token=1, pad_token=0
        if self.tokenizer is not None:
            tokens = self.tokenizer.encode(text)
            if type(tokens) == list:
                tokens = torch.tensor(tokens)
        else:
            tokens = self.encoder.encode(text)

        bos_tensor = torch.tensor([self.bos_token_id])
        eos_tensor = torch.tensor([self.eos_token_id])
        tokens = torch.cat((bos_tensor, tokens, eos_tensor), 0)

        if padding:
            if len(tokens) > self.seq_len:
                tokens = tokens[: self.seq_len]
            else:
                pad_tensor = torch.tensor(
                    [self.pad_token_id] * (self.seq_len - len(tokens))
                )
                tokens = torch.cat((tokens, pad_tensor), 0)

        if return_tensor:
            return tokens
        else:
            return tokens.numpy()

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        tokens = self.tokenize(self.dataset[idx]["text"])
        intent_idx = torch.tensor([self.dataset[idx]["intent_idx"]])

        entity_idx = np.zeros(self.seq_len)
        for entity_info in self.dataset[idx]["entities"]:
            for i in range(entity_info["start"], entity_info["end"] + 1):
                entity_idx[i] = entity_info["entity_idx"]
        entity_idx = torch.from_numpy(entity_idx)

        return tokens, intent_idx, entity_idx

    def get_intent_idx(self):
        return self.intent_dict

    def get_entity_idx(self):
        return self.entity_dict

    def get_vocab_size(self):
        if self.tokenizer is not None:
            return len(self.tokenizer)
        return len(self.encoder.vocab)

    def get_seq_len(self):
        return self.seq_len
def test_character_encoder_min_occurrences(sample):
    encoder = CharacterEncoder(sample, min_occurrences=10)
    input_ = 'English-language pangram'
    output = encoder.encode(input_)
    assert encoder.decode(output) == ''.join([DEFAULT_UNKNOWN_TOKEN] * len(input_))
def encoder(sample):
    return CharacterEncoder(sample)
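# The `sample` fixture used by the tests above is not shown here; a plausible
# stand-in, modeled on the pytorch-nlp test suite (the exact sentence is an
# assumption):
import pytest


@pytest.fixture
def sample():
    return ["The quick brown fox jumps over the lazy dog"]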