def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets and a start_of_word mask
    tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
    # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
    for seq_name in tokenized.keys():
        tokenized[seq_name], _, _ = truncate_sequences(
            seq_a=tokenized[seq_name],
            seq_b=None,
            tokenizer=self.tokenizer,
            max_seq_len=self.max_seq_len,
        )
    return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

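# --- Usage sketch (not part of the processor above) ---
# A minimal, standalone version of the tokenize-then-truncate step, assuming FARM's helpers
# live in farm.modeling.tokenization; the model name and text are placeholder examples.
# tokenize_with_metadata returns parallel "tokens", "offsets" and "start_of_word" lists,
# which is why every key gets truncated the same way.
from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences

tokenizer = Tokenizer.load("bert-base-cased", lower_case=False)
max_seq_len = 16

tokenized = tokenize_with_metadata("FARM keeps offsets and a start_of_word mask.", tokenizer)
for seq_name in tokenized.keys():
    tokenized[seq_name], _, _ = truncate_sequences(
        seq_a=tokenized[seq_name], seq_b=None, tokenizer=tokenizer, max_seq_len=max_seq_len
    )

# room is reserved for the special tokens added later during featurization
assert len(tokenized["tokens"]) <= max_seq_len - tokenizer.num_added_tokens(pair=False)
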
def _dict_to_samples(self, dictionary, all_dicts=None):
    assert len(all_dicts) > 1, "Need at least 2 documents to sample random sentences from"
    doc = dictionary["doc"]
    samples = []

    # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
    for idx in range(len(doc) - 1):
        tokenized = {}
        if self.next_sent_pred:
            text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": text_b,
                "nextsentence_label": is_next_label,
            }
            # tokenize
            tokenized["text_a"] = tokenize_with_metadata(text_a, self.tokenizer)
            tokenized["text_b"] = tokenize_with_metadata(text_b, self.tokenizer)
            # truncate to max_seq_len
            for seq_name in ["tokens", "offsets", "start_of_word"]:
                tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
                    seq_a=tokenized["text_a"][seq_name],
                    seq_b=tokenized["text_b"][seq_name],
                    tokenizer=self.tokenizer,
                    max_seq_len=self.max_seq_len,
                )
            samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
        # if we don't do next sentence prediction, we should feed in a single sentence
        else:
            text_a = doc[idx]
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": None,
                "nextsentence_label": None,
            }
            # tokenize
            tokenized["text_a"] = tokenize_with_metadata(text_a, self.tokenizer)
            # truncate to max_seq_len
            for seq_name in ["tokens", "offsets", "start_of_word"]:
                tokenized["text_a"][seq_name], _, _ = truncate_sequences(
                    seq_a=tokenized["text_a"][seq_name],
                    seq_b=None,
                    tokenizer=self.tokenizer,
                    max_seq_len=self.max_seq_len,
                )
            samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
    return samples

def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets
    tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
    # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
    for seq_name in tokenized.keys():
        tokenized[seq_name], _, _ = truncate_sequences(
            seq_a=tokenized[seq_name],
            seq_b=None,
            tokenizer=self.tokenizer,
            max_seq_len=self.max_seq_len,
        )
    # Samples don't have labels during Inference mode
    if "label" in dictionary:
        label = float(dictionary["label"])
        scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1]
        dictionary["label"] = scaled_label
    return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

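# --- Worked example of the label scaling above (hypothetical numbers) ---
# Assumption: for FARM's regression task, label_list holds [mean, scale] computed from the
# training set, so raw labels are standardized before training and rescaled after prediction.
mean, scale = 3.0, 2.0                         # stands in for label_list[0] and label_list[1]
raw_label = 7.0
scaled_label = (raw_label - mean) / scale      # -> 2.0, the value the model regresses on
assert scaled_label * scale + mean == raw_label  # inverse transform applied to predictions
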
def test_truncate_sequences(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]
    tokenizers = []
    for lang_name in lang_names:
        t = Tokenizer.load(lang_name, lower_case=False)
        tokenizers.append(t)

    # artificial sequences (could be tokens, offsets, or anything else)
    seq_a = list(range(10))
    seq_b = list(range(15))
    max_seq_len = 20

    for tokenizer in tokenizers:
        for strategy in ["longest_first", "only_first", "only_second"]:
            trunc_a, trunc_b, overflow = truncate_sequences(
                seq_a=seq_a,
                seq_b=seq_b,
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                truncation_strategy=strategy,
            )
            assert len(trunc_a) + len(trunc_b) + tokenizer.num_added_tokens(pair=True) == max_seq_len

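# --- Quick walk-through of the invariant asserted in the test ---
# A BERT-style text pair becomes [CLS] a [SEP] b [SEP], i.e. 3 added special tokens, so
# max_seq_len=20 leaves 17 positions to split between seq_a and seq_b; "longest_first"
# trims the longer sequence first. Treat the concrete split as illustrative, since the
# exact per-sequence lengths can vary slightly across tokenizer versions.
from farm.modeling.tokenization import Tokenizer, truncate_sequences

tokenizer = Tokenizer.load("bert-base-cased", lower_case=False)
trunc_a, trunc_b, _ = truncate_sequences(
    seq_a=list(range(10)), seq_b=list(range(15)), tokenizer=tokenizer,
    max_seq_len=20, truncation_strategy="longest_first",
)
assert len(trunc_a) + len(trunc_b) == 20 - tokenizer.num_added_tokens(pair=True)  # 17 for BERT
assert len(trunc_b) < 15  # the longer sequence lost tokens
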
def parts_to_sample(self, admission_part, discharge_part, label) -> Sample:
    tokenized = {"text_a": admission_part, "text_b": discharge_part}
    sample_in_clear_text = {
        "text_a": admission_part["clear_text"],
        "text_b": discharge_part["clear_text"],
        "nextsentence_label": label,
    }
    # truncate to max_seq_len
    for seq_name in ["tokens", "offsets", "start_of_word"]:
        tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
            seq_a=tokenized["text_a"][seq_name],
            seq_b=tokenized["text_b"][seq_name],
            tokenizer=self.tokenizer,
            max_seq_len=self.max_seq_len,
        )
    return Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized)