def _dict_to_samples(self, dictionary, all_dicts=None):
    """Converts one dictionary (holding a "doc", i.e. a list of sentences) into Samples
    for language model finetuning. If next-sentence prediction is enabled, each sentence
    is paired with either its true successor or a random sentence from another document."""
    assert len(all_dicts) > 1, "Need at least 2 documents to sample random sentences from"
    doc = dictionary["doc"]
    samples = []

    # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
    for idx in range(len(doc) - 1):
        tokenized = {}
        if self.next_sent_pred:
            text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": text_b,
                "nextsentence_label": is_next_label,
            }
            # tokenize
            tokenized["text_a"] = tokenize_with_metadata(text_a, self.tokenizer)
            tokenized["text_b"] = tokenize_with_metadata(text_b, self.tokenizer)
            # truncate to max_seq_len
            for seq_name in ["tokens", "offsets", "start_of_word"]:
                tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
                    seq_a=tokenized["text_a"][seq_name],
                    seq_b=tokenized["text_b"][seq_name],
                    tokenizer=self.tokenizer,
                    max_seq_len=self.max_seq_len,
                )
            samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
        # if we don't do next sentence prediction, we feed in a single sentence
        else:
            text_a = doc[idx]
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": None,
                "nextsentence_label": None,
            }
            # tokenize
            tokenized["text_a"] = tokenize_with_metadata(text_a, self.tokenizer)
            # truncate to max_seq_len
            for seq_name in ["tokens", "offsets", "start_of_word"]:
                tokenized["text_a"][seq_name], _, _ = truncate_sequences(
                    seq_a=tokenized["text_a"][seq_name],
                    seq_b=None,
                    tokenizer=self.tokenizer,
                    max_seq_len=self.max_seq_len,
                )
            samples.append(Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
    return samples
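
# --- Illustrative sketch (not part of the library) ---------------------------
# _dict_to_samples above delegates pair construction to get_sentence_pair,
# which is imported from elsewhere in this package. The version below is a
# minimal sketch of the assumed contract, based on standard BERT-style
# next-sentence prediction: with probability 0.5 return the true next sentence
# (label True), otherwise a random sentence from a different document (label
# False). The name, the 50/50 split, and the handling of the second argument
# (dicts with a "doc" key here, raw docs in create_samples_sentence_pairs
# below) are assumptions, not the real implementation.
import random

def _sketch_get_sentence_pair(doc, all_dicts, idx):
    text_a = doc[idx]
    # accept both {"doc": [...]} dicts and raw sentence lists (assumption)
    docs = [d["doc"] if isinstance(d, dict) else d for d in all_dicts]
    if random.random() > 0.5:
        # positive case: the sentence that actually follows text_a
        return text_a, doc[idx + 1], True
    # negative case: a random sentence from a randomly chosen other document
    other_doc = random.choice([d for d in docs if d is not doc])
    return text_a, random.choice(other_doc), False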
def create_samples_sentence_pairs(baskets):
    """Creates examples for Language Model Finetuning that consist of two sentences
    and the is_next_label indicating whether the two are subsequent sentences from one doc."""
    all_docs = [b.raw["doc"] for b in baskets]
    for basket in baskets:
        doc = basket.raw["doc"]
        basket.samples = []
        # one sample per sentence, except the last (it has no next sentence)
        for idx in range(len(doc) - 1):
            sample_id = "%s-%s" % (basket.id, idx)
            text_a, text_b, is_next_label = get_sentence_pair(doc, all_docs, idx)
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": text_b,
                "is_next_label": is_next_label,
            }
            basket.samples.append(Sample(id=sample_id, clear_text=sample_in_clear_text))
    return baskets
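
# --- Usage sketch (illustrative) ----------------------------------------------
# A minimal example of driving create_samples_sentence_pairs. SampleBasket's
# real constructor lives elsewhere in the package, so SimpleNamespace stands in
# for it here (an assumption); the demo also relies on this module's real
# get_sentence_pair and Sample being importable. Each basket wraps one raw
# document and is expanded into per-sentence-pair Samples with ids of the form
# "<basket_id>-<idx>".
if __name__ == "__main__":
    from types import SimpleNamespace

    baskets = [
        SimpleNamespace(id="doc0", raw={"doc": ["First sentence.", "Second one.", "Third one."]}, samples=None),
        SimpleNamespace(id="doc1", raw={"doc": ["Unrelated sentence.", "Another unrelated one."]}, samples=None),
    ]
    for b in create_samples_sentence_pairs(baskets):
        for s in b.samples:
            print(s.id, s.clear_text["is_next_label"])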