def _dict_to_samples(self, dictionary, all_dicts=None):
    assert len(all_dicts) > 1, "Need at least 2 documents to sample random sentences from"
    doc = dictionary["doc"]
    samples = []
    # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
    for idx in range(len(doc) - 1):
        tokenized = {}
        if self.next_sent_pred:
            text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": text_b,
                "nextsentence_label": is_next_label,
            }
            # tokenize
            tokenized["text_a"] = tokenize_with_metadata(text_a, self.tokenizer)
            tokenized["text_b"] = tokenize_with_metadata(text_b, self.tokenizer)
            # truncate to max_seq_len
            for seq_name in ["tokens", "offsets", "start_of_word"]:
                tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
                    seq_a=tokenized["text_a"][seq_name],
                    seq_b=tokenized["text_b"][seq_name],
                    tokenizer=self.tokenizer,
                    max_seq_len=self.max_seq_len)
            samples.append(
                Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
        # if we don't do next sentence prediction, we should feed in a single sentence
        else:
            text_a = doc[idx]
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": None,
                "nextsentence_label": None,
            }
            # tokenize
            tokenized["text_a"] = tokenize_with_metadata(text_a, self.tokenizer)
            # truncate to max_seq_len
            for seq_name in ["tokens", "offsets", "start_of_word"]:
                tokenized["text_a"][seq_name], _, _ = truncate_sequences(
                    seq_a=tokenized["text_a"][seq_name],
                    seq_b=None,
                    tokenizer=self.tokenizer,
                    max_seq_len=self.max_seq_len)
            samples.append(
                Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
    return samples

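# A minimal, self-contained sketch of the contract assumed for get_sentence_pair above
# (an illustration, not the FARM implementation): return the sentence at `idx` together
# with either the true next sentence (label 1) or a random sentence drawn from another
# document (label 0). Names and the 50/50 split are assumptions for illustration only.
import random

def get_sentence_pair_sketch(doc, all_docs, idx):
    text_a = doc[idx]
    if random.random() < 0.5:
        return text_a, doc[idx + 1], 1          # genuine next sentence
    other_doc = random.choice([d for d in all_docs if d is not doc])
    return text_a, random.choice(other_doc), 0  # random sentence from another document
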
def _dict_to_samples(self, dictionary: dict, **kwargs) -> List[Sample]:
    # this tokenization also stores offsets
    tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
    if len(tokenized["tokens"]) == 0:
        text = dictionary["text"]
        logger.warning(
            f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text}")
        return []
    # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
    for seq_name in tokenized.keys():
        tokenized[seq_name], _, _ = truncate_sequences(
            seq_a=tokenized[seq_name],
            seq_b=None,
            tokenizer=self.tokenizer,
            max_seq_len=self.max_seq_len)
    # Samples don't have labels during Inference mode
    for task_name, task in self.tasks.items():
        if task_name in dictionary:
            label = float(dictionary[task_name])
            scaled_label = (label - task["label_list"][0]) / task["label_list"][1]
            dictionary[task_name] = scaled_label
    if self.features:
        feats_embed = dictionary.pop("features")
        return [FeaturesEmbeddingSample(id=None,
                                        clear_text=dictionary,
                                        tokenized=tokenized,
                                        feat_embeds=feats_embed)]
    return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

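# A minimal sketch of the regression label scaling used above, assuming (as an
# illustration, not stated in the snippet) that task["label_list"] holds the
# training-set [mean, std] of the target. Function names and numbers are made up.
def scale_label(label: float, label_list: list) -> float:
    # standardize: subtract mean, divide by std
    return (float(label) - label_list[0]) / label_list[1]

def rescale_prediction(pred: float, label_list: list) -> float:
    # invert the scaling to map a model output back to the original target range
    return pred * label_list[1] + label_list[0]

# e.g. with a hypothetical mean of 3.0 and std of 1.5:
assert scale_label(4.5, [3.0, 1.5]) == 1.0
assert rescale_prediction(1.0, [3.0, 1.5]) == 4.5
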
def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]: # this tokenization also stores offsets tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len) # Samples don't have labels during Inference mode if "label" in dict: dict["label"] = float(dict["label"]) return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
def create_samples_sentence_pairs_using_placeholder(baskets, tokenizer, max_seq_len):
    """A modified version of create_samples_sentence_pairs from farm/data_handlers/samples.py
    which calls a modified version of get_sentence_pair which just fetches a placeholder
    for the second sentence."""
    # TODO why not just use create_char_mlm_prediction_samples_sentence_pairs? Check if it makes a difference.
    for basket in tqdm(baskets):
        doc = basket.raw["doc"]
        basket.samples = []
        for idx in range(len(doc) - 1):
            id = "%s-%s" % (basket.id, idx)
            text_a, text_b, is_next_label = get_sentence_pair_with_placeholder(doc, idx)
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": text_b,
                "is_next_label": is_next_label,
            }
            tokenized = {}
            tokenized["text_a"] = tokenize_with_metadata(text_a, tokenizer, max_seq_len)
            tokenized["text_b"] = tokenize_with_metadata(text_b, tokenizer, max_seq_len)
            basket.samples.append(
                Sample(id=id, clear_text=sample_in_clear_text, tokenized=tokenized))
    return baskets

def test_sample_to_features_qa(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    sample_types = ["span", "no_answer"]

    for sample_type in sample_types:
        clear_text = json.load(open(f"samples/qa/{sample_type}/clear_text.json"))
        tokenized = json.load(open(f"samples/qa/{sample_type}/tokenized.json"))
        features_gold = json.load(open(f"samples/qa/{sample_type}/features.json"))
        max_seq_len = len(features_gold["input_ids"])

        tokenizer = Tokenizer.load(pretrained_model_name_or_path=MODEL, do_lower_case=False)
        curr_id = "-".join([str(x) for x in features_gold["id"]])

        s = Sample(id=curr_id, clear_text=clear_text, tokenized=tokenized)
        features = sample_to_features_qa(s, tokenizer, max_seq_len,
                                         SP_TOKENS_START, SP_TOKENS_MID, SP_TOKENS_END)[0]
        features = to_list(features)

        keys = features_gold.keys()
        for k in keys:
            value_gold = features_gold[k]
            value = to_list(features[k])
            assert value == value_gold, f"Mismatch between the {k} features in the {sample_type} test sample."

def _dict_to_samples(self, dictionary: dict, **kwargs) -> List[Sample]:
    # this tokenization also stores offsets, which helps to map our entity tags back to original positions
    tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
    if len(tokenized["tokens"]) == 0:
        text = dictionary["text"]
        logger.warning(
            f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text}")
        return []
    # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
    for seq_name in tokenized.keys():
        tokenized[seq_name], _, _ = truncate_sequences(
            seq_a=tokenized[seq_name],
            seq_b=None,
            tokenizer=self.tokenizer,
            max_seq_len=self.max_seq_len)
    # Samples don't have labels during Inference mode
    for task_name, task in self.tasks.items():
        if task_name in dictionary:
            scaled_dict_labels = []
            for label in dictionary[task_name]:
                label = float(label)
                scaled_label = (label - task["label_list"][0]) / task["label_list"][1]
                scaled_dict_labels.append(scaled_label)
            dictionary[task_name] = scaled_dict_labels
    return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets, which helps to map our entity tags back to original positions
    tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
    # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
    for seq_name in tokenized.keys():
        tokenized[seq_name], _, _ = truncate_sequences(
            seq_a=tokenized[seq_name],
            seq_b=None,
            tokenizer=self.tokenizer,
            max_seq_len=self.max_seq_len)
    return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]

def _dict_to_samples(self, dict: dict, **kwargs) -> [Sample]: # this tokenization also stores offsets tokenized = tokenize_with_metadata(dict["text"], self.tokenizer, self.max_seq_len) # Samples don't have labels during Inference mode if "label" in dict: label = float(dict["label"]) scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1] dict["label"] = scaled_label return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]: # this tokenization also stores offsets tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer) # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model for seq_name in tokenized.keys(): tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name], seq_b=None, tokenizer=self.tokenizer, max_seq_len=self.max_seq_len) # Samples don't have labels during Inference mode if "label" in dictionary: label = float(dictionary["label"]) scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1] dictionary["label"] = scaled_label return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
def _dict_to_samples(cls, dict, all_dicts=None): """ Converts a dict with a document to a sample (which will subsequently be featurized). It is used during prediction. This is a modified version of BertStyleLMProcessor._dict_to_samples from farm/data_handler/processor.py. It has been modified to create samples with just a single text, rather than two, as is the case for a normal BERT model. """ doc = dict["doc"] samples = [] for idx in range(len(doc) - 1): tokenized = {} tokenized["text_a"] = tokenize_with_metadata( doc[idx], cls.tokenizer, cls.max_seq_len) samples.append( Sample(id=None, clear_text={"doc": doc[idx]}, tokenized=tokenized)) return samples
def parts_to_sample(self, admission_part, discharge_part, label) -> Sample:
    tokenized = {"text_a": admission_part, "text_b": discharge_part}
    sample_in_clear_text = {
        "text_a": admission_part["clear_text"],
        "text_b": discharge_part["clear_text"],
        "nextsentence_label": label,
    }
    # truncate to max_seq_len
    for seq_name in ["tokens", "offsets", "start_of_word"]:
        tokenized["text_a"][seq_name], tokenized["text_b"][seq_name], _ = truncate_sequences(
            seq_a=tokenized["text_a"][seq_name],
            seq_b=tokenized["text_b"][seq_name],
            tokenizer=self.tokenizer,
            max_seq_len=self.max_seq_len)
    return Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized)

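# Illustrative input shape for parts_to_sample: the keys below are exactly the ones the
# method reads ("tokens", "offsets", "start_of_word" in the truncation loop, plus
# "clear_text"); the texts, token values and the label are made-up placeholders.
admission_part = {
    "clear_text": "Patient admitted with chest pain.",
    "tokens": ["patient", "admitted", "with", "chest", "pain", "."],
    "offsets": [0, 8, 17, 22, 28, 32],
    "start_of_word": [1, 1, 1, 1, 1, 0],
}
discharge_part = {
    "clear_text": "Discharged in stable condition.",
    "tokens": ["discharged", "in", "stable", "condition", "."],
    "offsets": [0, 11, 14, 21, 30],
    "start_of_word": [1, 1, 1, 1, 0],
}
# sample = processor.parts_to_sample(admission_part, discharge_part, label=1)
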
def _dict_to_samples(cls, dict, all_dicts=None):
    doc = dict["doc"]
    samples = []
    for idx in range(len(doc) - 1):
        text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
        sample_in_clear_text = {
            "text_a": text_a,
            "text_b": text_b,
            "is_next_label": is_next_label,
        }
        tokenized = {}
        tokenized["text_a"] = tokenize_with_metadata(text_a, cls.tokenizer, cls.max_seq_len)
        tokenized["text_b"] = tokenize_with_metadata(text_b, cls.tokenizer, cls.max_seq_len)
        samples.append(
            Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized))
    return samples

def create_char_mlm_prediction_samples_sentence_pairs(baskets, tokenizer, max_seq_len):
    """A modified version of create_samples_sentence_pairs from farm/data_handlers/samples.py
    which simply assigns the first text as text_a and the second text as text_b. This only
    works because the docs contain a sentence to be predicted and a placeholder as the
    second text."""
    for basket in tqdm(baskets):
        doc = basket.raw["doc"]
        basket.samples = []
        id = "%s" % (basket.id)
        text_a = doc[0]
        text_b = doc[1]
        is_next_label = 1
        sample_in_clear_text = {
            "text_a": text_a,
            "text_b": text_b,
            "is_next_label": is_next_label,
        }
        tokenized = {}
        tokenized["text_a"] = tokenize_with_metadata(text_a, tokenizer, max_seq_len)
        tokenized["text_b"] = tokenize_with_metadata(text_b, tokenizer, max_seq_len)
        basket.samples.append(
            Sample(id=id, clear_text=sample_in_clear_text, tokenized=tokenized))
    return baskets

def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
    self.baskets = []
    self.pre_tokenizer = WhitespaceSplit()

    texts = [x["text"] for x in dicts]
    words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
    words = [[x[0] for x in y] for y in words_and_spans]
    word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

    tokenized_batch = self.tokenizer.batch_encode_plus(
        words,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        truncation=True,
        max_length=self.max_seq_len,
        padding="max_length",
        is_split_into_words=True,
    )

    for i in range(len(dicts)):
        tokenized = tokenized_batch[i]
        d = dicts[i]
        id_external = self._id_from_dict(d)
        if indices:
            id_internal = indices[i]
        else:
            id_internal = i

        input_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        initial_mask = self._get_start_of_word(tokenized.words)
        assert len(initial_mask) == len(input_ids)

        padding_mask = tokenized.attention_mask

        if return_baskets:
            token_to_word_map = tokenized.words
            word_spans = word_spans_batch[i]
            tokenized_dict = {
                "tokens": tokenized.tokens,
                "word_spans": word_spans,
                "token_to_word_map": token_to_word_map,
                "start_of_word": initial_mask
            }
        else:
            tokenized_dict = {}

        feature_dict = {
            "input_ids": input_ids,
            "padding_mask": padding_mask,
            "segment_ids": segment_ids,
            "initial_mask": initial_mask,
        }

        for task_name, task in self.tasks.items():
            try:
                label_name = task["label_name"]
                labels_word = d[label_name]
                label_list = task["label_list"]
                label_tensor_name = task["label_tensor_name"]
                if task["task_type"] == "classification":
                    label_ids = [label_list.index(labels_word)]
                elif task["task_type"] == "ner":
                    labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                    label_ids = [label_list.index(lt) for lt in labels_token]
            except ValueError:
                label_ids = None
                problematic_labels = set(labels_token).difference(set(label_list))
                print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                      f"\nWe found a problem with labels {str(problematic_labels)}")
            except KeyError:
                label_ids = None
                # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                #       "\nIf you are running in *inference* mode: Don't worry!"
                #       "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")
            if label_ids:
                feature_dict[label_tensor_name] = label_ids

        curr_sample = Sample(id=None,
                             clear_text=d,
                             tokenized=tokenized_dict,
                             features=[feature_dict])
        curr_basket = SampleBasket(id_internal=id_internal,
                                   raw=d,
                                   id_external=id_external,
                                   samples=[curr_sample])
        self.baskets.append(curr_basket)

    if indices and 0 not in indices:
        pass
    else:
        self._log_samples(1)

    dataset, tensor_names = self._create_dataset()
    ret = [dataset, tensor_names, self.problematic_sample_ids]
    if return_baskets:
        ret.append(self.baskets)
    return tuple(ret)

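# Illustrative usage only: `processor` stands for an instance of whatever NER-style
# processor class this dataset_from_dicts belongs to (not named in the snippet), and the
# label key ("ner_label") and label values are made up for the example.
dicts = [
    {"text": "Berlin is in Germany", "ner_label": ["B-LOC", "O", "O", "B-LOC"]},
    {"text": "No entities here"},  # inference-style dict without labels: the KeyError branch leaves label_ids=None
]
dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
    dicts, indices=[0, 1], return_baskets=True)
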
def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets, which helps to map our entity tags back to original positions
    tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len)
    return [Sample(id=None, clear_text=dict, tokenized=tokenized)]

def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]: # this tokenization also stores offsets tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len) return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
def create_samples_qa_Natural_Question(dictionary, max_query_len, max_seq_len, doc_stride, n_special_tokens):
    """ This method will split question-document pairs from the SampleBasket into question-passage
    pairs which will each form one sample. The "t" and "c" in variables stand for token and character
    respectively."""

    # Initialize some basic variables
    # is_training = check_if_training(dictionary)
    question_tokens = dictionary["question_tokens"][:max_query_len]
    question_len_t = len(question_tokens)
    question_offsets = dictionary["question_offsets"]
    doc_tokens = dictionary["document_tokens"]
    doc_offsets = dictionary["document_offsets"]
    doc_text = dictionary["document_text"]
    doc_start_of_word = dictionary["document_start_of_word"]
    samples = []

    # Calculate the number of tokens that can be reserved for the passage. This is calculated by considering
    # the max_seq_len, the number of tokens in the question and the number of special tokens that will be added
    # when the question and passage are joined (e.g. [CLS] and [SEP])
    passage_len_t = max_seq_len - question_len_t - n_special_tokens

    # Perform chunking of document into passages. The sliding window moves in steps of doc_stride.
    # passage_spans is a list of dictionaries where each defines the start and end of each passage
    # on both token and character level
    passage_spans = chunk_into_passages(doc_offsets, doc_stride, passage_len_t, doc_text)
    for passage_span in passage_spans:
        # Unpack each variable in the dictionary. The "_t" and "_c" indicate
        # whether the index is on the token or character level
        passage_start_t = passage_span["passage_start_t"]
        passage_end_t = passage_span["passage_end_t"]
        passage_start_c = passage_span["passage_start_c"]
        passage_end_c = passage_span["passage_end_c"]
        passage_id = passage_span["passage_id"]

        # passage_offsets will be relative to the start of the passage (i.e. they will start at 0)
        # TODO: Is passage offsets actually needed? At this point, maybe we only care about token level
        passage_offsets = doc_offsets[passage_start_t:passage_end_t]
        passage_start_of_word = doc_start_of_word[passage_start_t:passage_end_t]
        passage_offsets = [x - passage_offsets[0] for x in passage_offsets]
        passage_tokens = doc_tokens[passage_start_t:passage_end_t]
        passage_text = dictionary["document_text"][passage_start_c:passage_end_c]

        # Deal with the potentially many answers (e.g. Squad or NQ dev set)
        answers_clear, answers_tokenized = process_answers(
            dictionary["answers"], doc_offsets, passage_start_c, passage_start_t)

        clear_text = {
            "passage_text": passage_text,
            "question_text": dictionary["question_text"],
            "passage_id": passage_id,
            "answers": answers_clear
        }
        tokenized = {
            "passage_start_t": passage_start_t,
            "passage_tokens": passage_tokens,
            "passage_offsets": passage_offsets,
            "passage_start_of_word": passage_start_of_word,
            "question_tokens": question_tokens,
            "question_offsets": question_offsets,
            "question_start_of_word": dictionary["question_start_of_word"][:max_query_len],
            "answers": answers_tokenized,
            "document_offsets": doc_offsets  # so that to_doc_preds can access them
        }
        samples.append(
            Sample(id=passage_id, clear_text=clear_text, tokenized=tokenized))
    return samples

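# A worked illustration of the passage budget computed above (the numbers are made up,
# not taken from the source): with max_seq_len=384, a 20-token question and 3 special
# tokens (e.g. [CLS], [SEP], [SEP]), each passage may hold 384 - 20 - 3 = 361 tokens,
# and successive passages start doc_stride tokens apart.
max_seq_len, question_len_t, n_special_tokens, doc_stride = 384, 20, 3, 128
passage_len_t = max_seq_len - question_len_t - n_special_tokens  # 361
doc_len_t = 1000  # hypothetical document length in tokens
passage_starts = list(range(0, doc_len_t, doc_stride))
print(passage_len_t, passage_starts[:4])  # 361 [0, 128, 256, 384]
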
def _dict_to_samples(self, dict: dict, **kwargs) -> [Sample]:
    # this tokenization also stores offsets, which helps to map our entity tags back to original positions
    words = re.findall(r"<t>(.*?)</t>", dict["text"], flags=0)
    word_one = words[0]
    term_one_idx = -1
    term_two_idx = -1
    term_one_idxs = [m.start() for m in re.finditer(re.escape(word_one), dict["text"])]
    for idx, k in enumerate(term_one_idxs):
        try:
            if dict["text"][k-3:k] == '<t>':
                term_one_idx = idx
        except:
            pass

    if len(words) > 1:
        word_two = words[1]
        word_two_tokenized = tokenize_with_metadata(word_two, self.tokenizer, self.max_seq_len)['tokens']
        term_two_idxs = [m.start() for m in re.finditer(re.escape(word_two), dict["text"])]
        for idx, k in enumerate(term_two_idxs):
            try:
                if dict["text"][k-3:k] == '<t>':
                    term_two_idx = idx
            except:
                pass

    dict["text"] = re.sub(r'<t>', '', dict["text"])
    dict["text"] = re.sub(r'</t>', '', dict["text"])

    tokenized = tokenize_with_metadata(dict["text"], self.tokenizer, self.max_seq_len)
    word_one_tokenized = tokenize_with_metadata(word_one, self.tokenizer, self.max_seq_len)['tokens']

    x1, y = [], []
    for token in tokenized['tokens']:
        if token == '[CLS]':
            x1.append(5)
            y.append('[CLS]')
        elif token == '[SEP]':
            x1.append(4)
            y.append('[SEP]')
        else:
            x1.append(0)
            y.append('N')

    idx = find_overlap(word_one_tokenized, tokenized['tokens'], term_one_idx)
    if idx > -1:
        for x in range(0, len(word_one_tokenized)):
            x1[idx+x] = 1
            y[idx+x] = 'Y'
    else:
        print("-1--")
        print(word_one_tokenized)
        print(tokenized['tokens'])
        x1, y = [], []
        for token in tokenized['tokens']:
            if token == '[CLS]':
                x1.append(5)
                y.append('[CLS]')
            elif token == '[SEP]':
                x1.append(4)
                y.append('[SEP]')
            else:
                x1.append(0)
                y.append('N')

    if len(words) > 1:
        idx = find_overlap(word_two_tokenized, tokenized['tokens'], term_two_idx)
        if idx > -1:
            for x in range(0, len(word_two_tokenized)):
                y[idx+x] = 'Y'
                x1[idx+x] = 1
        else:
            print("-2--")
            print(word_two_tokenized)
            print(tokenized['tokens'])
            x1, y = [], []
            for token in tokenized['tokens']:
                if token == '[CLS]':
                    x1.append(5)
                    y.append('[CLS]')
                elif token == '[SEP]':
                    x1.append(4)
                    y.append('[SEP]')
                else:
                    x1.append(0)
                    y.append('N')

    tokenized['custom_data'] = x1
    tokenized['ner_label'] = y
    dict['custom_data'] = x1
    dict['ner_label'] = y
    return [Sample(id=None, clear_text=dict, tokenized=tokenized)]