def _init_baskets_from_file(self, file):
    dicts = self._file_to_dicts(file)
    dataset_name = os.path.splitext(os.path.basename(file))[0]
    baskets = [
        SampleBasket(raw=tr, id=f"{dataset_name}-{i}") for i, tr in enumerate(dicts)
    ]
    return baskets

def _init_baskets_from_file(self, file):
    dicts = self.file_to_dicts(file)
    dataset_name = file.stem
    baskets = [
        SampleBasket(raw=tr, id=f"{dataset_name}-{i}") for i, tr in enumerate(dicts)
    ]
    return baskets

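# Illustrative check (not part of the original module): the two _init_baskets_from_file variants above
# derive the same dataset name, the first from a plain string path via os.path, the second from a
# pathlib.Path via .stem. The file name "germeval14/train.txt" is a made-up example.
import os
from pathlib import Path

_example_path = "germeval14/train.txt"
assert os.path.splitext(os.path.basename(_example_path))[0] == Path(_example_path).stem == "train"
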
def dataset_from_dicts(self, dicts, index=0, rest_api_schema=False, return_baskets=False):
    """
    Contains all the functionality to turn a list of dict objects into a PyTorch Dataset and a
    list of tensor names. This can be used for inference mode.

    :param dicts: List of dictionaries where each one contains the data of one input sample.
    :type dicts: list of dicts
    :return: a PyTorch dataset and a list of tensor names.
    """
    if rest_api_schema:
        id_prefix = "infer"
    else:
        id_prefix = "train"
    # We need to add the index (coming from multiprocessing chunks) to have a unique basket ID
    self.baskets = [
        SampleBasket(raw=tr, id=f"{id_prefix}-{i + index}") for i, tr in enumerate(dicts)
    ]
    self._init_samples_in_baskets()
    self._featurize_samples()
    if index == 0:
        self._log_samples(3)
    if return_baskets:
        dataset, tensor_names = self._create_dataset(keep_baskets=True)
        return dataset, tensor_names, self.baskets
    else:
        dataset, tensor_names = self._create_dataset()
        return dataset, tensor_names

def dataset_from_dicts(self, dicts, indices=None, rest_api_schema=False, return_baskets=False, fewer_samples=True):
    """
    Contains all the functionality to turn a list of dict objects into a PyTorch Dataset and a
    list of tensor names. This can be used for inference mode.

    :param dicts: List of dictionaries where each one contains the data of one input sample.
    :type dicts: list of dicts
    :return: a PyTorch dataset and a list of tensor names.
    """
    if rest_api_schema:
        id_prefix = "infer"
    else:
        id_prefix = "train"
    # We need to add the index (coming from multiprocessing chunks) to have a unique basket ID
    if indices:
        self.baskets = [
            SampleBasket(raw=tr, id_internal=f"{id_prefix}-{index}")
            for (tr, index) in zip(dicts, indices)
        ]
    else:
        self.baskets = [
            SampleBasket(raw=tr, id_internal=f"{id_prefix}-{i}")
            for (i, tr) in enumerate(dicts)
        ]
    self._init_samples_in_baskets(fewer_samples=fewer_samples)
    self._featurize_samples()
    if indices:
        logger.info(f"Currently working on indices: {indices}")
        if 0 in indices:
            self._log_samples(2)
        if 50 in indices:
            self._print_samples(30)
    else:
        self._log_samples(2)
    if return_baskets:
        dataset, tensor_names = self._create_dataset(keep_baskets=True)
        return dataset, tensor_names, self.baskets
    else:
        dataset, tensor_names = self._create_dataset()
        return dataset, tensor_names

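# Usage sketch (illustrative, not part of the original module): how the dataset and tensor names
# returned by dataset_from_dicts() could be wrapped in a standard PyTorch DataLoader. The processor
# instance, the input dicts, and the helper name below are assumptions made for the example.
from torch.utils.data import DataLoader

def _example_build_loader(processor, dicts, batch_size=4):
    # dicts is a list of raw samples, e.g. [{"text": "some sentence", "label": "positive"}, ...]
    dataset, tensor_names = processor.dataset_from_dicts(dicts)
    # tensor_names records the order of the tensors inside each dataset item / batch
    return DataLoader(dataset, batch_size=batch_size), tensor_names
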
def dataset_from_dicts(self, dicts):
    self.baskets = [
        SampleBasket(raw=tr, id="infer - {}".format(i)) for i, tr in enumerate(dicts)
    ]
    self._init_samples_in_baskets()
    self._featurize_samples()
    dataset, tensor_names = self._create_dataset()
    return dataset, tensor_names

def _dicts_to_baskets(self, dicts, indices):
    # Perform tokenization on documents and questions resulting in a nested list of doc-question pairs
    dicts_tokenized = [self.apply_tokenization(d) for d in dicts]

    baskets = []
    for index, document in zip(indices, dicts_tokenized):
        for q_idx, raw in enumerate(document):
            basket = SampleBasket(raw=raw, id=f"{index}-{q_idx}")
            baskets.append(basket)
    return baskets

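# Illustration (assumption-based, not part of the original module): the id scheme above yields one
# basket per document-question pair. Standing in for self.apply_tokenization() with a dummy nested
# list shows the ids produced for a chunk whose multiprocessing indices are [3, 4]:
def _example_basket_ids():
    dicts_tokenized = [["doc0-q0", "doc0-q1"], ["doc1-q0"]]  # dummy doc-question pairs
    indices = [3, 4]
    ids = [
        f"{index}-{q_idx}"
        for index, document in zip(indices, dicts_tokenized)
        for q_idx, _ in enumerate(document)
    ]
    assert ids == ["3-0", "3-1", "4-0"]
    return ids
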
def dataset_from_dicts(self, dicts, index=None, from_inference=False):
    if from_inference:
        dicts = [self._convert_inference(x) for x in dicts]
    self.baskets = [
        SampleBasket(raw=tr, id="infer - {}".format(i)) for i, tr in enumerate(dicts)
    ]
    self._init_samples_in_baskets()
    self._featurize_samples()
    if index == 0:
        self._log_samples(3)
    dataset, tensor_names = self._create_dataset()
    return dataset, tensor_names

def _dicts_to_baskets(self, dicts, index=None):
    # Perform tokenization on documents and questions resulting in a nested list of doc-question pairs
    dicts_tokenized = [self.apply_tokenization(d) for d in dicts]

    baskets = []
    for d_idx, document in enumerate(dicts_tokenized):
        for q_idx, raw in enumerate(document):
            squad_id_hex = dicts[d_idx]["qas"][q_idx]["id"]
            if squad_id_hex is None:
                id_1 = d_idx + index
                id_2 = q_idx
            else:
                id_1, id_2 = encode_squad_id(squad_id_hex)
            basket = SampleBasket(raw=raw, id=f"{id_1}-{id_2}")
            baskets.append(basket)
    return baskets

def dataset_from_dicts(self, dicts, index=None, rest_api_schema=False):
    if rest_api_schema:
        dicts = [self._convert_rest_api_dict(x) for x in dicts]
        id_prefix = "infer"
    else:
        id_prefix = "train"
    self.baskets = [
        SampleBasket(raw=tr, id=f"{id_prefix}-{i}") for i, tr in enumerate(dicts)
    ]
    self._init_samples_in_baskets()
    self._featurize_samples()
    if index == 0:
        self._log_samples(3)
    dataset, tensor_names = self._create_dataset()
    return dataset, tensor_names

def dataset_from_dicts(self, dicts):
    """
    Contains all the functionality to turn a list of dict objects into a PyTorch Dataset and a
    list of tensor names. This is used for inference mode.

    :param dicts: List of dictionaries where each one contains the data of one input sample.
    :type dicts: list of dicts
    :return: a PyTorch dataset and a list of tensor names.
    """
    self.baskets = [
        SampleBasket(raw=tr, id="infer - {}".format(i)) for i, tr in enumerate(dicts)
    ]
    self._init_samples_in_baskets()
    self._featurize_samples()
    dataset, tensor_names = self._create_dataset()
    return dataset, tensor_names

def dataset_from_dicts(self, dicts, index=0, rest_api_schema=False, return_baskets=False):
    if rest_api_schema:
        dicts = [self._convert_rest_api_dict(x) for x in dicts]
    # We need to add the index (coming from multiprocessing chunks) to have a unique numerical basket ID
    self.baskets = [
        SampleBasket(raw=tr, id=(i + index) * 10000) for i, tr in enumerate(dicts)
    ]
    self._init_samples_in_baskets_squad()
    self._featurize_samples()
    if index == 0:
        self._log_samples(3)
    if return_baskets:
        dataset, tensor_names = self._create_dataset(keep_baskets=True)
        return dataset, tensor_names, self.baskets
    else:
        dataset, tensor_names = self._create_dataset(keep_baskets=False)
        return dataset, tensor_names

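# Worked example (my reading, not stated explicitly in the original code): multiplying the global
# basket position (i + index) by 10000 spaces the numerical basket ids far apart, so ids derived
# from them (e.g. per-sample ids counted upward from a basket id) cannot collide across baskets as
# long as fewer than 10000 are needed per basket. For a chunk starting at index=2 with three dicts:
def _example_numerical_basket_ids(index=2, n_dicts=3):
    return [(i + index) * 10000 for i in range(n_dicts)]  # -> [20000, 30000, 40000]
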
def tokenize_batch_question_answering(pre_baskets, tokenizer, indices):
    """
    Tokenizes text data for question answering tasks. Tokenization means splitting words into subwords,
    depending on the tokenizer's vocabulary.

    - We first tokenize all documents in batch mode. (When using FastTokenizers Rust multithreading can be enabled by TODO add how to enable rust mt)
    - Then we tokenize each question individually
    - We construct dicts with question and corresponding document text + tokens + offsets + ids

    :param pre_baskets: input dicts with QA info #todo change to input objects
    :param tokenizer: tokenizer to be used
    :param indices: list, indices used during multiprocessing so that IDs assigned to our baskets are unique
    :return: baskets, list containing question and corresponding document information
    """
    assert len(indices) == len(pre_baskets)
    assert tokenizer.is_fast, "Processing QA data is only supported with fast tokenizers for now.\n" \
                              "Please load Tokenizers with 'use_fast=True' option."
    baskets = []

    # Tokenize texts in batch mode
    texts = [d["context"] for d in pre_baskets]
    tokenized_docs_batch = tokenizer.batch_encode_plus(
        texts,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        add_special_tokens=False,
        verbose=False,
    )

    # Extract relevant data
    tokenids_batch = tokenized_docs_batch["input_ids"]
    offsets_batch = []
    for o in tokenized_docs_batch["offset_mapping"]:
        offsets_batch.append(np.array([x[0] for x in o]))
    start_of_words_batch = []
    for e in tokenized_docs_batch.encodings:
        start_of_words_batch.append(_get_start_of_word_QA(e.words))

    for i_doc, d in enumerate(pre_baskets):
        document_text = d["context"]

        # Tokenize questions one by one
        for i_q, q in enumerate(d["qas"]):
            question_text = q["question"]
            tokenized_q = tokenizer.encode_plus(
                question_text,
                return_offsets_mapping=True,
                return_special_tokens_mask=True,
                add_special_tokens=False,
            )

            # Extract relevant data
            question_tokenids = tokenized_q["input_ids"]
            question_offsets = [x[0] for x in tokenized_q["offset_mapping"]]
            question_sow = _get_start_of_word_QA(tokenized_q.encodings[0].words)

            external_id = q["id"]
            # The internal_id depends on unique ids created for each process before forking
            internal_id = f"{indices[i_doc]}-{i_q}"
            raw = {
                "document_text": document_text,
                "document_tokens": tokenids_batch[i_doc],
                "document_offsets": offsets_batch[i_doc],
                "document_start_of_word": start_of_words_batch[i_doc],
                "question_text": question_text,
                "question_tokens": question_tokenids,
                "question_offsets": question_offsets,
                "question_start_of_word": question_sow,
                "answers": q["answers"],
            }
            # TODO add only during debug mode (need to create debug mode)
            raw["document_tokens_strings"] = tokenized_docs_batch.encodings[i_doc].tokens
            raw["question_tokens_strings"] = tokenized_q.encodings[0].tokens

            baskets.append(
                SampleBasket(raw=raw, id_internal=internal_id, id_external=external_id, samples=None)
            )
    return baskets

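# Example input (illustrative, following the keys accessed above: "context", "qas", "question",
# "id", "answers"): a minimal SQuAD-style pre_basket. The concrete texts and the id are made up.
_example_pre_baskets = [
    {
        "context": "Berlin is the capital of Germany.",
        "qas": [
            {
                "question": "What is the capital of Germany?",
                "id": "example-qa-0",
                "answers": [{"text": "Berlin", "answer_start": 0}],
            }
        ],
    }
]
# tokenize_batch_question_answering(_example_pre_baskets, tokenizer, indices=[0]) would then return
# one SampleBasket per question, assuming `tokenizer` is a fast (Rust-backed) Hugging Face tokenizer.
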
def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
    self.baskets = []
    self.pre_tokenizer = WhitespaceSplit()

    texts = [x["text"] for x in dicts]
    words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
    words = [[x[0] for x in y] for y in words_and_spans]
    word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

    tokenized_batch = self.tokenizer.batch_encode_plus(
        words,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        truncation=True,
        max_length=self.max_seq_len,
        padding="max_length",
        is_split_into_words=True,
    )

    for i in range(len(dicts)):
        tokenized = tokenized_batch[i]
        d = dicts[i]
        id_external = self._id_from_dict(d)
        if indices:
            id_internal = indices[i]
        else:
            id_internal = i

        input_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        initial_mask = self._get_start_of_word(tokenized.words)
        assert len(initial_mask) == len(input_ids)

        padding_mask = tokenized.attention_mask

        if return_baskets:
            token_to_word_map = tokenized.words
            word_spans = word_spans_batch[i]
            tokenized_dict = {
                "tokens": tokenized.tokens,
                "word_spans": word_spans,
                "token_to_word_map": token_to_word_map,
                "start_of_word": initial_mask,
            }
        else:
            tokenized_dict = {}

        feature_dict = {
            "input_ids": input_ids,
            "padding_mask": padding_mask,
            "segment_ids": segment_ids,
            "initial_mask": initial_mask,
        }

        for task_name, task in self.tasks.items():
            try:
                label_name = task["label_name"]
                labels_word = d[label_name]
                label_list = task["label_list"]
                label_tensor_name = task["label_tensor_name"]

                if task["task_type"] == "classification":
                    label_ids = [label_list.index(labels_word)]
                elif task["task_type"] == "ner":
                    labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                    label_ids = [label_list.index(lt) for lt in labels_token]
            except ValueError:
                label_ids = None
                problematic_labels = set(labels_token).difference(set(label_list))
                print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                      f"\nWe found a problem with labels {str(problematic_labels)}")
            except KeyError:
                label_ids = None
                # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                #       "\nIf you are running in *inference* mode: Don't worry!"
                #       "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")

            if label_ids:
                feature_dict[label_tensor_name] = label_ids

        curr_sample = Sample(id=None,
                             clear_text=d,
                             tokenized=tokenized_dict,
                             features=[feature_dict])
        curr_basket = SampleBasket(id_internal=id_internal,
                                   raw=d,
                                   id_external=id_external,
                                   samples=[curr_sample])
        self.baskets.append(curr_basket)

    if not indices or 0 in indices:
        self._log_samples(1)

    dataset, tensor_names = self._create_dataset()
    ret = [dataset, tensor_names, self.problematic_sample_ids]
    if return_baskets:
        ret.append(self.baskets)
    return tuple(ret)

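# Example input (illustrative, based on the keys accessed above): this NER-style variant expects one
# whitespace-separable "text" per dict plus word-level labels under the task's label_name. The
# sentence, the tag set, and the key "ner_label" are assumptions made for the sketch.
_example_ner_dicts = [
    {
        "text": "Albert Einstein was born in Ulm",
        "ner_label": ["B-PER", "I-PER", "O", "O", "O", "B-LOC"],
    }
]
# dataset_from_dicts(_example_ner_dicts) would pre-tokenize on whitespace, align the word-level labels
# to subword tokens via expand_labels() and the initial_mask, and return
# (dataset, tensor_names, problematic_sample_ids).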