def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
    n_special_tokens = self.tokenizer.num_added_tokens(pair=True)
    samples = create_samples_squad(
        dictionary=dictionary,
        max_query_len=self.max_query_length,
        max_seq_len=self.max_seq_len,
        doc_stride=self.doc_stride,
        n_special_tokens=n_special_tokens,
    )
    return samples
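# A minimal sketch of the SQuAD-style dictionary these methods consume.
# The layout follows the standard SQuAD JSON format; the exact keys read by
# create_samples_squad are an assumption based on that format.
example_dictionary = {
    "title": "Goethe",
    "paragraphs": [
        {
            "context": "Johann Wolfgang von Goethe wrote Faust.",
            "qas": [
                {
                    "id": "q1",
                    "question": "Who wrote Faust?",
                    "answers": [
                        {"text": "Johann Wolfgang von Goethe", "answer_start": 0}
                    ],
                }
            ],
        }
    ],
}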
def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
    if "paragraphs" not in dictionary:  # TODO change this inference mode hack
        dictionary = self._convert_rest_api_dict(infer_dict=dictionary)
    samples = create_samples_squad(entry=dictionary)
    for sample in samples:
        tokenized = tokenize_with_metadata(
            text=" ".join(sample.clear_text["doc_tokens"]),
            tokenizer=self.tokenizer,
        )
        sample.tokenized = tokenized
    return samples
def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
    # TODO split samples that are too long in this function, related to todo in self._sample_to_features
    if "paragraphs" not in dict:  # TODO change this inference mode hack
        dict = cls._convert_inference(infer_dict=dict)
    samples = create_samples_squad(entry=dict)
    for sample in samples:
        tokenized = tokenize_with_metadata(
            text=" ".join(sample.clear_text["doc_tokens"]),
            tokenizer=cls.tokenizer,
            max_seq_len=cls.max_seq_len,
        )
        sample.tokenized = tokenized
    return samples
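# A minimal usage sketch of tokenize_with_metadata as called above, assuming
# FARM's Tokenizer loader and module path (farm.modeling.tokenization); the
# return-field names are an assumption based on FARM's aligned-list convention.
# Some versions additionally require max_seq_len, as in the variant above.
from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata

tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-uncased", do_lower_case=True
)
tokenized = tokenize_with_metadata(text="Who wrote Faust?", tokenizer=tokenizer)
# tokenized is a dict of aligned lists, roughly:
#   tokenized["tokens"]        -> subword tokens, e.g. ["who", "wrote", "faust", "?"]
#   tokenized["offsets"]       -> character start position of each token in the text
#   tokenized["start_of_word"] -> flags marking which tokens begin a new word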