def extract(self, extractor: Extractor, extractable: Extractable = None, tokenizer: Tokenizer = None, joiner: str = " ", **options) -> List[Extraction]:
    """
    Invoke the extractor on the given extractable, accumulating all the
    extractions in a list and recording an extraction-provenance entry for
    each one.

    Args:
        extractor (Extractor): the extractor to run.
        extractable (Extractable): what to extract from; defaults to this
            document itself when not supplied.
        tokenizer (Tokenizer): custom tokenizer for extractors that consume
            tokens; defaults to ``self.etk.default_tokenizer``.
        joiner (str): separator used to flatten the value to text when the
            extractor consumes a string.
        options: extra keyword arguments forwarded to the extractor's
            ``extract()`` function.

    Returns:
        List of Extraction, containing all the extractions.

    Raises:
        ExtractorValueError: for a token-input extractor when the error
            policy is not ``ErrorPolicy.PROCESS``.
    """
    if not extractable:
        extractable = self
    if not tokenizer:
        tokenizer = self.etk.default_tokenizer

    extracted_results = list()

    if extractor.input_type == InputType.TOKENS:
        if self.etk.error_policy == ErrorPolicy.PROCESS:
            # Warn (log + warnings module) when the value has to be coerced
            # to a string before it can be tokenized.
            for coerced_type, type_name in ((list, "list"), (dict, "dict")):
                if isinstance(extractable.value, coerced_type):
                    message = "Extractor needs tokens, tokenizer needs string to tokenize, got " \
                              + type_name + ", converting to string"
                    self.etk.log(message, "warning", self.doc_id, self.url)
                    warnings.warn(message)
                    break
            tokens = extractable.get_tokens(tokenizer)
            if tokens:
                extracted_results = extractor.extract(tokens, **options)
        else:
            # Fixed message: this branch serves token-input extractors, but the
            # original said "Extractor needs string" (copy-paste from the TEXT
            # branch); the warnings above correctly say "needs tokens".
            raise ExtractorValueError(
                "Extractor needs tokens, tokenizer needs string to tokenize, got " + str(type(extractable.value)))

    elif extractor.input_type == InputType.TEXT:
        if self.etk.error_policy == ErrorPolicy.PROCESS:
            # Same coercion warning as above, phrased for string-input extractors.
            for coerced_type, type_name in ((list, "list"), (dict, "dict")):
                if isinstance(extractable.value, coerced_type):
                    message = "Extractor needs string, got extractable value as " \
                              + type_name + ", converting to string"
                    self.etk.log(message, "warning", self.doc_id, self.url)
                    warnings.warn(message)
                    break
            text = extractable.get_string(joiner)
            if text:
                extracted_results = extractor.extract(text, **options)
        else:
            # raise ExtractorValueError("Extractor needs string, got " + str(type(extractable.value)))
            # TODO: Yixiang - needs to be handled properly
            pass

    elif extractor.input_type == InputType.OBJECT:
        extracted_results = extractor.extract(extractable.value, **options)

    elif extractor.input_type == InputType.HTML:
        # Only run the extractor when the value actually parses as HTML markup.
        if bool(BeautifulSoup(extractable.value, "html.parser").find()):
            extracted_results = extractor.extract(extractable.value, **options)
        else:
            # raise ExtractorValueError("Extractor needs HTML, got non HTML string")
            # TODO: Yixiang - needs to be handled properly
            pass

    # Not every extractable carries a full_path; fall back to None.
    try:
        jsonPath = extractable.full_path
    except AttributeError:
        jsonPath = None

    for e in extracted_results:
        # For the purpose of provenance hierarchy tracking: a parent's id for
        # the next generation.
        e.prov_id = self.provenance_id_index
        extraction_provenance_record: ExtractionProvenanceRecord = ExtractionProvenanceRecord(
            e.prov_id, jsonPath, e.provenance["extractor_name"],
            e.provenance["start_char"], e.provenance["end_char"],
            e.provenance["confidence"], self, extractable.prov_id)
        self._provenances[e.prov_id] = extraction_provenance_record

        # for the purpose of provenance hierarchy tracking
        self.provenance_id_index_incrementer()
        self.create_provenance(extraction_provenance_record)
    return extracted_results
def test_Extractable(self) -> None:
    """Tokenizing and stringifying a nested-dict Extractable produces the expected tokens and text."""
    extractable = Extractable({
        'extracted_value': [{
            1: 2,
            'das': [1, 2, 3]
        }],
        'confidence': 2.3
    })
    tokenizer = Tokenizer(keep_multi_space=False)

    # Collect the relevant attributes of every produced token.
    token_attrs = [{
        "orth": token.orth_,
        "offset": token.idx,
        "full_shape": token._.full_shape
    } for token in extractable.get_tokens(tokenizer)]

    # Expected (orth, offset, full_shape) triples, expanded to dicts below.
    expected_triples = [
        ('extracted', 0, 'xxxxxxxxx'),
        ('_', 9, '_'),
        ('value', 10, 'xxxxx'),
        (':', 16, ':'),
        ('1', 18, 'd'),
        (':', 20, ':'),
        ('2', 22, 'd'),
        ('das', 24, 'xxx'),
        (':', 28, ':'),
        ('1', 30, 'd'),
        ('2', 32, 'd'),
        ('3', 34, 'd'),
        ('confidence', 36, 'xxxxxxxxxx'),
        (':', 47, ':'),
        ('2.3', 49, 'd.d'),
    ]
    expected_token = [{
        'orth': orth,
        'offset': offset,
        'full_shape': shape
    } for orth, offset, shape in expected_triples]
    self.assertEqual(token_attrs, expected_token)

    # The flattened string form of the same extractable.
    expected_str = "extracted_value : 1 : 2 das : 1 2 3 confidence : 2.3 "
    self.assertEqual(extractable.get_string(), expected_str)