def invoke_extractor(self, extractor: Extractor, extractable: Extractable = None,
                     tokenizer: Tokenizer = None, joiner: str = " ",
                     **options) -> List[Extraction]:
    """
    Invoke the extractor on the given extractable, accumulating all the
    extractions in a list, and record one extraction-provenance entry per
    returned Extraction.

    Args:
        extractor (Extractor): the extractor to run.
        extractable (Extractable): object for extraction; defaults to self.
        tokenizer (Tokenizer): custom tokenizer, used when the extractor
            consumes tokens; defaults to self.default_tokenizer.
        joiner (str): string used to join segments when the extractor
            consumes text.
        options: extra keyword arguments forwarded to extractor.extract().

    Returns:
        List of Extraction, containing all the extractions.
    """
    if not extractable:
        extractable = self
    if not tokenizer:
        tokenizer = self.default_tokenizer

    extracted_results = []
    if extractor.input_type == InputType.TOKENS:
        tokens = extractable.get_tokens(tokenizer)
        if tokens:
            extracted_results = extractor.extract(tokens, **options)
    elif extractor.input_type == InputType.TEXT:
        text = extractable.get_string(joiner)
        if text:
            extracted_results = extractor.extract(text, **options)
    elif extractor.input_type in (InputType.OBJECT, InputType.HTML):
        # Both object and HTML extractors consume the raw value unchanged,
        # so the two branches are merged.
        extracted_results = extractor.extract(extractable.value, **options)

    for e in extracted_results:
        extraction_provenance_record: ExtractionProvenanceRecord = ExtractionProvenanceRecord(
            self.extraction_provenance_id_index,
            extractable.full_path,
            e.provenance["extractor_name"],
            e.provenance["start_char"],
            e.provenance["end_char"],
            e.provenance["confidence"],
            extractable.document,
            extractable.prov_id)
        # Stamp the record's id on the extraction BEFORE incrementing, so
        # descendants can link back to it (provenance hierarchy tracking).
        e.prov_id = self.extraction_provenance_id_index
        self.extraction_provenance_id_index += 1
        self.create_provenance(extraction_provenance_record)

    # TODO: the reason that extractors must return Extraction objects is so
    # that they can communicate back the provenance.
    return extracted_results
def invoke_extractor(self, extractor: Extractor, extractable: Extractable = None,
                     tokenizer: Tokenizer = None, joiner: str = " ",
                     **options) -> List[Extraction]:
    """
    Run *extractor* over *extractable* and collect every extraction it yields.

    Args:
        extractor (Extractor): the extractor to run.
        extractable (Extractable): object to extract from; defaults to self.
        tokenizer: custom tokenizer, consulted when the extractor wants tokens.
        joiner: separator used when the extractor wants joined text.
        options: keyword arguments passed through to extractor.extract().

    Returns:
        List of Extraction, containing all the extractions.
    """
    extractable = extractable or self
    tokenizer = tokenizer or self.default_tokenizer

    results: List[Extraction] = []
    input_type = extractor.input_type
    if input_type == InputType.TOKENS:
        tokens = extractable.get_tokens(tokenizer)
        if tokens:
            results = extractor.extract(tokens, **options)
    elif input_type == InputType.TEXT:
        joined = extractable.get_string(joiner)
        if joined:
            results = extractor.extract(joined, **options)
    elif input_type in (InputType.OBJECT, InputType.HTML):
        results = extractor.extract(extractable.value, **options)

    # TODO: the reason that extractors must return Extraction objects is so that
    # they can communicate back the provenance.
    return results
def extract(self, extractor: Extractor, extractable: Extractable = None,
            tokenizer: Tokenizer = None, joiner: str = " ",
            **options) -> List[Extraction]:
    """
    Invoke the extractor on the given extractable, accumulating all the
    extractions in a list and recording extraction provenance.

    Args:
        extractor (Extractor): the extractor to run.
        extractable (Extractable): object for extraction; defaults to self.
        tokenizer: user can pass custom tokenizer if extractor wants tokens;
            defaults to self.etk.default_tokenizer.
        joiner: user can pass joiner if extractor wants text.
        options: user can pass arguments as a dict to the extract() function
            of different extractors.

    Returns:
        List of Extraction, containing all the extractions.
    """
    if not extractable:
        extractable = self
    if not tokenizer:
        tokenizer = self.etk.default_tokenizer

    extracted_results = list()

    if extractor.input_type == InputType.TOKENS:
        # Token-based extractors: only run under the PROCESS error policy;
        # otherwise a non-string value is treated as a hard error.
        if self.etk.error_policy == ErrorPolicy.PROCESS:
            # list/dict values are tolerated: they get converted to a string
            # by tokenization below, but a warning is logged for the user.
            if isinstance(extractable.value, list):
                self.etk.log(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got list, converting to string",
                    "warning", self.doc_id, self.url)
                warnings.warn(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got list, converting to string")
            elif isinstance(extractable.value, dict):
                self.etk.log(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got dict, converting to string",
                    "warning", self.doc_id, self.url)
                warnings.warn(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got dict, converting to string")
            tokens = extractable.get_tokens(tokenizer)
            if tokens:
                extracted_results = extractor.extract(tokens, **options)
        else:
            raise ExtractorValueError(
                "Extractor needs string, tokenizer needs string to tokenize, got " + str(type(extractable.value)))
    elif extractor.input_type == InputType.TEXT:
        # Text-based extractors: same policy-gated conversion warnings as above.
        if self.etk.error_policy == ErrorPolicy.PROCESS:
            if isinstance(extractable.value, list):
                self.etk.log("Extractor needs string, got extractable value as list, converting to string",
                             "warning", self.doc_id, self.url)
                warnings.warn("Extractor needs string, got extractable value as list, converting to string")
            elif isinstance(extractable.value, dict):
                self.etk.log("Extractor needs string, got extractable value as dict, converting to string",
                             "warning", self.doc_id, self.url)
                warnings.warn("Extractor needs string, got extractable value as dict, converting to string")
            text = extractable.get_string(joiner)
            if text:
                extracted_results = extractor.extract(text, **options)
        else:
            # Deliberately silent for now (no raise) — see TODO below.
            # raise ExtractorValueError("Extractor needs string, got " + str(type(extractable.value)))
            # TODO: Yixiang - needs to be handled properly
            pass
    elif extractor.input_type == InputType.OBJECT:
        # Object extractors receive the raw value unchanged.
        extracted_results = extractor.extract(extractable.value, **options)
    elif extractor.input_type == InputType.HTML:
        # Run only if the value actually parses as HTML (contains at least
        # one tag); non-HTML strings are silently skipped — see TODO below.
        if bool(BeautifulSoup(extractable.value, "html.parser").find()):
            extracted_results = extractor.extract(extractable.value, **options)
        else:
            # raise ExtractorValueError("Extractor needs HTML, got non HTML string")
            # TODO: Yixiang - needs to be handled properly
            pass

    # Not every extractable exposes full_path; fall back to None for the
    # provenance record's JSON path.
    try:
        jsonPath = extractable.full_path
    except AttributeError:
        jsonPath = None

    for e in extracted_results:
        # for the purpose of provenance hierarchy tracking, a parent's id for next generation.
        e.prov_id = self.provenance_id_index
        extraction_provenance_record: ExtractionProvenanceRecord = ExtractionProvenanceRecord(
            e.prov_id, jsonPath, e.provenance["extractor_name"],
            e.provenance["start_char"], e.provenance["end_char"],
            e.provenance["confidence"], self, extractable.prov_id)
        self._provenances[e.prov_id] = extraction_provenance_record
        # for the purpose of provenance hierarchy tracking
        self.provenance_id_index_incrementer()
        self.create_provenance(extraction_provenance_record)
    return extracted_results