Пример #1
0
    def invoke_extractor(self,
                         extractor: Extractor,
                         extractable: Extractable = None,
                         tokenizer: Tokenizer = None,
                         joiner: str = "  ",
                         **options) -> List[Extraction]:
        """
        Run ``extractor`` on an extractable and register a provenance record for every result.

        The extractor's ``input_type`` decides what is handed to ``extractor.extract``:
        tokens from ``extractable.get_tokens(tokenizer)``, text from
        ``extractable.get_string(joiner)``, or the raw ``extractable.value`` for
        object and HTML extractors.

        Args:
            extractor (Extractor): extractor to invoke.
            extractable (Extractable): source of the input; ``self`` is used when omitted or falsy.
            tokenizer: tokenizer for token-based extractors; ``self.default_tokenizer`` is used
                when omitted or falsy.
            joiner: passed to ``extractable.get_string`` for text-based extractors.
            options: forwarded unchanged as keyword arguments to ``extractor.extract``.

        Returns: whatever ``extractor.extract`` returned, or an empty list when the
            tokens/text were empty or the input type matched none of the handled kinds.

        """
        extractable = extractable or self
        tokenizer = tokenizer or self.default_tokenizer

        wanted = extractor.input_type
        results = []

        if wanted == InputType.TOKENS:
            tokens = extractable.get_tokens(tokenizer)
            if tokens:
                results = extractor.extract(tokens, **options)
        elif wanted == InputType.TEXT:
            text = extractable.get_string(joiner)
            if text:
                results = extractor.extract(text, **options)
        elif wanted in (InputType.OBJECT, InputType.HTML):
            # Object and HTML extractors both consume the raw value as-is.
            results = extractor.extract(extractable.value, **options)

        # Extractors return Extraction objects so that they can communicate their
        # provenance back; each one is turned into a provenance record here.
        for extraction in results:
            record_id = self.extraction_provenance_id_index
            details = extraction.provenance
            record: ExtractionProvenanceRecord = ExtractionProvenanceRecord(
                record_id,
                extractable.full_path,
                details["extractor_name"],
                details["start_char"],
                details["end_char"],
                details["confidence"],
                extractable.document,
                extractable.prov_id)
            # The extraction carries the record id so provenance hierarchy can be tracked.
            extraction.prov_id = record_id
            self.extraction_provenance_id_index = record_id + 1
            self.create_provenance(record)

        return results
Пример #2
0
    def invoke_extractor(self,
                         extractor: Extractor,
                         extractable: Extractable = None,
                         tokenizer: Tokenizer = None,
                         joiner: str = "  ",
                         **options) -> List[Extraction]:
        """
        Feed the extractable to ``extractor`` in the form its ``input_type`` asks for.

        Args:
            extractor (Extractor): extractor to invoke.
            extractable (Extractable): source of the input; ``self`` is used when omitted or falsy.
            tokenizer: tokenizer for token-based extractors; ``self.default_tokenizer`` is used
                when omitted or falsy.
            joiner: passed to ``extractable.get_string`` for text-based extractors.
            options: forwarded unchanged as keyword arguments to ``extractor.extract``.

        Returns: the result of ``extractor.extract``, or an empty list when the
            tokens/text were empty or the input type matched none of the handled kinds.

        """
        source = extractable if extractable else self
        active_tokenizer = tokenizer if tokenizer else self.default_tokenizer

        wanted = extractor.input_type

        if wanted == InputType.TOKENS:
            tokens = source.get_tokens(active_tokenizer)
            return extractor.extract(tokens, **options) if tokens else []

        if wanted == InputType.TEXT:
            text = source.get_string(joiner)
            return extractor.extract(text, **options) if text else []

        if wanted in (InputType.OBJECT, InputType.HTML):
            # Object and HTML extractors both consume the raw value as-is.
            return extractor.extract(source.value, **options)

        # Extractors return Extraction objects so that they can communicate
        # provenance back; an unhandled input type yields no extractions.
        return []
Пример #3
0
    def extract(self, extractor: Extractor, extractable: Extractable = None, tokenizer: Tokenizer = None,
                joiner: str = "  ", **options) -> List[Extraction]:

        """
        Invoke the extractor on the given extractable and record provenance for each extraction.

        The extractor's ``input_type`` decides what is passed to ``extractor.extract``:

        * TOKENS: ``extractable.get_tokens(tokenizer)``; only under ``ErrorPolicy.PROCESS``.
        * TEXT: ``extractable.get_string(joiner)``; only under ``ErrorPolicy.PROCESS``.
        * OBJECT: ``extractable.value`` as-is.
        * HTML: ``extractable.value``, only if BeautifulSoup finds at least one tag in it.

        Args:
            extractor (Extractor): extractor to invoke.
            extractable (Extractable): source of the input; ``self`` is used when omitted or falsy.
            tokenizer: tokenizer for token-based extractors; ``self.etk.default_tokenizer`` is used
                when omitted or falsy.
            joiner: passed to ``extractable.get_string`` for text-based extractors.
            options: forwarded unchanged as keyword arguments to ``extractor.extract``.

        Returns: List of Extraction as returned by ``extractor.extract``, or an empty list
            when nothing was extracted. Each returned extraction has ``prov_id`` set.

        Raises:
            ExtractorValueError: the extractor wants tokens and the error policy is not
                ``ErrorPolicy.PROCESS``.

        """
        if not extractable:
            extractable = self

        if not tokenizer:
            tokenizer = self.etk.default_tokenizer

        extracted_results = []
        input_type = extractor.input_type

        if input_type == InputType.TOKENS:
            if self.etk.error_policy != ErrorPolicy.PROCESS:
                # NOTE(review): this raises for every value type, including plain strings,
                # whenever the policy is not PROCESS -- confirm that is intended.
                raise ExtractorValueError(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got " + str(type(extractable.value)))
            self._warn_on_container_value(
                extractable.value,
                "Extractor needs tokens, tokenizer needs string to tokenize, got {}, converting to string")
            tokens = extractable.get_tokens(tokenizer)
            if tokens:
                extracted_results = extractor.extract(tokens, **options)

        elif input_type == InputType.TEXT:
            if self.etk.error_policy == ErrorPolicy.PROCESS:
                self._warn_on_container_value(
                    extractable.value,
                    "Extractor needs string, got extractable value as {}, converting to string")
                text = extractable.get_string(joiner)
                if text:
                    extracted_results = extractor.extract(text, **options)
            # TODO: Yixiang - other error policies need to be handled properly; for now
            # nothing is extracted and no error is raised.

        elif input_type == InputType.OBJECT:
            extracted_results = extractor.extract(extractable.value, **options)

        elif input_type == InputType.HTML:
            # Only hand the value to the extractor if the parser finds at least one tag.
            if bool(BeautifulSoup(extractable.value, "html.parser").find()):
                extracted_results = extractor.extract(extractable.value, **options)
            # TODO: Yixiang - non-HTML input needs to be handled properly; for now it is
            # skipped without raising.

        # Not every extractable exposes ``full_path``; fall back to None when it is missing.
        json_path = getattr(extractable, "full_path", None)

        for extraction in extracted_results:
            # For provenance hierarchy tracking: this id serves as the parent's id
            # for the next generation of extractions.
            extraction.prov_id = self.provenance_id_index
            details = extraction.provenance
            extraction_provenance_record: ExtractionProvenanceRecord = ExtractionProvenanceRecord(
                extraction.prov_id, json_path, details["extractor_name"],
                details["start_char"], details["end_char"], details["confidence"], self,
                extractable.prov_id)
            self._provenances[extraction.prov_id] = extraction_provenance_record

            # Advance the id counter before handing the record over.
            self.provenance_id_index_incrementer()
            self.create_provenance(extraction_provenance_record)

        return extracted_results

    def _warn_on_container_value(self, value, message_template: str) -> None:
        """
        Log and warn when ``value`` is a list or a dict; do nothing for any other type.

        Args:
            value: the extractable's value to inspect.
            message_template: message with one ``{}`` placeholder that receives
                ``"list"`` or ``"dict"``.
        """
        if isinstance(value, list):
            kind = "list"
        elif isinstance(value, dict):
            kind = "dict"
        else:
            return
        message = message_template.format(kind)
        self.etk.log(message, "warning", self.doc_id, self.url)
        # stacklevel=2 attributes the warning to the calling line in extract().
        warnings.warn(message, stacklevel=2)