Example #1
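# Module-level context (from the etk package): this method relies on warnings,
# typing.List, bs4.BeautifulSoup, and etk's Extractor, Extractable, Extraction,
# Tokenizer, InputType, ErrorPolicy, ExtractorValueError, and
# ExtractionProvenanceRecord.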
    def extract(self, extractor: Extractor, extractable: Extractable = None, tokenizer: Tokenizer = None,
                joiner: str = "  ", **options) -> List[Extraction]:

        """
        Invoke the extractor on the given extractable, accumulating all the extractions in a list.

        Args:
            extractor (Extractor): the extractor to invoke.
            extractable (Extractable): the object to extract from; defaults to the document itself.
            tokenizer (Tokenizer): optional custom tokenizer, used when the extractor consumes tokens.
            joiner (str): string used to join segments when the extractor consumes text.
            options: keyword arguments forwarded to the extract() function of the specific extractor.

        Returns: List[Extraction] containing all the extractions.

        """
        if not extractable:
            extractable = self

        if not tokenizer:
            tokenizer = self.etk.default_tokenizer

        extracted_results = list()

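        # Dispatch on the extractor's declared input type: tokens, text, raw
        # objects, and HTML each take a different preparation path below.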
        if extractor.input_type == InputType.TOKENS:
            if self.etk.error_policy == ErrorPolicy.PROCESS:
                if isinstance(extractable.value, list):
                    self.etk.log(
                        "Extractor needs tokens, tokenizer needs string to tokenize, got list, converting to string",
                        "warning", self.doc_id, self.url)
                    warnings.warn(
                        "Extractor needs tokens, tokenizer needs string to tokenize, got list, converting to string")
                elif isinstance(extractable.value, dict):
                    self.etk.log(
                        "Extractor needs tokens, tokenizer needs string to tokenize, got dict, converting to string",
                        "warning", self.doc_id, self.url)
                    warnings.warn(
                        "Extractor needs tokens, tokenizer needs string to tokenize, got dict, converting to string")
                tokens = extractable.get_tokens(tokenizer)
                if tokens:
                    extracted_results = extractor.extract(tokens, **options)
            else:
                raise ExtractorValueError(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got " + str(type(extractable.value)))

        elif extractor.input_type == InputType.TEXT:
            if self.etk.error_policy == ErrorPolicy.PROCESS:
                if isinstance(extractable.value, list):
                    self.etk.log("Extractor needs string, got extractable value as list, converting to string",
                                 "warning", self.doc_id, self.url)
                    warnings.warn("Extractor needs string, got extractable value as list, converting to string")
                elif isinstance(extractable.value, dict):
                    self.etk.log("Extractor needs string, got extractable value as dict, converting to string",
                                 "warning", self.doc_id, self.url)
                    warnings.warn("Extractor needs string, got extractable value as dict, converting to string")
                text = extractable.get_string(joiner)
                if text:
                    extracted_results = extractor.extract(text, **options)
            else:
                # raise ExtractorValueError("Extractor needs string, got " + str(type(extractable.value)))
                # TODO: Yixiang - needs to be handled properly
                pass

        elif extractor.input_type == InputType.OBJECT:
            extracted_results = extractor.extract(extractable.value, **options)

        elif extractor.input_type == InputType.HTML:
            if bool(BeautifulSoup(extractable.value, "html.parser").find()):
                extracted_results = extractor.extract(extractable.value, **options)
            else:
                # raise ExtractorValueError("Extractor needs HTML, got non HTML string")
                # TODO: Yixiang - needs to be handled properly
                pass

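        # Record the extractable's JSON path for provenance; extractables that
        # are not document segments have no full_path attribute.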
        try:
            jsonPath = extractable.full_path
        except AttributeError:
            jsonPath = None

        for e in extracted_results:
            # for the purpose of provenance hierarchy tracking: a parent's id for the next generation.
            e.prov_id = self.provenance_id_index
            extraction_provenance_record: ExtractionProvenanceRecord = ExtractionProvenanceRecord(
                e.prov_id, jsonPath, e.provenance["extractor_name"],
                e.provenance["start_char"], e.provenance["end_char"], e.provenance["confidence"], self,
                extractable.prov_id)
            self._provenances[e.prov_id] = extraction_provenance_record

            # for the purpose of provenance hierarchy tracking
            self.provenance_id_index_incrementer()
            self.create_provenance(extraction_provenance_record)

        return extracted_results
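
A minimal usage sketch of the method above, with hypothetical names: etk_instance stands for an ETK object and my_extractor for any Extractor whose input_type is InputType.TEXT; the select_segments() call assumes etk's usual JSONPath segment API.

doc = etk_instance.create_document({"projects": [{"description": "a dig deep crawler"}]})
for segment in doc.select_segments("projects[*].description"):
    # extract() takes the TEXT branch above and records provenance for each result.
    extractions = doc.extract(my_extractor, extractable=segment, joiner=" ")
    for e in extractions:
        print(e.value, e.prov_id)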
Example #2
    def test_Extractable(self) -> None:
        e = Extractable({
            'extracted_value': [{
                1: 2,
                'das': [1, 2, 3]
            }],
            'confidence': 2.3
        })
        t = Tokenizer(keep_multi_space=False)
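        # get_tokens() first flattens the nested dict into a "key : value" string
        # (compare expected_str below), then tokenizes that string.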
        tokens = e.get_tokens(t)
        token_attrs = []
        for i in tokens:
            token_attrs.append({
                "orth": i.orth_,
                "offset": i.idx,
                "full_shape": i._.full_shape
            })
        expected_token = [{
            'orth': 'extracted',
            'offset': 0,
            'full_shape': 'xxxxxxxxx'
        }, {
            'orth': '_',
            'offset': 9,
            'full_shape': '_'
        }, {
            'orth': 'value',
            'offset': 10,
            'full_shape': 'xxxxx'
        }, {
            'orth': ':',
            'offset': 16,
            'full_shape': ':'
        }, {
            'orth': '1',
            'offset': 18,
            'full_shape': 'd'
        }, {
            'orth': ':',
            'offset': 20,
            'full_shape': ':'
        }, {
            'orth': '2',
            'offset': 22,
            'full_shape': 'd'
        }, {
            'orth': 'das',
            'offset': 24,
            'full_shape': 'xxx'
        }, {
            'orth': ':',
            'offset': 28,
            'full_shape': ':'
        }, {
            'orth': '1',
            'offset': 30,
            'full_shape': 'd'
        }, {
            'orth': '2',
            'offset': 32,
            'full_shape': 'd'
        }, {
            'orth': '3',
            'offset': 34,
            'full_shape': 'd'
        }, {
            'orth': 'confidence',
            'offset': 36,
            'full_shape': 'xxxxxxxxxx'
        }, {
            'orth': ':',
            'offset': 47,
            'full_shape': ':'
        }, {
            'orth': '2.3',
            'offset': 49,
            'full_shape': 'd.d'
        }]

        self.assertEqual(token_attrs, expected_token)
        text = e.get_string()
        expected_str = "extracted_value : 1 : 2 das : 1 2 3    confidence : 2.3 "

        self.assertEqual(text, expected_str)
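
As a follow-up, a sketch of the same flattening with a non-default joiner. This assumes get_string() accepts the joiner positionally, matching Example #1's call extractable.get_string(joiner); the exact output depends on how the library flattens nested values.

e2 = Extractable({'das': [1, 2, 3]})
# The joiner is passed through to the string flattening, per Example #1's usage.
print(e2.get_string(' | '))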