Example #1
    def __init__(self, ignore_future_dates: bool = True, ignore_past_years: int = 40) -> None:
        self.ignore_future_dates = ignore_future_dates
        self.ignore_past_years = ignore_past_years
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="data extractor",
                           name="date parser")
Example #2
    def __init__(self,
                 nlp,
                 rules: Dict,
                 extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and construct spacy rules
        Args:
            nlp
            rules: Dict
            extractor_name: str

        Returns:
        """

        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules["field_name"]
        self.rule_lst = []
        for a_rule in self.rules:
            this_rule = Rule(a_rule, self.nlp)
            self.rule_lst.append(this_rule)
Example #3
    def __init__(self, name: str = None, custom_nlp: type = None) -> None:
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="Text extractor",
                           name=name if name else "Sentence extractor")

        load_parser = False
        if custom_nlp:
            try:
                custom_pipeline = copy.deepcopy(custom_nlp)
                pipe_names = custom_pipeline.pipe_names
                for pipe in pipe_names:
                    if pipe != "parser":
                        custom_pipeline.remove_pipe(pipe)

                try:
                    assert "parser" in custom_pipeline.pipe_names
                    self._parser = custom_pipeline
                except AssertionError:
                    print("Note: custom_pipeline does not have a parser. \n"
                          "Loading parser from en_core_web_sm... ")
                    load_parser = True

            except AttributeError as e:
                print("Note: custom_pipeline does not have expected "
                      "attributes.")
                print(e)
                print("Loading parser from en_core_web_sm...")
                load_parser = True
        else:
            load_parser = True

        if load_parser:
            self._parser = spacy.load("en_core_web_sm",
                                      disable=["tagger", "ner"])
Example #4
    def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and construct spacy rules
        Args:
            nlp
            rules (Dict): spacy rules
            extractor_name: str

        Returns:
        """

        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules.get("field_name", extractor_name)
        self.rule_lst = {}
        self.hash_map = {}
        for idx, a_rule in enumerate(self.rules):
            this_rule = Rule(a_rule, self.nlp)
            self.rule_lst[this_rule.identifier + "rule_id##" +
                          str(idx)] = this_rule
Example #5
    def __init__(self,
                 etk: ETK = None,
                 extractor_name: str = 'excel extractor') -> None:
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="data extractor",
                           name=extractor_name)
        self.etk = etk
Example #6
    def __init__(self, extractor_name: str, search_url: str, get_attr=False,
                 get_attr_url="http://dbpedia.org/sparql"):
        Extractor.__init__(self, input_type=InputType.TEXT,
                           category="built_in_extractor",
                           name=extractor_name)
        self._search_url = search_url
        self._get_attr = get_attr
        self._get_attr_url = get_attr_url
Example #7
    def invoke_extractor(self,
                         extractor: Extractor,
                         extractable: Extractable = None,
                         tokenizer: Tokenizer = None,
                         joiner: str = "  ",
                         **options) -> List[Extraction]:
        """
        Invoke the extractor on the given extractable, accumulating all the extractions in a list.

        Args:
            extractor (Extractor): the extractor to invoke
            extractable (Extractable): the object to extract from
            tokenizer (Tokenizer): user can pass a custom tokenizer if the extractor wants tokens
            joiner (str): user can pass a joiner string if the extractor wants text
            options: extra keyword arguments passed through to the extractor's extract() function

        Returns: List of Extraction, containing all the extractions.

        """
        if not extractable:
            extractable = self

        if not tokenizer:
            tokenizer = self.default_tokenizer

        extracted_results = list()

        if extractor.input_type == InputType.TOKENS:
            tokens = extractable.get_tokens(tokenizer)
            if tokens:
                extracted_results = extractor.extract(tokens, **options)

        elif extractor.input_type == InputType.TEXT:
            text = extractable.get_string(joiner)
            if text:
                extracted_results = extractor.extract(text, **options)

        elif extractor.input_type == InputType.OBJECT:
            extracted_results = extractor.extract(extractable.value, **options)

        elif extractor.input_type == InputType.HTML:
            extracted_results = extractor.extract(extractable.value, **options)

        for e in extracted_results:
            extraction_provenance_record: ExtractionProvenanceRecord = ExtractionProvenanceRecord(
                self.extraction_provenance_id_index, extractable.full_path,
                e.provenance["extractor_name"], e.provenance["start_char"],
                e.provenance["end_char"], e.provenance["confidence"],
                extractable.document, extractable.prov_id)
            e.prov_id = self.extraction_provenance_id_index  # for provenance hierarchy tracking
            self.extraction_provenance_id_index += 1
            self.create_provenance(extraction_provenance_record)
        # TODO: the reason that extractors must return Extraction objects is so that
        # they can communicate back the provenance.

        return extracted_results
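
For context, a hedged usage sketch of the method above; doc, glossary_extractor, and the JSONPath are illustrative names, and select_segments is assumed to return Extractable segments:

# Hypothetical usage (names are illustrative, not confirmed API):
segments = doc.select_segments("$.text")
for segment in segments:
    for extraction in doc.invoke_extractor(glossary_extractor, segment):
        print(extraction.value)  # Extraction is assumed to expose .value
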
Example #8
    def __init__(self,
                 glossary: List[str],
                 extractor_name: str,
                 tokenizer: Tokenizer,
                 ngrams: int = 2,
                 case_sensitive=False) -> None:
        Extractor.__init__(self,
                           input_type=InputType.TOKENS,
                           category="glossary",
                           name=extractor_name)
        self.ngrams = ngrams
        self.case_sensitive = case_sensitive
        self.default_tokenizer = tokenizer
        self.joiner = " "
        self.glossary = self.populate_trie(glossary)
Example #9
    def __init__(self, etk: ETK = None, extractor_name: str = 'date extractor') -> None:
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="data extractor",
                           name=extractor_name)

        # The 'final_regex' and 'symbol_list' are generated by 'DateRegexGenerator'
        # If the single regexes are changed or more patterns are added,
        # please re-generate 'final_regex' and 'symbol_list' and paste here.
        d = DateRegexGenerator(singleton_regex, units)
        self._final_regex = d.final_regex
        self._symbol_list = d.symbol_list
        self._settings = {}
        self._last_original_resolution = None
        self._etk = etk
        self._lan = 'en'
Example #10
    def invoke_extractor(self,
                         extractor: Extractor,
                         extractable: Extractable = None,
                         tokenizer: Tokenizer = None,
                         joiner: str = "  ",
                         **options) -> List[Extraction]:
        """
       Invoke the extractor on the given extractable, accumulating all the extractions in a list.

       Args:
           extractor (Extractor):
           extractable (extractable):
           tokenizer: user can pass custom tokenizer if extractor wants token
           joiner: user can pass joiner if extractor wants text
           options: user can pass arguments as a dict to the extract() function of different extractors

       Returns: List of Extraction, containing all the extractions.

       """
        if not extractable:
            extractable = self

        if not tokenizer:
            tokenizer = self.default_tokenizer

        extracted_results = list()

        if extractor.input_type == InputType.TOKENS:
            tokens = extractable.get_tokens(tokenizer)
            if tokens:
                extracted_results = extractor.extract(tokens, **options)

        elif extractor.input_type == InputType.TEXT:
            text = extractable.get_string(joiner)
            if text:
                extracted_results = extractor.extract(text, **options)

        elif extractor.input_type == InputType.OBJECT:
            extracted_results = extractor.extract(extractable.value, **options)

        elif extractor.input_type == InputType.HTML:
            extracted_results = extractor.extract(extractable.value, **options)

        # TODO: the reason that extractors must return Extraction objects is so that
        # they can communicate back the provenance.

        return extracted_results
Example #11
    def __init__(self):
        e_name = 'cryptographic hash extractor'
        # All three digest patterns accept upper- and lower-case hex digits.
        self._regex_extractors = [
            RegexExtractor(r"(\b[a-fA-F0-9]{32}\b)",
                           'md5 ' + e_name,
                           general_tag='md5'),
            RegexExtractor(r"(\b[a-fA-F0-9]{40}\b)",
                           'sha1 ' + e_name,
                           general_tag='sha1'),
            RegexExtractor(r"(\b[a-fA-F0-9]{64}\b)",
                           'sha256 ' + e_name,
                           general_tag='sha256'),
        ]
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="regex",
                           name=e_name)
Example #12
    def __init__(self, nlp, tokenizer, extractor_name: str) -> None:
        """
        Initialize the extractor, storing the rule information and construct spacy rules
        Args:
            nlp:
            tokenizer: Tokenizer
            extractor_name: str

        Returns:
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="build_in_extractor",
                           name=extractor_name)

        self._nlp = copy.deepcopy(nlp)
        self._like_email_matcher = Matcher(self._nlp.vocab)
        self._tokenizer = tokenizer
Example #13
    def __init__(
        self,
        decoding_dict: dict,
        extractor_name: str,
        default_action: str = 'delete',
        case_sensitive: bool = False,
        strip_key: bool = True,
        strip_value: bool = False,
    ) -> None:
        """

        Args:
            decoding_dict: dict -> a python dictionary for decoding values
            extractor_name: str -> extractor name
            default_action: enum['delete'] ->  what if the value not matched in dictionary
            case_sensitive: bool -> matching the key and value strictly or ignore cases
            strip_key: bool -> strip key and value for matching or not
            strip_value: bool -> return the striped value if matched or the original value
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="dictionary",
                           name=extractor_name)
        if case_sensitive and not strip_key:
            self.decoding_dict = decoding_dict
        else:
            new_dict = {}
            if not strip_key:  # not case_sensitive, ignore cases
                for k in decoding_dict:
                    new_dict[k.lower()] = decoding_dict[k]
            elif case_sensitive:  # strip key
                for k in decoding_dict:
                    new_dict[k.strip()] = decoding_dict[k]
            else:  # ignore case AND strip key
                for k in decoding_dict:
                    new_dict[k.lower().strip()] = decoding_dict[k]
            self.decoding_dict = new_dict

        self.case_sensitive = case_sensitive
        self.default_action = default_action
        self.strip_key = strip_key
        self.strip_value = strip_value

        self.joiner = " "
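
The key-normalization branches above are easiest to see on concrete data. A minimal sketch of the default path (not case_sensitive, strip_key=True), with illustrative values:

# Default path: keys are lower-cased AND stripped before matching.
decoding_dict = {"  YES  ": "y", "No": "n"}
new_dict = {k.lower().strip(): v for k, v in decoding_dict.items()}
assert new_dict == {"yes": "y", "no": "n"}
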
Example #14
    def __init__(self, email_url: str, mailing_list_name: str,
                 extractor_name: str) -> None:
        """
        Initialize the extractor, storing mailing list and message information
        Args:
            email_url: str
            mailing_list_name: str
            extractor_name: str

        Returns:
        """

        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="built_in_extractor",
                           name=extractor_name)

        self.email_url = email_url
        self.mailing_list_name = mailing_list_name
Example #15
    def __init__(self,
                 pattern: str,
                 extractor_name: str = 'regex extractor',
                 flags=0,
                 general_tag: str = None) -> None:
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="regex",
                           name=extractor_name)

        self._compiled_regex = re.compile(pattern, flags)
        self._general_tag = general_tag

        self._match_functions = {
            MatchMode.MATCH: self._compiled_regex.match,
            MatchMode.SEARCH: self._compiled_regex.search,
            MatchMode.FINDALL: self._compiled_regex.finditer,
            MatchMode.SPLIT: self._compiled_regex.split
        }
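
The _match_functions table maps each MatchMode to the corresponding method of the compiled pattern; match, search, finditer, and split are all standard re.Pattern methods. A standalone sketch of the same dispatch, with plain strings standing in for the MatchMode enum:

import re

compiled = re.compile(r"\d+")
match_functions = {
    "match": compiled.match,       # anchored at the start of the string
    "search": compiled.search,     # first occurrence anywhere
    "findall": compiled.finditer,  # iterator over all matches
    "split": compiled.split,       # split on every match
}
print(match_functions["split"]("a1b22c"))  # ['a', 'b', 'c']
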
Example #16
    def __init__(self,
                 glossary: List[str],
                 extractor_name: str,
                 tokenizer: Tokenizer,
                 ngrams: int = 2,
                 case_sensitive=False) -> None:
        Extractor.__init__(self,
                           input_type=InputType.TOKENS,
                           category="glossary",
                           name=extractor_name)

        self._case_sensitive = case_sensitive
        self._default_tokenizer = tokenizer
        if not ngrams:
            ngrams = 0
            for word in glossary:
                ngrams = max(ngrams, len(self._default_tokenizer.tokenize(word)))
        self._ngrams = min(ngrams, 5)
        self._joiner = " "
        self._glossary = self._populate_trie(glossary)
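
When ngrams is falsy, the constructor sizes the n-grams to the longest glossary entry, capped at 5 tokens. A standalone sketch with a whitespace split standing in for the Tokenizer (illustrative data):

glossary = ["Los Angeles", "New York City", "Tokyo"]
ngrams = 0
for word in glossary:
    ngrams = max(ngrams, len(word.split()))  # whitespace split stands in for Tokenizer
assert min(ngrams, 5) == 3                   # "New York City" has 3 tokens
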
Example #17
    def __init__(self,
                 extractor_name: str,
                 tokenizer: Tokenizer = None,
                 ngrams: int = 2,
                 case_sensitive=False,
                 redis_host="localhost",
                 redis_port=6379,
                 redis_key_prefix="") -> None:
        # If tokenizer is None, the extractor falls back to a regex for token
        # extraction, which speeds things up.
        Extractor.__init__(self,
                           input_type=InputType.TOKENS,
                           category="glossary",
                           name=extractor_name)
        self._case_sensitive = case_sensitive
        self._default_tokenizer = tokenizer
        self._ngrams = min(ngrams, 5)
        self._joiner = " "
        self._redisconn = redis.StrictRedis(host=redis_host,
                                            port=int(redis_port),
                                            decode_responses=True)
        self._key_prefix = redis_key_prefix
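
The constructor only opens the connection; the glossary terms are expected to already live in Redis under redis_key_prefix. A hedged setup sketch (the key layout is an assumption, not the library's documented schema):

import redis

conn = redis.StrictRedis(host="localhost", port=6379, decode_responses=True)
# Hypothetical preload of glossary terms under a shared prefix.
conn.sadd("glossary:cities", "los angeles", "new york")
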
Example #18
    def __init__(self) -> None:
        Extractor.__init__(self,
                           input_type=InputType.OBJECT,
                           category="data",
                           name=EntityTableDataExtraction.extractor_name)
        self.glossaries = dict()
Example #19
    def __init__(self) -> None:
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="content",
                           name=TableExtractor.extractor_name)
Example #20
    def __init__(self, extractor_name: str) -> None:
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="data extractor",
                           name=extractor_name)
Example #21
    def __init__(self, extractor_name: str, search_url: str):
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="built_in_extractor",
                           name=extractor_name)
        self.search_url = search_url
Example #22
    def __init__(self, rule_set: InferlinkRuleSet):
        Extractor.__init__(self,
                           input_type=InputType.HTML,
                           category="HTML extractor",
                           name="Inferlink extractor")
        self.rule_set = rule_set
Example #23
    def __init__(self):
        Extractor.__init__(self,
                           input_type=InputType.HTML,
                           category="HTML extractor",
                           name="HTML metadata extractor")
     """
Example #24
    def __init__(self):
        Extractor.__init__(self,
                           input_type=InputType.HTML,
                           category="HTML extractor",
                           name="HTML content extractor")
Example #25
    def extract(self, extractor: Extractor, extractable: Extractable = None, tokenizer: Tokenizer = None,
                joiner: str = "  ", **options) -> List[Extraction]:

        """
        Invoke the extractor on the given extractable, accumulating all the extractions in a list.

        Args:
            extractor (Extractor): the extractor to invoke
            extractable (Extractable): the object to extract from
            tokenizer (Tokenizer): user can pass a custom tokenizer if the extractor wants tokens
            joiner (str): user can pass a joiner string if the extractor wants text
            options: extra keyword arguments passed through to the extractor's extract() function

        Returns: List of Extraction, containing all the extractions.

        """
        if not extractable:
            extractable = self

        if not tokenizer:
            tokenizer = self.etk.default_tokenizer

        extracted_results = list()

        if extractor.input_type == InputType.TOKENS:
            if self.etk.error_policy == ErrorPolicy.PROCESS:
                if isinstance(extractable.value, list):
                    self.etk.log(
                        "Extractor needs tokens, tokenizer needs string to tokenize, got list, converting to string",
                        "warning", self.doc_id, self.url)
                    warnings.warn(
                        "Extractor needs tokens, tokenizer needs string to tokenize, got list, converting to string")
                elif isinstance(extractable.value, dict):
                    self.etk.log(
                        "Extractor needs tokens, tokenizer needs string to tokenize, got dict, converting to string",
                        "warning", self.doc_id, self.url)
                    warnings.warn(
                        "Extractor needs tokens, tokenizer needs string to tokenize, got dict, converting to string")
                tokens = extractable.get_tokens(tokenizer)
                if tokens:
                    extracted_results = extractor.extract(tokens, **options)
            else:
                raise ExtractorValueError(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got " + str(type(extractable.value)))

        elif extractor.input_type == InputType.TEXT:
            if self.etk.error_policy == ErrorPolicy.PROCESS:
                if isinstance(extractable.value, list):
                    self.etk.log("Extractor needs string, got extractable value as list, converting to string",
                                 "warning", self.doc_id, self.url)
                    warnings.warn("Extractor needs string, got extractable value as list, converting to string")
                elif isinstance(extractable.value, dict):
                    self.etk.log("Extractor needs string, got extractable value as dict, converting to string",
                                 "warning", self.doc_id, self.url)
                    warnings.warn("Extractor needs string, got extractable value as dict, converting to string")
                text = extractable.get_string(joiner)
                if text:
                    extracted_results = extractor.extract(text, **options)
            else:
                # raise ExtractorValueError("Extractor needs string, got " + str(type(extractable.value)))
                # TODO: Yixiang - needs to be handled properly
                pass

        elif extractor.input_type == InputType.OBJECT:
            extracted_results = extractor.extract(extractable.value, **options)

        elif extractor.input_type == InputType.HTML:
            if bool(BeautifulSoup(extractable.value, "html.parser").find()):
                extracted_results = extractor.extract(extractable.value, **options)
            else:
                # raise ExtractorValueError("Extractor needs HTML, got non HTML string")
                # TODO: Yixiang - needs to be handled properly
                pass

        try:
            jsonPath = extractable.full_path
        except AttributeError:
            jsonPath = None

        for e in extracted_results:
            # for the purpose of provenance hierarchy tracking, a parent's id for next generation.
            e.prov_id = self.provenance_id_index
            extraction_provenance_record: ExtractionProvenanceRecord = ExtractionProvenanceRecord(
                e.prov_id, jsonPath, e.provenance["extractor_name"],
                e.provenance["start_char"], e.provenance["end_char"], e.provenance["confidence"], self,
                extractable.prov_id)
            self._provenances[e.prov_id] = extraction_provenance_record

            # for the purpose of provenance hierarchy tracking
            self.provenance_id_index_incrementer()
            self.create_provenance(extraction_provenance_record)

        return extracted_results
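
The HTML branch decides whether the input is markup by asking BeautifulSoup for any tag at all; a standalone illustration of that check:

from bs4 import BeautifulSoup

assert bool(BeautifulSoup("<p>hello</p>", "html.parser").find())         # a tag was found
assert not bool(BeautifulSoup("plain text only", "html.parser").find())  # no tags at all
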
Example #26
    def __init__(self, extractor_name: str, nlp=spacy.load('en_core_web_sm')):
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="built_in_extractor",
                           name=extractor_name)
        self.__nlp = nlp
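
Note that the spacy.load(...) default argument is evaluated once, when the function is defined, so every instance that omits nlp shares a single pipeline loaded at import time. A common alternative (a sketch, not the library's code) defers loading to call time:

import spacy

class LazyNlpExtractor:  # hypothetical stand-in for the class above
    def __init__(self, extractor_name: str, nlp=None):
        # Load the model only if the caller did not supply a pipeline.
        self._nlp = nlp if nlp is not None else spacy.load("en_core_web_sm")
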
Example #27
    def __init__(self):
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="Text extractor",
                           name="Language Identification")
Example #28
    def __init__(self) -> None:
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="content",
                           name="DigTableExtractor")
        self.tableExtractorInstance = TableExtraction()