示例#1
0
    def __init__(self, name: str = None, custom_nlp: type = None) -> None:
        """
        Initialize the sentence extractor, taking the parser pipe from a
        user-supplied spacy pipeline or falling back to en_core_web_sm.

        Args:
            name: optional extractor name; defaults to "Sentence extractor"
            custom_nlp: optional spacy pipeline whose parser should be reused

        Returns: None
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="Text extractor",
                           name=name if name else "Sentence extractor")

        load_parser = False
        if custom_nlp:
            try:
                # Work on a deep copy so the caller's pipeline is not mutated.
                custom_pipeline = copy.deepcopy(custom_nlp)
                pipe_names = custom_pipeline.pipe_names
                for pipe in pipe_names:
                    if pipe != "parser":
                        custom_pipeline.remove_pipe(pipe)

                # BUG FIX: the original used `assert` for this check, which is
                # stripped under `python -O`, silently accepting a pipeline
                # with no parser. An explicit `if` is always evaluated.
                if "parser" in custom_pipeline.pipe_names:
                    self._parser = custom_pipeline
                else:
                    print("Note: custom_pipeline does not have a parser. \n"
                          "Loading parser from en_core_web_sm... ")
                    load_parser = True

            except AttributeError as e:
                # custom_nlp was not a spacy pipeline (no pipe_names etc.).
                print("Note: custom_pipeline does not have expected "
                      "attributes.")
                print(e)
                print("Loading parser from en_core_web_sm...")
                load_parser = True
        else:
            load_parser = True

        if load_parser:
            # Only the parser is needed; disable the other default pipes.
            self._parser = spacy.load("en_core_web_sm",
                                      disable=["tagger", "ner"])
    def __init__(self,
                 nlp,
                 rules: Dict,
                 extractor_name: str) -> None:
        """
        Store the rule definitions and build the spacy matching state
        needed to run them.

        Args:
            nlp: spacy language pipeline (deep-copied so the caller's is untouched)
            rules: Dict with "rules" and "field_name" entries
            extractor_name: str name registered with the base Extractor

        Returns: None
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        self.field_name = rules["field_name"]
        # Compile every rule once up front so extraction never re-parses them.
        self.rule_lst = [Rule(rule_spec, self.nlp) for rule_spec in self.rules]
示例#3
0
 def __init__(self, ignore_future_dates: bool=True, ignore_past_years: int=40) -> None:
     """
     Record the date-filtering preferences and register with the base Extractor.

     Args:
         ignore_future_dates: flag stored for later use by the extractor
         ignore_past_years: year horizon stored for later use by the extractor
     """
     Extractor.__init__(self,
                        input_type=InputType.TEXT,
                        category="data extractor",
                        name="date parser")
     self.ignore_future_dates = ignore_future_dates
     self.ignore_past_years = ignore_past_years
示例#4
0
    def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
        """
        Keep the rule specs and build the spacy matching machinery.

        Args:
            nlp: spacy pipeline; a private deep copy is stored
            rules (Dict): spacy rules, with an optional "field_name" entry
            extractor_name: str

        Returns: None
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="spacy_rule_extractor",
                           name=extractor_name)
        self.rules = rules["rules"]
        self.nlp = copy.deepcopy(nlp)
        self.tokenizer = Tokenizer(self.nlp)
        self.matcher = Matcher(self.nlp.vocab)
        # Fall back to the extractor name when no field name was supplied.
        self.field_name = rules.get("field_name", extractor_name)
        self.rule_lst = {}
        self.hash_map = {}
        # Key each compiled rule by its identifier plus a positional suffix
        # so rules sharing an identifier stay distinct.
        for pos, spec in enumerate(self.rules):
            compiled = Rule(spec, self.nlp)
            self.rule_lst[compiled.identifier + "rule_id##" + str(pos)] = compiled
示例#5
0
 def __init__(self,
              etk: ETK = None,
              extractor_name: str = 'excel extractor') -> None:
     """
     Register as a text-input data extractor and keep a handle to the
     ETK instance (may be None).

     Args:
         etk: ETK instance stored for later use, or None
         extractor_name: name registered with the base Extractor
     """
     Extractor.__init__(self,
                        input_type=InputType.TEXT,
                        category="data extractor",
                        name=extractor_name)
     self.etk = etk
示例#6
0
 def __init__(self, extractor_name: str, search_url: str, get_attr=False,
              get_attr_url="http://dbpedia.org/sparql"):
     """
     Store the lookup-endpoint configuration.

     Args:
         extractor_name: name passed through to the base Extractor
         search_url: endpoint used for searches
         get_attr: flag stored for later use
         get_attr_url: endpoint stored for later use when get_attr applies
     """
     Extractor.__init__(self, input_type=InputType.TEXT,
                        category="built_in_extractor",
                        name=extractor_name)
     self._search_url = search_url
     self._get_attr_url = get_attr_url
     self._get_attr = get_attr
示例#7
0
 def __init__(self,
              glossary: List[str],
              extractor_name: str,
              tokenizer: Tokenizer,
              ngrams: int=2,
              case_sensitive=False) -> None:
     """
     Build a trie-backed glossary extractor over token input.

     Args:
         glossary: terms loaded into the trie
         extractor_name: name registered with the base Extractor
         tokenizer: default tokenizer kept on the instance
         ngrams: n-gram setting kept on the instance
         case_sensitive: matching-case flag kept on the instance
     """
     Extractor.__init__(self,
                        input_type=InputType.TOKENS,
                        category="glossary",
                        name=extractor_name)
     self.case_sensitive = case_sensitive
     self.ngrams = ngrams
     self.joiner = " "
     self.default_tokenizer = tokenizer
     # Populate the trie last so all the settings above are already in place.
     self.glossary = self.populate_trie(glossary)
示例#8
0
    def __init__(self, etk: ETK=None, extractor_name: str='date extractor') -> None:
        """
        Build the combined date regex and initialize parser defaults.

        Args:
            etk: ETK instance stored on the extractor, or None
            extractor_name: name registered with the base Extractor
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="data extractor",
                           name=extractor_name)

        # The 'final_regex' and 'symbol_list' are generated by 'DateRegexGenerator'
        # If the single regexes are changed or more patterns are added,
        # please re-generate 'final_regex' and 'symbol_list' and paste here.
        generator = DateRegexGenerator(singleton_regex, units)
        self._final_regex = generator.final_regex
        self._symbol_list = generator.symbol_list
        self._settings = {}
        self._last_original_resolution = None
        self._etk = etk
        self._lan = 'en'
示例#9
0
 def __init__(self):
     """
     Build one RegexExtractor per supported cryptographic hash type
     (md5, sha1, sha256), each matching hex digests of the expected length.
     """
     e_name = 'cryptographic hash extractor'
     self._regex_extractors = [
         RegexExtractor(r"(\b[a-fA-F\d]{32}\b)",
                        'md5 ' + e_name,
                        general_tag='md5'),
         # BUG FIX: the original sha1 pattern [0-9a-f]{40} was lowercase-only,
         # missing uppercase digests, while the md5 and sha256 patterns accept
         # both cases. Made sha1 consistent with its siblings.
         RegexExtractor(r"(\b[0-9a-fA-F]{40}\b)",
                        'sha1 ' + e_name,
                        general_tag='sha1'),
         RegexExtractor(r"(\b[A-Fa-f0-9]{64}\b)",
                        'sha256 ' + e_name,
                        general_tag='sha256'),
     ]
     Extractor.__init__(self,
                        input_type=InputType.TEXT,
                        category="regex",
                        name=e_name)
示例#10
0
    def __init__(self, nlp, tokenizer, extractor_name: str) -> None:
        """
        Store a private copy of the pipeline and set up the email matcher.

        Args:
            nlp: spacy pipeline; a private deep copy is kept
            tokenizer: Tokenizer
            extractor_name: str

        Returns: None
        """
        # NOTE(review): category reads "build_in_extractor" while sibling
        # extractors use "built_in_extractor" — kept byte-identical here;
        # confirm whether the spelling difference is intentional.
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="build_in_extractor",
                           name=extractor_name)

        self._nlp = copy.deepcopy(nlp)
        self._tokenizer = tokenizer
        self._like_email_matcher = Matcher(self._nlp.vocab)
示例#11
0
    def __init__(
        self,
        decoding_dict: dict,
        extractor_name: str,
        default_action: str = 'delete',
        case_sensitive: bool = False,
        strip_key: bool = True,
        strip_value: bool = False,
    ) -> None:
        """
        Args:
            decoding_dict: dict -> a python dictionary for decoding values
            extractor_name: str -> extractor name
            default_action: enum['delete'] ->  what if the value not matched in dictionary
            case_sensitive: bool -> matching the key and value strictly or ignore cases
            strip_key: bool -> strip key and value for matching or not
            strip_value: bool -> return the striped value if matched or the original value
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="dictionary",
                           name=extractor_name)

        if case_sensitive and not strip_key:
            # Keys are usable exactly as given — no normalization needed.
            self.decoding_dict = decoding_dict
        else:
            # Normalize every key once up front so lookups are cheap later.
            if not strip_key:
                normalize = str.lower              # not case_sensitive: ignore cases
            elif case_sensitive:
                normalize = str.strip              # strip key only
            else:
                def normalize(key):                # ignore case AND strip key
                    return key.lower().strip()
            self.decoding_dict = {
                normalize(k): v for k, v in decoding_dict.items()
            }

        self.case_sensitive = case_sensitive
        self.default_action = default_action
        self.strip_key = strip_key
        self.strip_value = strip_value

        self.joiner = " "
示例#12
0
    def __init__(self, email_url: str, mailing_list_name: str,
                 extractor_name: str) -> None:
        """
        Initialize the extractor, storing mailing list and message information.

        Args:
            email_url: str
            mailing_list_name: str
            extractor_name: str

        Returns: None
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="build_in_extractor",
                           name=extractor_name)
        self.mailing_list_name = mailing_list_name
        self.email_url = email_url
示例#13
0
    def __init__(self,
                 pattern: str,
                 extractor_name: str = 'regex extractor',
                 flags=0,
                 general_tag: str = None) -> None:
        """
        Compile the pattern and map each MatchMode to the corresponding
        method of the compiled regex.

        Args:
            pattern: regular expression source string
            extractor_name: name registered with the base Extractor
            flags: re flags forwarded to re.compile
            general_tag: optional tag stored on the extractor
        """
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="regex",
                           name=extractor_name)

        compiled = re.compile(pattern, flags)
        self._compiled_regex = compiled
        self._general_tag = general_tag

        # FINDALL deliberately maps to finditer (lazy match objects).
        self._match_functions = {
            MatchMode.MATCH: compiled.match,
            MatchMode.SEARCH: compiled.search,
            MatchMode.FINDALL: compiled.finditer,
            MatchMode.SPLIT: compiled.split,
        }
示例#14
0
    def __init__(self,
                 glossary: List[str],
                 extractor_name: str,
                 tokenizer: Tokenizer,
                 ngrams: int = 2,
                 case_sensitive=False) -> None:
        """
        Build a trie-backed glossary extractor over token input.

        Args:
            glossary: terms loaded into the trie
            extractor_name: name registered with the base Extractor
            tokenizer: default tokenizer for glossary entries
            ngrams: max n-gram length; a falsy value means "derive from glossary"
            case_sensitive: matching-case flag kept on the instance
        """
        Extractor.__init__(self,
                           input_type=InputType.TOKENS,
                           category="glossary",
                           name=extractor_name)

        self._case_sensitive = case_sensitive
        self._default_tokenizer = tokenizer
        if not ngrams:
            # Derive the n-gram size from the longest glossary entry.
            ngrams = max(
                (len(self._default_tokenizer.tokenize(term)) for term in glossary),
                default=0)
        self._ngrams = min(ngrams, 5)   # hard cap at 5-grams
        self._joiner = " "
        self._glossary = self._populate_trie(glossary)
示例#15
0
 def __init__(self,
              extractor_name: str,
              tokenizer: None,
              ngrams: int = 2,
              case_sensitive=False,
              redis_host="localhost",
              redis_port=6379,
              redis_key_prefix="") -> None:
     """
     Glossary extractor backed by a redis store.

     If tokenizer is None, the extractor will use regex to extract tokens,
     which expedites the extraction.
     """
     Extractor.__init__(self,
                        input_type=InputType.TOKENS,
                        category="glossary",
                        name=extractor_name)
     self._case_sensitive = case_sensitive
     self._default_tokenizer = tokenizer
     self._ngrams = min(ngrams, 5)
     self._joiner = " "
     self._key_prefix = redis_key_prefix
     # decode_responses=True makes redis return str instead of bytes.
     self._redisconn = redis.StrictRedis(host=redis_host,
                                         port=int(redis_port),
                                         decode_responses=True)
示例#16
0
 def __init__(self, extractor_name: str, nlp=None):
     """
     Register as a built-in text extractor and keep the spacy pipeline.

     Args:
         extractor_name: name registered with the base Extractor
         nlp: spacy pipeline to use; defaults to en_core_web_sm
     """
     Extractor.__init__(self,
                        input_type=InputType.TEXT,
                        category="built_in_extractor",
                        name=extractor_name)
     # BUG FIX: the original default was `nlp=spacy.load('en_core_web_sm')`,
     # which loads the model at import/def time and shares one instance
     # across every call (mutable-default pitfall). Load lazily instead;
     # callers who pass their own pipeline see no change.
     self.__nlp = nlp if nlp is not None else spacy.load('en_core_web_sm')
示例#17
0
 def __init__(self, extractor_name: str) -> None:
     """Register as a text-input data extractor under the given name."""
     Extractor.__init__(self,
                        input_type=InputType.TEXT,
                        category="data extractor",
                        name=extractor_name)
示例#18
0
 def __init__(self, rule_set: InferlinkRuleSet):
     """
     Keep the inferlink rule set and register as an HTML extractor.

     Args:
         rule_set: InferlinkRuleSet stored on the instance
     """
     Extractor.__init__(self,
                        input_type=InputType.HTML,
                        category="HTML extractor",
                        name="Inferlink extractor")
     self.rule_set = rule_set
示例#19
0
 def __init__(self):
     """Register as the HTML metadata extractor."""
     Extractor.__init__(self,
                        input_type=InputType.HTML,
                        category="HTML extractor",
                        name="HTML metadata extractor")
     """
 def __init__(self):
     """Register as the HTML content extractor."""
     Extractor.__init__(self,
                        input_type=InputType.HTML,
                        category="HTML extractor",
                        name="HTML content extractor")
 def __init__(self, extractor_name: str, search_url: str):
     """
     Store the search endpoint and register as a built-in text extractor.

     Args:
         extractor_name: name registered with the base Extractor
         search_url: endpoint kept on the instance
     """
     Extractor.__init__(self,
                        input_type=InputType.TEXT,
                        category="built_in_extractor",
                        name=extractor_name)
     self.search_url = search_url
示例#22
0
 def __init__(self):
     """Register the language-identification text extractor."""
     Extractor.__init__(self,
                        input_type=InputType.TEXT,
                        category="Text extractor",
                        name="Language Identification")
示例#23
0
 def __init__(self) -> None:
     """Register as a content extractor under the class-level extractor_name."""
     Extractor.__init__(self,
                        input_type=InputType.TEXT,
                        category="content",
                        name=TableExtractor.extractor_name)
示例#24
0
 def __init__(self) -> None:
     """Register as an object-input data extractor with an empty glossary map."""
     Extractor.__init__(self,
                        input_type=InputType.OBJECT,
                        category="data",
                        name=EntityTableDataExtraction.extractor_name)
     self.glossaries = {}
示例#25
0
 def __init__(self) -> None:
     """Create the table-extraction backend and register as a content extractor."""
     Extractor.__init__(self,
                        input_type=InputType.TEXT,
                        category="content",
                        name="DigTableExtractor")
     self.tableExtractorInstance = TableExtraction()