def __init__(self, ignore_future_dates: bool = True, ignore_past_years: int = 40) -> None:
    self.ignore_future_dates = ignore_future_dates
    self.ignore_past_years = ignore_past_years
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="data extractor",
                       name="date parser")
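A minimal sketch of how these two settings could gate parsed dates; the helper name and the exact cutoff policy are assumptions for illustration, not the extractor's actual logic:

from datetime import date


def keep_date(parsed: date, ignore_future_dates: bool = True,
              ignore_past_years: int = 40) -> bool:
    # Hypothetical helper: reject future dates and dates older than
    # ignore_past_years years, mirroring what the constructor flags suggest.
    today = date.today()
    if ignore_future_dates and parsed > today:
        return False
    earliest = date(today.year - ignore_past_years, 1, 1)
    return parsed >= earliest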
def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
    """
    Initialize the extractor, storing the rule information and constructing the spacy rules.

    Args:
        nlp: a spacy language model
        rules (Dict): a dict containing a "rules" list and a "field_name"
        extractor_name (str): name of the extractor
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="spacy_rule_extractor",
                       name=extractor_name)
    self.rules = rules["rules"]
    self.nlp = copy.deepcopy(nlp)
    self.tokenizer = Tokenizer(self.nlp)
    self.matcher = Matcher(self.nlp.vocab)
    self.field_name = rules["field_name"]
    self.rule_lst = []
    for a_rule in self.rules:
        this_rule = Rule(a_rule, self.nlp)
        self.rule_lst.append(this_rule)
def __init__(self, name: str = None, custom_nlp: type = None) -> None:
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="Text extractor",
                       name=name if name else "Sentence extractor")
    load_parser = False
    if custom_nlp:
        try:
            custom_pipeline = copy.deepcopy(custom_nlp)
            pipe_names = custom_pipeline.pipe_names
            for pipe in pipe_names:
                if pipe != "parser":
                    custom_pipeline.remove_pipe(pipe)
            try:
                assert "parser" in custom_pipeline.pipe_names
                self._parser = custom_pipeline
            except AssertionError:
                print("Note: custom_pipeline does not have a parser. \n"
                      "Loading parser from en_core_web_sm... ")
                load_parser = True
        except AttributeError as e:
            print("Note: custom_pipeline does not have expected attributes.")
            print(e)
            print("Loading parser from en_core_web_sm...")
            load_parser = True
    else:
        load_parser = True
    if load_parser:
        self._parser = spacy.load("en_core_web_sm", disable=["tagger", "ner"])
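A hedged usage sketch for passing a custom pipeline; the class name SentenceExtractor is an assumption (only the constructor body is shown here), and en_core_web_sm must already be installed:

import spacy

# Assumption: the surrounding class is named SentenceExtractor, and
# en_core_web_sm has been downloaded (python -m spacy download en_core_web_sm).
custom_nlp = spacy.load("en_core_web_sm")
extractor = SentenceExtractor(name="my sentence extractor", custom_nlp=custom_nlp)
# Non-parser pipes (tagger, ner, ...) are stripped from a deep copy,
# so the caller's pipeline object is left untouched.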
def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
    """
    Initialize the extractor, storing the rule information and constructing the spacy rules.

    Args:
        nlp: a spacy language model
        rules (Dict): spacy rules
        extractor_name (str): name of the extractor
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="spacy_rule_extractor",
                       name=extractor_name)
    self.rules = rules["rules"]
    self.nlp = copy.deepcopy(nlp)
    self.tokenizer = Tokenizer(self.nlp)
    self.matcher = Matcher(self.nlp.vocab)
    self.field_name = rules["field_name"] if "field_name" in rules else extractor_name
    self.rule_lst = {}
    self.hash_map = {}
    for idx, a_rule in enumerate(self.rules):
        this_rule = Rule(a_rule, self.nlp)
        self.rule_lst[this_rule.identifier + "rule_id##" + str(idx)] = this_rule
def __init__(self, etk: ETK = None, extractor_name: str = 'excel extractor') -> None:
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="data extractor",
                       name=extractor_name)
    self.etk = etk
def __init__(self, extractor_name: str, search_url: str, get_attr=False,
             get_attr_url="http://dbpedia.org/sparql"):
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="built_in_extractor",
                       name=extractor_name)
    self._search_url = search_url
    self._get_attr = get_attr
    self._get_attr_url = get_attr_url
def invoke_extractor(self, extractor: Extractor, extractable: Extractable = None,
                     tokenizer: Tokenizer = None, joiner: str = " ",
                     **options) -> List[Extraction]:
    """
    Invoke the extractor on the given extractable, accumulating all the extractions in a list.

    Args:
        extractor (Extractor): the extractor to invoke
        extractable (Extractable): the object to extract from
        tokenizer (Tokenizer): a custom tokenizer, for extractors that want tokens
        joiner (str): the string used to join text segments, for extractors that want text
        options: additional keyword arguments passed to the extract() function of the extractor

    Returns:
        List[Extraction]: all the extractions produced.
    """
    if not extractable:
        extractable = self
    if not tokenizer:
        tokenizer = self.default_tokenizer
    extracted_results = list()

    if extractor.input_type == InputType.TOKENS:
        tokens = extractable.get_tokens(tokenizer)
        if tokens:
            extracted_results = extractor.extract(tokens, **options)
    elif extractor.input_type == InputType.TEXT:
        text = extractable.get_string(joiner)
        if text:
            extracted_results = extractor.extract(text, **options)
    elif extractor.input_type == InputType.OBJECT:
        extracted_results = extractor.extract(extractable.value, **options)
    elif extractor.input_type == InputType.HTML:
        extracted_results = extractor.extract(extractable.value, **options)

    # self.extraction_provenance_records = []
    for e in extracted_results:
        extraction_provenance_record: ExtractionProvenanceRecord = ExtractionProvenanceRecord(
            self.extraction_provenance_id_index, extractable.full_path,
            e.provenance["extractor_name"], e.provenance["start_char"],
            e.provenance["end_char"], e.provenance["confidence"],
            extractable.document, extractable.prov_id)
        # self.extraction_provenance_records.append(self.extraction_provenance_id_index)
        e.prov_id = self.extraction_provenance_id_index  # for provenance hierarchy tracking
        self.extraction_provenance_id_index = self.extraction_provenance_id_index + 1
        self.create_provenance(extraction_provenance_record)

    # TODO: the reason that extractors must return Extraction objects is so that
    # they can communicate back the provenance.
    return extracted_results
def __init__(self, glossary: List[str], extractor_name: str, tokenizer: Tokenizer,
             ngrams: int = 2, case_sensitive=False) -> None:
    Extractor.__init__(self,
                       input_type=InputType.TOKENS,
                       category="glossary",
                       name=extractor_name)
    self.ngrams = ngrams
    self.case_sensitive = case_sensitive
    self.default_tokenizer = tokenizer
    self.joiner = " "
    self.glossary = self.populate_trie(glossary)
def __init__(self, etk: ETK = None, extractor_name: str = 'date extractor') -> None:
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="data extractor",
                       name=extractor_name)
    # The 'final_regex' and 'symbol_list' are generated by 'DateRegexGenerator'.
    # If the single regexes are changed or more patterns are added,
    # please re-generate 'final_regex' and 'symbol_list' and paste here.
    d = DateRegexGenerator(singleton_regex, units)
    self._final_regex = d.final_regex
    self._symbol_list = d.symbol_list
    self._settings = {}
    self._last_original_resolution = None
    self._etk = etk
    self._lan = 'en'
def invoke_extractor(self, extractor: Extractor, extractable: Extractable = None,
                     tokenizer: Tokenizer = None, joiner: str = " ",
                     **options) -> List[Extraction]:
    """
    Invoke the extractor on the given extractable, accumulating all the extractions in a list.

    Args:
        extractor (Extractor): the extractor to invoke
        extractable (Extractable): the object to extract from
        tokenizer (Tokenizer): a custom tokenizer, for extractors that want tokens
        joiner (str): the string used to join text segments, for extractors that want text
        options: additional keyword arguments passed to the extract() function of the extractor

    Returns:
        List[Extraction]: all the extractions produced.
    """
    if not extractable:
        extractable = self
    if not tokenizer:
        tokenizer = self.default_tokenizer
    extracted_results = list()

    if extractor.input_type == InputType.TOKENS:
        tokens = extractable.get_tokens(tokenizer)
        if tokens:
            extracted_results = extractor.extract(tokens, **options)
    elif extractor.input_type == InputType.TEXT:
        text = extractable.get_string(joiner)
        if text:
            extracted_results = extractor.extract(text, **options)
    elif extractor.input_type == InputType.OBJECT:
        extracted_results = extractor.extract(extractable.value, **options)
    elif extractor.input_type == InputType.HTML:
        extracted_results = extractor.extract(extractable.value, **options)

    # TODO: the reason that extractors must return Extraction objects is so that
    # they can communicate back the provenance.
    return extracted_results
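The branch on extractor.input_type is the dispatch contract shared by both invoke_extractor variants. A condensed, self-contained sketch of that contract; InputType here is a local stand-in enum, not the library's class:

from enum import Enum, auto
from typing import Any, Callable, List


class InputType(Enum):
    # Stand-in for the library's InputType enum.
    TOKENS = auto()
    TEXT = auto()
    OBJECT = auto()
    HTML = auto()


def dispatch(input_type: InputType, tokens: List[str], text: str,
             value: Any, extract: Callable[[Any], list]) -> list:
    # Mirrors the branching above: tokens for TOKENS extractors, the joined
    # string for TEXT extractors, and the raw value for OBJECT/HTML extractors.
    if input_type == InputType.TOKENS:
        return extract(tokens) if tokens else []
    if input_type == InputType.TEXT:
        return extract(text) if text else []
    return extract(value)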
def __init__(self):
    e_name = 'cryptographic hash extractor'
    self._regex_extractors = [
        RegexExtractor(r"(\b[a-fA-F\d]{32}\b)", 'md5 ' + e_name, general_tag='md5'),
        RegexExtractor(r"(\b[0-9a-f]{40}\b)", 'sha1 ' + e_name, general_tag='sha1'),
        RegexExtractor(r"(\b[A-Fa-f0-9]{64}\b)", 'sha256 ' + e_name, general_tag='sha256'),
    ]
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="regex",
                       name=e_name)
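What the three patterns above match can be checked with the standard-library re module alone; the sample digests are illustrative:

import re

# The same character-class patterns as above: hex strings of length 32/40/64.
patterns = {
    'md5': re.compile(r"(\b[a-fA-F\d]{32}\b)"),
    'sha1': re.compile(r"(\b[0-9a-f]{40}\b)"),
    'sha256': re.compile(r"(\b[A-Fa-f0-9]{64}\b)"),
}

sample = "md5: " + "a" * 32 + " sha1: " + "b" * 40 + " sha256: " + "c" * 64
for tag, pattern in patterns.items():
    print(tag, pattern.findall(sample))  # each finds its one sample digest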
def __init__(self, nlp, tokenizer, extractor_name: str) -> None:
    """
    Initialize the extractor, storing the rule information and constructing the spacy rules.

    Args:
        nlp: a spacy language model
        tokenizer (Tokenizer): the tokenizer to use
        extractor_name (str): name of the extractor
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="build_in_extractor",
                       name=extractor_name)
    self._nlp = copy.deepcopy(nlp)
    self._like_email_matcher = Matcher(self._nlp.vocab)
    self._tokenizer = tokenizer
def __init__(self,
             decoding_dict: dict,
             extractor_name: str,
             default_action: str = 'delete',
             case_sensitive: bool = False,
             strip_key: bool = True,
             strip_value: bool = False) -> None:
    """
    Args:
        decoding_dict (dict): a python dictionary for decoding values
        extractor_name (str): extractor name
        default_action (enum['delete']): what to do when a value is not found in the dictionary
        case_sensitive (bool): match the key strictly, or ignore case
        strip_key (bool): strip keys before matching
        strip_value (bool): return the stripped value if matched, or the original value
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="dictionary",
                       name=extractor_name)
    if case_sensitive and not strip_key:
        self.decoding_dict = decoding_dict
    else:
        new_dict = {}
        if not strip_key:  # not case_sensitive: ignore case only
            for k in decoding_dict:
                new_dict[k.lower()] = decoding_dict[k]
        elif case_sensitive:  # strip key only
            for k in decoding_dict:
                new_dict[k.strip()] = decoding_dict[k]
        else:  # ignore case AND strip key
            for k in decoding_dict:
                new_dict[k.lower().strip()] = decoding_dict[k]
        self.decoding_dict = new_dict
    self.case_sensitive = case_sensitive
    self.default_action = default_action
    self.strip_key = strip_key
    self.strip_value = strip_value
    self.joiner = " "
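A self-contained sketch of the key normalization the constructor performs, plus a lookup that honors the 'delete' default action. The decode helper is an assumption about how extraction behaves, since extract() is not shown here:

from typing import Optional


def normalize_key(key: str, case_sensitive: bool, strip_key: bool) -> str:
    # Mirrors the constructor: lower-case unless case_sensitive, strip if strip_key.
    if not case_sensitive:
        key = key.lower()
    if strip_key:
        key = key.strip()
    return key


def decode(value: str, decoding_dict: dict, case_sensitive: bool = False,
           strip_key: bool = True) -> Optional[str]:
    # Hypothetical lookup: return the decoded value, or None ('delete') on a miss.
    return decoding_dict.get(normalize_key(value, case_sensitive, strip_key))


codes = {" NY ": "New York", "CA": "California"}
normalized = {normalize_key(k, False, True): v for k, v in codes.items()}
print(decode(" ny", normalized))  # -> New York
print(decode("TX", normalized))   # -> None (default_action='delete')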
def __init__(self, email_url: str, mailing_list_name: str, extractor_name: str) -> None:
    """
    Initialize the extractor, storing the mailing list and message information.

    Args:
        email_url (str): url of the email message
        mailing_list_name (str): name of the mailing list
        extractor_name (str): name of the extractor
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="build_in_extractor",
                       name=extractor_name)
    self.email_url = email_url
    self.mailing_list_name = mailing_list_name
def __init__(self, pattern: str, extractor_name: str = 'regex extractor',
             flags=0, general_tag: str = None) -> None:
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="regex",
                       name=extractor_name)
    self._compiled_regex = re.compile(pattern, flags)
    self._general_tag = general_tag
    self._match_functions = {
        MatchMode.MATCH: self._compiled_regex.match,
        MatchMode.SEARCH: self._compiled_regex.search,
        MatchMode.FINDALL: self._compiled_regex.finditer,
        MatchMode.SPLIT: self._compiled_regex.split
    }
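Note the design choice: MatchMode.FINDALL is bound to finditer rather than findall, presumably because match objects carry the character offsets that provenance records need. A stand-alone demonstration using only the standard library; MatchMode here is a local stand-in enum:

import re
from enum import Enum, auto


class MatchMode(Enum):
    # Local stand-in for the library's MatchMode enum.
    MATCH = auto()
    SEARCH = auto()
    FINDALL = auto()
    SPLIT = auto()


compiled = re.compile(r"\d+")
match_functions = {
    MatchMode.MATCH: compiled.match,
    MatchMode.SEARCH: compiled.search,
    MatchMode.FINDALL: compiled.finditer,  # finditer keeps start()/end() offsets
    MatchMode.SPLIT: compiled.split,
}

for m in match_functions[MatchMode.FINDALL]("a1 b22 c333"):
    print(m.group(), m.start(), m.end())  # value plus offsets for provenance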
def __init__(self, glossary: List[str], extractor_name: str, tokenizer: Tokenizer,
             ngrams: int = 2, case_sensitive=False) -> None:
    Extractor.__init__(self,
                       input_type=InputType.TOKENS,
                       category="glossary",
                       name=extractor_name)
    self._case_sensitive = case_sensitive
    self._default_tokenizer = tokenizer
    if not ngrams:
        ngrams = 0
        for word in glossary:
            ngrams = max(ngrams, len(self._default_tokenizer.tokenize(word)))
    self._ngrams = min(ngrams, 5)
    self._joiner = " "
    self._glossary = self._populate_trie(glossary)
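A self-contained illustration of the adaptive n-gram sizing above: when ngrams is falsy it becomes the token length of the longest glossary entry, capped at 5. The whitespace tokenizer is a stand-in for the library's Tokenizer:

glossary = ["New York", "New York City", "Los Angeles"]


def tokenize(text: str) -> list:
    # Stand-in tokenizer; the real extractor uses its Tokenizer instance.
    return text.split()


ngrams = 0
for word in glossary:
    ngrams = max(ngrams, len(tokenize(word)))
print(min(ngrams, 5))  # -> 3, the token length of "New York City"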
def __init__(self, extractor_name: str, tokenizer: Tokenizer = None, ngrams: int = 2,
             case_sensitive=False, redis_host="localhost", redis_port=6379,
             redis_key_prefix="") -> None:
    # If tokenizer is None, the extractor uses a regex to extract tokens,
    # which expedites the extraction.
    Extractor.__init__(self,
                       input_type=InputType.TOKENS,
                       category="glossary",
                       name=extractor_name)
    self._case_sensitive = case_sensitive
    self._default_tokenizer = tokenizer
    self._ngrams = min(ngrams, 5)
    self._joiner = " "
    self._redisconn = redis.StrictRedis(host=redis_host, port=int(redis_port),
                                        decode_responses=True)
    self._key_prefix = redis_key_prefix
def __init__(self) -> None:
    Extractor.__init__(self,
                       input_type=InputType.OBJECT,
                       category="data",
                       name=EntityTableDataExtraction.extractor_name)
    self.glossaries = dict()
def __init__(self) -> None:
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="content",
                       name=TableExtractor.extractor_name)
def __init__(self, extractor_name: str) -> None:
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="data extractor",
                       name=extractor_name)
def __init__(self, extractor_name: str, search_url: str):
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="built_in_extractor",
                       name=extractor_name)
    self.search_url = search_url
def __init__(self, rule_set: InferlinkRuleSet):
    Extractor.__init__(self,
                       input_type=InputType.HTML,
                       category="HTML extractor",
                       name="Inferlink extractor")
    self.rule_set = rule_set
def __init__(self): Extractor.__init__(self, input_type=InputType.HTML, category="HTML extractor", name="HTML metadata extractor") """
def __init__(self):
    Extractor.__init__(self,
                       input_type=InputType.HTML,
                       category="HTML extractor",
                       name="HTML content extractor")
def extract(self, extractor: Extractor, extractable: Extractable = None,
            tokenizer: Tokenizer = None, joiner: str = " ",
            **options) -> List[Extraction]:
    """
    Invoke the extractor on the given extractable, accumulating all the extractions in a list.

    Args:
        extractor (Extractor): the extractor to invoke
        extractable (Extractable): the object to extract from
        tokenizer (Tokenizer): a custom tokenizer, for extractors that want tokens
        joiner (str): the string used to join text segments, for extractors that want text
        options: additional keyword arguments passed to the extract() function of the extractor

    Returns:
        List[Extraction]: all the extractions produced.
    """
    if not extractable:
        extractable = self
    if not tokenizer:
        tokenizer = self.etk.default_tokenizer
    extracted_results = list()

    if extractor.input_type == InputType.TOKENS:
        if self.etk.error_policy == ErrorPolicy.PROCESS:
            if isinstance(extractable.value, list):
                self.etk.log(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got list, converting to string",
                    "warning", self.doc_id, self.url)
                warnings.warn(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got list, converting to string")
            elif isinstance(extractable.value, dict):
                self.etk.log(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got dict, converting to string",
                    "warning", self.doc_id, self.url)
                warnings.warn(
                    "Extractor needs tokens, tokenizer needs string to tokenize, got dict, converting to string")
            tokens = extractable.get_tokens(tokenizer)
            if tokens:
                extracted_results = extractor.extract(tokens, **options)
        else:
            raise ExtractorValueError(
                "Extractor needs string, tokenizer needs string to tokenize, got "
                + str(type(extractable.value)))
    elif extractor.input_type == InputType.TEXT:
        if self.etk.error_policy == ErrorPolicy.PROCESS:
            if isinstance(extractable.value, list):
                self.etk.log("Extractor needs string, got extractable value as list, converting to string",
                             "warning", self.doc_id, self.url)
                warnings.warn("Extractor needs string, got extractable value as list, converting to string")
            elif isinstance(extractable.value, dict):
                self.etk.log("Extractor needs string, got extractable value as dict, converting to string",
                             "warning", self.doc_id, self.url)
                warnings.warn("Extractor needs string, got extractable value as dict, converting to string")
            text = extractable.get_string(joiner)
            if text:
                extracted_results = extractor.extract(text, **options)
        else:
            # raise ExtractorValueError("Extractor needs string, got " + str(type(extractable.value)))
            # TODO: Yixiang - needs to be handled properly
            pass
    elif extractor.input_type == InputType.OBJECT:
        extracted_results = extractor.extract(extractable.value, **options)
    elif extractor.input_type == InputType.HTML:
        if bool(BeautifulSoup(extractable.value, "html.parser").find()):
            extracted_results = extractor.extract(extractable.value, **options)
        else:
            # raise ExtractorValueError("Extractor needs HTML, got non HTML string")
            # TODO: Yixiang - needs to be handled properly
            pass

    try:
        jsonPath = extractable.full_path
    except AttributeError:
        jsonPath = None

    for e in extracted_results:
        # For provenance hierarchy tracking: a parent's id for the next generation.
        e.prov_id = self.provenance_id_index
        extraction_provenance_record: ExtractionProvenanceRecord = ExtractionProvenanceRecord(
            e.prov_id, jsonPath, e.provenance["extractor_name"],
            e.provenance["start_char"], e.provenance["end_char"],
            e.provenance["confidence"], self, extractable.prov_id)
        self._provenances[e.prov_id] = extraction_provenance_record
        self.provenance_id_index_incrementer()
        self.create_provenance(extraction_provenance_record)
    return extracted_results
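The HTML branch above sniffs for markup with BeautifulSoup: find() with no arguments returns the first tag, or None for plain text. A stand-alone check (requires beautifulsoup4):

from bs4 import BeautifulSoup


def looks_like_html(value: str) -> bool:
    # True if the parser finds at least one tag, as in the HTML branch above.
    return bool(BeautifulSoup(value, "html.parser").find())


print(looks_like_html("<p>hello</p>"))  # -> True
print(looks_like_html("plain text"))    # -> False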
def __init__(self, extractor_name: str, nlp=None):
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="built_in_extractor",
                       name=extractor_name)
    # Load the default model lazily: a spacy.load(...) default argument would
    # run at import time and be shared across all instances.
    self.__nlp = nlp if nlp is not None else spacy.load('en_core_web_sm')
def __init__(self):
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="Text extractor",
                       name="Language Identification")
def __init__(self) -> None:
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="content",
                       name="DigTableExtractor")
    self.tableExtractorInstance = TableExtraction()