def extract(self, html_text: str, strategy: Strategy = Strategy.ALL_TEXT) \
        -> List[Extraction]:
    """
    Extracts text from an HTML page using a variety of strategies

    Args:
        html_text (str): the HTML page as a string
        strategy (Strategy): one of Strategy.ALL_TEXT, Strategy.MAIN_CONTENT_STRICT and
            Strategy.MAIN_CONTENT_RELAXED

    Returns:
        a list of Extraction(s) of a str, typically a singleton list with the extracted text
    """
    if html_text:
        if strategy == Strategy.ALL_TEXT:
            soup = BeautifulSoup(html_text, 'html.parser')
            texts = soup.findAll(text=True)
            visible_texts = filter(self._tag_visible, texts)
            all_text = u" ".join(t.strip() for t in visible_texts)
            return [Extraction(all_text, self.name)]
        else:
            relax = strategy == Strategy.MAIN_CONTENT_RELAXED
            readable = Document(html_text, recallPriority=relax).summary(html_partial=False)
            clean_text = BeautifulSoup(readable.encode('utf-8'), 'lxml').strings
            readability_text = ' '.join(clean_text)
            return [Extraction(readability_text, self.name)]
    else:
        return []
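# A minimal, self-contained sketch of the ALL_TEXT strategy above: collect every
# text node with BeautifulSoup and keep only those a reader would see. The
# _tag_visible helper here is an assumption about what the extractor's own
# filter roughly does, not its actual implementation.
from bs4 import BeautifulSoup
from bs4.element import Comment


def visible_text(html_text: str) -> str:
    def _tag_visible(element) -> bool:
        # Drop text inside non-rendered tags and HTML comments.
        if element.parent.name in ('style', 'script', 'head', 'title', 'meta', '[document]'):
            return False
        if isinstance(element, Comment):
            return False
        return True

    soup = BeautifulSoup(html_text, 'html.parser')
    texts = soup.findAll(text=True)
    return ' '.join(t.strip() for t in filter(_tag_visible, texts))


# visible_text('<html><head><title>t</title></head><body><p>Hello</p></body></html>')
# keeps only 'Hello'; the title and other head content are filtered out.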
def extract(self, text: str, method: str) -> List[Extraction]:
    """
    Args:
        text (str): any text, can contain HTML
        method (Enum[IdentificationTool.LANGID, IdentificationTool.LANGDETECT]):
            specifies which of the two algorithms to use

    Returns:
        List(Extraction): an extraction containing the language code used in the text.
            Returns the empty list if the extractor fails to identify the language in the text.
    """
    if method == IdentificationTool.LANGID.name:
        language = classify(text)[0]
        return [Extraction(value=language, extractor_name=self.name)]

    elif method == IdentificationTool.LANGDETECT.name:
        try:
            language = detect(text)
        except Exception:
            language = 'unknown'

        if language == 'unknown':
            return list()
        else:
            return [Extraction(value=language, extractor_name=self.name)]

    else:
        return list()
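# A minimal sketch of the two identification back-ends referenced above. The
# classify and detect calls are from the public langid and langdetect packages;
# the wrapper function and its 'unknown' fallback mirror the method above but
# are illustrative only.
from langid import classify                              # classify(text) -> (lang_code, score)
from langdetect import detect                            # detect(text) -> lang_code
from langdetect.lang_detect_exception import LangDetectException


def identify_language(text: str, method: str = 'LANGID') -> str:
    if method == 'LANGID':
        return classify(text)[0]
    try:
        return detect(text)
    except LangDetectException:
        # langdetect raises when it cannot find usable features in the text.
        return 'unknown'


# identify_language('Dies ist ein kurzer deutscher Satz.', method='LANGDETECT') -> 'de'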
def extract(self, text: str) -> List[Extraction]:
    """
    Extracts email addresses from the input text

    Args:
        text: str

    Returns:
        List[Extraction]
    """
    result = []
    first_phase_doc = self.nlp(text)

    self.load_email_matcher()
    like_email_matches = self.like_email_matcher(first_phase_doc)
    like_emails_filtered = []
    for match_id, start, end in like_email_matches:
        span = first_phase_doc[start:end]
        if self.check_domain(self.tokenizer.tokenize(span.text)):
            like_emails_filtered.append((span.text, span[0].idx,
                                         span[-1].idx + len(span[-1])))

    non_space_emails = self.get_non_space_email(first_phase_doc)

    emails = set(like_emails_filtered).union(non_space_emails)
    for email in emails:
        result.append(Extraction(
            value=email[0],
            extractor_name=self.name,
            start_char=email[1],
            end_char=email[2])
        )

    return result
def extract(self, text: str) -> List[Extraction]:
    """
    Args:
        text (str): The input source to be processed

    Returns:
        List[Extraction]: The list of extractions returned by EmailExtractor
    """
    result = []
    first_phase_doc = self._nlp(text)

    self._load_email_matcher()
    like_email_matches = self._like_email_matcher(first_phase_doc)
    like_emails_filtered = []
    for match_id, start, end in like_email_matches:
        span = first_phase_doc[start:end]
        if self._check_domain(self._tokenizer.tokenize(span.text)):
            like_emails_filtered.append(
                (span.text, span[0].idx, span[-1].idx + len(span[-1])))

    non_space_emails = self._get_non_space_email(first_phase_doc)

    emails = set(like_emails_filtered).union(non_space_emails)
    for email in emails:
        result.append(
            Extraction(value=email[0], extractor_name=self.name,
                       start_char=email[1], end_char=email[2]))

    return result
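# A rough sketch of the kind of matcher the _load_email_matcher step presumably
# builds: spaCy's Matcher keyed on the built-in LIKE_EMAIL token attribute. The
# pattern, model name and match key are assumptions for illustration, not the
# extractor's own code.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')  # any model with a tokenizer is enough here
matcher = Matcher(nlp.vocab)
matcher.add('LIKE_EMAIL', [[{'LIKE_EMAIL': True}]])

doc = nlp('Contact us at info@example.org or by phone.')
for match_id, start, end in matcher(doc):
    span = doc[start:end]
    # Character offsets are recovered the same way as in the extract methods above.
    print(span.text, span[0].idx, span[-1].idx + len(span[-1]))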
def _combiner(self, results: dict) -> List[Extraction]:
    return_result = list()
    if "Resources" in results:
        resources_results = self.deduplicate_by_uri(results["Resources"])
        for one_result in resources_results:
            types = one_result['@types'].split(',')
            values = {
                'surface_form': one_result['@surfaceForm'],
                'uri': one_result['@URI'],
                'types': types,
                'similarity_scores': float(one_result['@similarityScore'])
            }
            if self._get_attr:
                if one_result['@URI'] not in self.seen_uris:
                    # attr = self._attr_finder(one_result['@URI'])
                    attr = self._en_label_finder(one_result['@URI'])
                    self.seen_uris[one_result['@URI']] = attr
                values['attributes'] = self.seen_uris[one_result['@URI']]
                values['surface_form'] = self.seen_uris[
                    one_result['@URI']]['rdf-schema#label']
            return_result.append(
                Extraction(confidence=float(results['@confidence']),
                           extractor_name=self.name,
                           start_char=int(one_result['@offset']),
                           end_char=int(one_result['@offset']) +
                           len(one_result['@surfaceForm']),
                           value=values))
        return return_result
    return list()
def extract(self, text: str) -> List[Extraction]:
    """
    Extract from text

    Args:
        text: str

    Returns:
        List[Extraction]
    """
    doc = self._tokenizer.tokenize_to_spacy_doc(text)
    self._load_matcher()

    matches = [x for x in self._matcher(doc) if x[1] != x[2]]
    pos_filtered_matches = []
    neg_filtered_matches = []
    for idx, start, end in matches:
        span_doc = self._tokenizer.tokenize_to_spacy_doc(doc[start:end].text)
        this_spacy_rule = self._matcher.get(idx)
        relations = self._find_relation(span_doc, this_spacy_rule)
        rule_id, _ = self._hash_map[idx]
        this_rule = self._rule_lst[rule_id]
        if self._filter_match(doc[start:end], relations, this_rule.patterns):
            value = self._form_output(doc[start:end], this_rule.output_format,
                                      relations, this_rule.patterns)
            if this_rule.polarity:
                pos_filtered_matches.append((start, end, value, rule_id, relations))
            else:
                neg_filtered_matches.append((start, end, value, rule_id, relations))

    return_lst = []
    if pos_filtered_matches:
        longest_lst_pos = self._get_longest(pos_filtered_matches)
        if neg_filtered_matches:
            longest_lst_neg = self._get_longest(neg_filtered_matches)
            return_lst = self._reject_neg(longest_lst_pos, longest_lst_neg)
        else:
            return_lst = longest_lst_pos

    extractions = []
    for (start, end, value, rule_id, relation) in return_lst:
        this_extraction = Extraction(value=value,
                                     extractor_name=self.name,
                                     start_token=start,
                                     end_token=end,
                                     start_char=doc[start].idx,
                                     end_char=doc[end - 1].idx + len(doc[end - 1]),
                                     rule_id=rule_id.split("rule_id##")[0],
                                     match_mapping=relation)
        extractions.append(this_extraction)

    return extractions
def _wrap_split_extraction(self, items: List[str]) -> List[Extraction]:
    res = list()
    start = 0
    for item in items:
        end = start + len(item)
        e = Extraction(value=item, extractor_name=self.name,
                       start_char=start, end_char=end)
        res.append(e)
        start = end
    return res
def _wrap_value_with_context(self, tokens: List[Token], start: int, end: int) -> Extraction:
    """Wraps the final result"""
    return Extraction(' '.join([x.orth_ for x in tokens[start:end]]),
                      self.name,
                      start_token=start,
                      end_token=end,
                      start_char=tokens[start].idx,
                      end_char=tokens[end - 1].idx + len(tokens[end - 1].orth_))
def wrap_value_with_context(self, value: dict, field_name: str,
                            start: int = 0, end: int = 0) -> Extraction:
    """Wraps the final result"""
    return Extraction(value, self.name,
                      start_token=start,
                      end_token=end,
                      tag=field_name)
def extract(self, text: str, get_attr=['PERSON', 'ORG', 'GPE']) -> List[Extraction]:
    doc = self.__nlp(text)
    attr_list = list()
    for ent in doc.ents:
        if ent.label_ in get_attr:
            attr_list.append(
                Extraction(extractor_name=self.name,
                           start_char=int(ent.start_char),
                           end_char=int(ent.end_char),
                           value=ent.text,
                           tag=ent.label_))
    return attr_list
def extract(self, text: str) -> List[Extraction]:
    """
    Extracts and structures an email message from UTF-8 encoded text

    Args:
        text: str

    Returns:
        Extraction
    """
    content = BeautifulSoup(text, 'html5lib')
    subject = content.find('h1').text.strip()
    recip = self.mailing_list_name
    navbar = content.find(id='navbar')
    if navbar is None:
        info = self.old_format(content)
    else:
        info = self.new_format(navbar, content)

    for i in info[0:3]:
        if i == 'None':
            print('missed something important')

    sender = info[0]
    date = info[1]
    body = info[2]
    nxt = info[3]
    rep_to = info[4]

    pub = 'SeeSat_Obs'
    dRec = datetime.datetime.now().isoformat()

    msg_obj = {
        'url': self.email_url,
        '@context': {
            '@vocab': 'schema.org'
        },
        'subject': subject,
        'recip': recip,
        'sender': sender
    }

    if date != 'None':
        msg_obj['dateReceived'] = date
    if body != 'None':
        msg_obj['body'] = body
    if nxt != 'None':
        msg_obj['nxt'] = nxt
    if rep_to != 'None':
        msg_obj['replyToMessage'] = rep_to

    return Extraction(value=msg_obj, extractor_name=self.name)
def _wrap_result(self, value: str, original_key: str) -> Extraction or None:
    """
    Args:
        value: the decoded value
        original_key: the original string value to be decoded

    Returns:
        an Extraction if everything goes well
    """
    try:
        value = value.strip() if self._strip_value else value
        e = Extraction(value, self.name, start_char=0, end_char=len(str(value)))
        return e
    except Exception as e:
        print('fail to wrap dictionary extraction: ', original_key, value)
        raise ExtractorError('Exception: ' + str(e))
def extract(self, text: str, get_attr=['PERSON', 'ORG', 'GPE']) -> List[Extraction]:
    """
    Args:
        text (str): the text to extract from.
        get_attr (List[str]): The spaCy NER attributes we're interested in.

    Returns:
        List(Extraction): the list of extractions, or the empty list if there are no matches.
    """
    doc = self.__nlp(text)
    attr_list = list()
    for ent in doc.ents:
        if ent.label_ in get_attr:
            attr_list.append(
                Extraction(extractor_name=self.name,
                           start_char=int(ent.start_char),
                           end_char=int(ent.end_char),
                           value=ent.text,
                           tag=ent.label_))
    return attr_list
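# A stand-alone sketch of the NER pass above, runnable without the Extraction
# wrapper. The model name is an assumption; any spaCy pipeline with an 'ner'
# component will do, and the labels below are the ones the default argument
# filters for.
import spacy

nlp = spacy.load('en_core_web_sm')  # requires the model to be downloaded
doc = nlp('Tim Cook visited Apple offices in Cupertino.')
for ent in doc.ents:
    if ent.label_ in ('PERSON', 'ORG', 'GPE'):
        print(ent.text, ent.label_, ent.start_char, ent.end_char)
# The English models typically tag 'Tim Cook' as PERSON, 'Apple' as ORG and
# 'Cupertino' as GPE.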
def _combiner(self, results: dict) -> List[Extraction]:
    return_result = list()
    if "Resources" in results:
        resources_results = results["Resources"]
        for one_result in resources_results:
            types = one_result['@types'].split(',')
            values = {'surface_form': one_result['@surfaceForm'],
                      'uri': one_result['@URI'],
                      'types': types,
                      'similarity_scores': float(one_result['@similarityScore'])}
            if self._get_attr:
                attr = self._attr_finder(one_result['@URI'])
                values['attributes'] = attr
            return_result.append(
                Extraction(confidence=float(results['@confidence']),
                           extractor_name=self.name,
                           start_char=int(one_result['@offset']),
                           end_char=int(one_result['@offset']) +
                           len(one_result['@surfaceForm']),
                           value=values))
        return return_result
    return list()
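# A rough sketch of the kind of response the _combiner methods consume. The
# endpoint URL and request parameters are assumptions about the public DBpedia
# Spotlight web service, not part of this extractor; only the '@...' field
# names are taken from the code above.
import requests

response = requests.get(
    'https://api.dbpedia-spotlight.org/en/annotate',
    params={'text': 'Berlin is the capital of Germany.', 'confidence': 0.5},
    headers={'Accept': 'application/json'})
results = response.json()
for one_result in results.get('Resources', []):
    print(one_result['@surfaceForm'], one_result['@URI'],
          one_result['@similarityScore'], one_result['@offset'])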
def extract(self, html_text: str, threshold=0.5) -> List[Extraction]:
    """
    Args:
        html_text (str): str of the html page to be extracted
        threshold (float): if the ratio of rules that successfully extracted something over all rules
            is higher than or equal to the threshold, return the results, else return an empty list

    Returns:
        List[Extraction]: a list of Extractions, each extraction includes the extracted value,
            the rule name, the provenance etc.
    """
    result = list()
    try:
        for rule in self._rule_set.rules:
            rule.apply(html_text)
            value = rule.value
            if value is not None:
                # note the addition of a new tag argument to Extraction
                start_char = rule.start_char
                end_char = rule.end_char
                result.append(
                    Extraction(value, self.name, start_char=start_char,
                               end_char=end_char, tag=rule.name))

        # Test whether the fraction of extractions meets the desired threshold
        if len(self._rule_set.rules) > 0 and \
                float(len(result)) / len(self._rule_set.rules) >= threshold:
            return result
        else:
            return list()
    except Exception as e:
        raise ExtractorError('Error in extracting landmark %s' % e)
def extract(self, text: str) -> List[Extraction]:
    """
    Splits text by sentences.

    Args:
        text (str): Input text to be extracted.

    Returns:
        List[Extraction]: the list of extractions, or the empty list if there are no matches.
    """
    doc = self._parser(text)
    extractions = list()
    for sent in doc.sents:
        # Span.start/end are token offsets; start_char/end_char are character offsets.
        this_extraction = Extraction(value=sent.text,
                                     extractor_name=self.name,
                                     start_token=sent.start,
                                     end_token=sent.end,
                                     start_char=sent.start_char,
                                     end_char=sent.end_char)
        extractions.append(this_extraction)
    return extractions
def _wrap_extraction(self, date_object: datetime.datetime,
                     original_text: str,
                     start_char: int,
                     end_char: int) -> Extraction or None:
    """
    Wrap the final result as an Extraction and return it
    """
    try:
        resolution = self._settings[MIN_RESOLUTION] \
            if self._settings[DATE_VALUE_RESOLUTION] == DateResolution.ORIGINAL \
            else self._settings[DATE_VALUE_RESOLUTION]
        e = Extraction(self._convert_to_iso_format(date_object, resolution=resolution),
                       start_char=start_char,
                       end_char=end_char,
                       extractor_name=self._name,
                       date_object=date_object,
                       original_date=original_text)
        return e
    except Exception as e:
        warn('DateExtractor: Failed to wrap result ' + str(original_text) +
             ' with Extraction class.\nCatch ' + str(e))
        return None
def extract(self, text: str) -> List[Extraction]:
    """
    Splits text by sentences.

    Args:
        text (str): Input text to be extracted.

    Returns:
        List[Extraction]
    """
    doc = self._parser(text)
    extractions = list()
    for sent in doc.sents:
        # Span.start/end are token offsets; start_char/end_char are character offsets.
        this_extraction = Extraction(value=sent.text,
                                     extractor_name=self.name,
                                     start_token=sent.start,
                                     end_token=sent.end,
                                     start_char=sent.start_char,
                                     end_char=sent.end_char)
        extractions.append(this_extraction)
    return extractions
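# A stand-alone sketch of the sentence split above using spaCy's doc.sents.
# Span.start / Span.end are token offsets and Span.start_char / Span.end_char
# are character offsets, which is what the Extraction fields are meant to hold.
# The model name is an assumption; any pipeline that sets sentence boundaries works.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('First sentence here. Second sentence follows.')
for sent in doc.sents:
    print(repr(sent.text), sent.start, sent.end, sent.start_char, sent.end_char)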
def wrap_data(self, key: str, value) -> Extraction:
    e = Extraction(value=value, extractor_name=self.name, tag=key)
    return e
def _wrap_extraction(self, group_idx: int, matches: object) -> Extraction:
    start, end = matches.start(group_idx), matches.end(group_idx)
    text = matches.group(group_idx)
    e = Extraction(value=text, extractor_name=self.name,
                   start_char=start, end_char=end, tag=self.general_tag)
    return e
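# A minimal sketch of where the group-level offsets used above come from in the
# standard re module: Match.start(g), Match.end(g) and Match.group(g) for a
# 1-based capture group index. The pattern and text are illustrative only.
import re

pattern = re.compile(r'(\d{3})-(\d{4})')
for matches in pattern.finditer('call 555-1234 or 555-9876'):
    group_idx = 1
    start, end = matches.start(group_idx), matches.end(group_idx)
    print(matches.group(group_idx), start, end)  # '555' plus its character offsets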
def extract(self, text: str) -> List[Extraction]:
    """
    Extracts and structures an email message from UTF-8 encoded text

    Args:
        text: str

    Returns:
        Extraction
    """
    content = BeautifulSoup(text, 'html5lib')
    subject = content.find('h1').text.strip()
    recip = self.mailing_list_name
    navbar = content.find(id='navbar')
    if navbar is None:
        info = self.old_format(content)
    else:
        info = self.new_format(navbar, content)

    sender = info[0]
    date = info[1]
    body = info[2]
    nxt = info[3]
    rep_to = info[4]

    msg_obj = {
        '@id': self.email_url,
        '@type': ['EmailMessage'],
        '@context': {
            '@vocab': 'schema.org'
        },
        'about': {
            '@id': subject,
            '@type': ['Thing'],
            '@context': {
                '@vocab': 'schema.org'
            }
        },
        'recipient': {
            '@id': recip,
            '@type': ['Organization'],
            '@context': {
                '@vocab': 'schema.org'
            }
        },
        'sender': {
            '@id': sender,
            '@type': ['Person'],
            '@context': {
                '@vocab': 'schema.org'
            }
        }
    }

    if date != 'None':
        msg_obj['dateReceived'] = date
    if body != 'None':
        msg_obj['text'] = body
    if nxt != 'None':
        msg_obj['nextInThread'] = {
            '@id': nxt,
            '@type': ['URL'],
            '@context': {
                '@vocab': 'schema.org'
            }
        }
    if rep_to != 'None':
        msg_obj['replyToMessage'] = {
            '@id': rep_to,
            '@type': ['URL'],
            '@context': {
                '@vocab': 'schema.org'
            }
        }

    return Extraction(value=msg_obj, extractor_name=self.name)