Example #1
    def extract(self, html_text: str, strategy: Strategy=Strategy.ALL_TEXT) \
            -> List[Extraction]:
        """
        Extracts text from an HTML page using a variety of strategies

        Args:
            html_text (): html page in string
            strategy (): one of Strategy.ALL_TEXT, Strategy.MAIN_CONTENT_STRICT and Strategy.MAIN_CONTENT_RELAXED

        Returns: a list of Extraction(s) of a str, typically a singleton list with the extracted text
        """

        if html_text:
            if strategy == Strategy.ALL_TEXT:
                soup = BeautifulSoup(html_text, 'html.parser')
                texts = soup.find_all(string=True)  # all text nodes, rendered or not
                visible_texts = filter(self._tag_visible, texts)
                all_text = " ".join(t.strip() for t in visible_texts)
                return [Extraction(all_text, self.name)]
            else:
                relax = strategy == Strategy.MAIN_CONTENT_RELAXED
                readable = Document(
                    html_text,
                    recallPriority=relax).summary(html_partial=False)
                clean_text = BeautifulSoup(readable.encode('utf-8'),
                                           'lxml').strings
                readability_text = ' '.join(clean_text)
                return [Extraction(readability_text, self.name)]
        else:
            return []
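The _tag_visible helper used above is not shown in this example. A minimal sketch of what such a visibility predicate typically looks like (the exact set of excluded containers is an assumption):

    from bs4.element import Comment

    def _tag_visible(element):
        # Drop text nodes inside containers that browsers do not render,
        # and HTML comments
        if element.parent.name in ('style', 'script', 'head', 'title', 'meta', '[document]'):
            return False
        if isinstance(element, Comment):
            return False
        return True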
Example #2
    def extract(self, text: str, method: str) -> List[Extraction]:
        """

        Args:
            text (str): any text, can contain HTML
            method (Enum[IdentificationTool.LANGID, IdentificationTool.LANGDETECT]): specifies which of the two
            algorithms to use

        Returns:
            List(Extraction): an extraction containing the language code used in the text. Returns the empty list of
            the extractor fails to identify the language in the text.

        """
        if method == IdentificationTool.LANGID.name:
            language = classify(text)[0]
            return [Extraction(value=language, extractor_name=self.name)]

        elif method == IdentificationTool.LANGDETECT.name:
            try:
                language = detect(text)
            except Exception:  # langdetect raises LangDetectException when it cannot decide
                language = 'unknown'

            if language == 'unknown':
                return list()
            else:
                return [Extraction(value=language, extractor_name=self.name)]

        else:
            return list()
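A quick standalone illustration of the two underlying libraries, assuming classify comes from langid and detect from langdetect, as the calls above suggest:

    import langid                      # langid.classify(text) -> (language_code, score)
    from langdetect import detect, DetectorFactory

    DetectorFactory.seed = 0           # langdetect is nondeterministic without a fixed seed

    langid.classify('Bonjour tout le monde')   # e.g. ('fr', -56.2); the extractor keeps [0]
    detect('Bonjour tout le monde')            # e.g. 'fr'; raises LangDetectException on failure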
Example #3
    def extract(self, text: str) -> List[Extraction]:

        """
        Extract with the input text
        Args:
            text: str

        Returns: List[Extraction]
        """

        result = []
        first_phase_doc = self.nlp(text)
        self.load_email_matcher()
        like_email_matches = self.like_email_matcher(first_phase_doc)

        like_emails_filtered = []
        for match_id, start, end in like_email_matches:
            span = first_phase_doc[start:end]
            if self.check_domain(self.tokenizer.tokenize(span.text)):
                # store (text, start_char, end_char); the end offset is the last
                # token's start plus its length
                like_emails_filtered.append((span.text, span[0].idx, span[-1].idx + len(span[-1])))

        non_space_emails = self.get_non_space_email(first_phase_doc)

        emails = set(like_emails_filtered).union(non_space_emails)

        for email in emails:
            result.append(Extraction(
                value=email[0],
                extractor_name=self.name,
                start_char=email[1],
                end_char=email[2])
            )

        return result
Example #4
    def extract(self, text: str) -> List[Extraction]:
        """

        Args:
            text (str): The input source to be processed

        Returns:
            List[Extraction]: The list of extractions returned by EmailExtractor

        """

        result = []
        first_phase_doc = self._nlp(text)
        self._load_email_matcher()
        like_email_matches = self._like_email_matcher(first_phase_doc)

        like_emails_filtered = []
        for match_id, start, end in like_email_matches:
            span = first_phase_doc[start:end]
            if self._check_domain(self._tokenizer.tokenize(span.text)):
                like_emails_filtered.append(
                    (span.text, span[0].idx, span[-1].idx + len(span[-1])))

        non_space_emails = self._get_non_space_email(first_phase_doc)

        emails = set(like_emails_filtered).union(non_space_emails)

        for email in emails:
            result.append(
                Extraction(value=email[0],
                           extractor_name=self.name,
                           start_char=email[1],
                           end_char=email[2]))

        return result
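Both email extractors above depend on a spaCy matcher that is loaded elsewhere. A self-contained sketch of the same idea using spaCy's built-in LIKE_EMAIL token flag (spaCy v3 Matcher API; the pattern and example text are illustrative, not the extractor's actual rules):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank('en')            # a bare tokenizer is enough for lexical matching
    matcher = Matcher(nlp.vocab)
    matcher.add('LIKE_EMAIL', [[{'LIKE_EMAIL': True}]])

    doc = nlp('Contact us at info@example.com for details.')
    for match_id, start, end in matcher(doc):
        span = doc[start:end]
        # same offset arithmetic as above: last token's start plus its length
        print(span.text, span[0].idx, span[-1].idx + len(span[-1]))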
Example #5
    def _combiner(self, results: dict) -> List[Extraction]:
        """Converts an annotation response into Extractions, deduplicating by
        URI and optionally attaching cached attributes for each URI."""
        return_result = list()
        if "Resources" in results:
            resources_results = self.deduplicate_by_uri(results["Resources"])

            for one_result in resources_results:
                types = one_result['@types'].split(',')
                values = {
                    'surface_form': one_result['@surfaceForm'],
                    'uri': one_result['@URI'],
                    'types': types,
                    'similarity_scores': float(one_result['@similarityScore'])
                }
                if self._get_attr:
                    if one_result['@URI'] not in self.seen_uris:
                        # attr = self._attr_finder(one_result['@URI'])
                        attr = self._en_label_finder(one_result['@URI'])

                        self.seen_uris[one_result['@URI']] = attr
                    values['attributes'] = self.seen_uris[one_result['@URI']]
                    values['surface_form'] = self.seen_uris[
                        one_result['@URI']]['rdf-schema#label']

                return_result.append(
                    Extraction(confidence=float(results['@confidence']),
                               extractor_name=self.name,
                               start_char=int(one_result['@offset']),
                               end_char=int(one_result['@offset']) +
                               len(one_result['@surfaceForm']),
                               value=values))

            return return_result
        return list()
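The field names (@URI, @surfaceForm, @types, @similarityScore, @offset, @confidence) match the DBpedia Spotlight annotation format. An illustrative payload of the shape _combiner expects (all values made up):

    results = {
        '@confidence': '0.5',
        'Resources': [{
            '@URI': 'http://dbpedia.org/resource/Python_(programming_language)',
            '@surfaceForm': 'Python',
            '@types': 'Schema:Language,DBpedia:ProgrammingLanguage',
            '@similarityScore': '0.9993',
            '@offset': '16',
        }],
    }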
Example #6
    def extract(self, text: str) -> List[Extraction]:
        """
        Extract from text

        Args:
            text: str

        Returns:
            List[Extraction]
        """

        doc = self._tokenizer.tokenize_to_spacy_doc(text)
        self._load_matcher()

        matches = [x for x in self._matcher(doc) if x[1] != x[2]]
        pos_filtered_matches = []
        neg_filtered_matches = []
        for idx, start, end in matches:
            span_doc = self._tokenizer.tokenize_to_spacy_doc(
                doc[start:end].text)
            this_spacy_rule = self._matcher.get(idx)
            relations = self._find_relation(span_doc, this_spacy_rule)
            rule_id, _ = self._hash_map[idx]
            this_rule = self._rule_lst[rule_id]
            if self._filter_match(doc[start:end], relations,
                                  this_rule.patterns):
                value = self._form_output(doc[start:end],
                                          this_rule.output_format, relations,
                                          this_rule.patterns)
                if this_rule.polarity:
                    pos_filtered_matches.append(
                        (start, end, value, rule_id, relations))
                else:
                    neg_filtered_matches.append(
                        (start, end, value, rule_id, relations))

        return_lst = []
        if pos_filtered_matches:
            longest_lst_pos = self._get_longest(pos_filtered_matches)
            if neg_filtered_matches:
                longest_lst_neg = self._get_longest(neg_filtered_matches)
                return_lst = self._reject_neg(longest_lst_pos, longest_lst_neg)
            else:
                return_lst = longest_lst_pos

        extractions = []
        for (start, end, value, rule_id, relation) in return_lst:
            this_extraction = Extraction(value=value,
                                         extractor_name=self.name,
                                         start_token=start,
                                         end_token=end,
                                         start_char=doc[start].idx,
                                         end_char=doc[end - 1].idx +
                                         len(doc[end - 1]),
                                         rule_id=rule_id.split("rule_id##")[0],
                                         match_mapping=relation)
            extractions.append(this_extraction)

        return extractions
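The character-offset arithmetic used here (and in the email extractors above) converts an exclusive token span [start, end) into character offsets. A standalone illustration:

    import spacy

    nlp = spacy.blank('en')
    doc = nlp('send mail to bob')
    start, end = 2, 4                            # token span covering "to bob"
    doc[start].idx                               # 10, the start_char
    doc[end - 1].idx + len(doc[end - 1])         # 16, the exclusive end_char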
Example #7
 def _wrap_split_extraction(self, items: List[str]) -> List[Extraction]:
     """Wraps each split item in an Extraction with running character offsets
     (assumes the items are adjacent, with no separators between them)."""
     res = list()
     start = 0
     for item in items:
         end = start + len(item)
         e = Extraction(value=item, extractor_name=self.name, start_char=start, end_char=end)
         res.append(e)
         start = end
     return res
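A usage sketch showing concrete offsets, and why they assume adjacent items:

    items = ['foo', 'bar']
    start = 0
    for item in items:
        end = start + len(item)
        print(item, start, end)        # foo 0 3, then bar 3 6
        start = end
    # had the items come from 'foo,bar'.split(','), 'bar' would really start
    # at 4, so a delimiter-aware version must advance start past each separator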
Example #8
 def _wrap_value_with_context(self, tokens: List[Token], start: int, end: int) -> Extraction:
     """Wraps the final result"""
     return Extraction(' '.join([x.orth_ for x in tokens[start:end]]),
                       self.name,
                       start_token=start,
                       end_token=end,
                       start_char=tokens[start].idx,
                       end_char=tokens[end - 1].idx + len(tokens[end - 1].orth_)
                       )
Example #9
 def wrap_value_with_context(self,
                             value: dict,
                             field_name: str,
                             start: int = 0,
                             end: int = 0) -> Extraction:
     """Wraps the final result"""
     return Extraction(value,
                       self.name,
                       start_token=start,
                       end_token=end,
                       tag=field_name)
Example #10
 def extract(self,
             text: str,
             get_attr=['PERSON', 'ORG', 'GPE']) -> List[Extraction]:
     """Runs spaCy NER and keeps entities whose label is in get_attr."""
     doc = self.__nlp(text)
     attr_list = list()
     for ent in doc.ents:
         if ent.label_ in get_attr:
             attr_list.append(
                 Extraction(extractor_name=self.name,
                            start_char=int(ent.start_char),
                            end_char=int(ent.end_char),
                            value=ent.text,
                            tag=ent.label_))
     return attr_list
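A standalone sketch of the spaCy NER loop this method wraps (the model name is an assumption; any pipeline with an NER component works):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp('Tim Cook leads Apple in Cupertino.')
    for ent in doc.ents:
        if ent.label_ in ('PERSON', 'ORG', 'GPE'):
            print(ent.text, ent.label_, ent.start_char, ent.end_char)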
Example #11
    def extract(self, text: str) -> List[Extraction]:
        """
        Extracts and structures email message from UTF8-encoded text
        Args:
            text: str

        Returns: Extraction
        """

        content = BeautifulSoup(text, 'html5lib')
        subject = content.find('h1').text.strip()
        recip = self.mailing_list_name

        navbar = content.find(id='navbar')
        if navbar is None:
            info = self.old_format(content)
        else:
            info = self.new_format(navbar, content)
        for i in info[0:3]:
            if i == 'None':
                print('missing a required field (sender, date, or body)')
        sender = info[0]
        date = info[1]
        body = info[2]
        nxt = info[3]
        rep_to = info[4]
        pub = 'SeeSat_Obs'
        dRec = datetime.datetime.now().isoformat()

        msg_obj = {
            'url': self.email_url,
            '@context': {
                '@vocab': 'schema.org'
            },
            'subject': subject,
            'recip': recip,
            'sender': sender
        }
        if date != 'None':
            msg_obj['dateReceived'] = date
        if body != 'None':
            msg_obj['body'] = body
        if nxt != 'None':
            msg_obj['nxt'] = nxt
        if rep_to != 'None':
            msg_obj['replyToMessage'] = rep_to
        return [Extraction(value=msg_obj, extractor_name=self.name)]
Example #12
    def _wrap_result(self, value: str, original_key: str) -> Optional[Extraction]:
        """
        Wraps a decoded dictionary value in an Extraction.

        Args:
            value: the decoded value
            original_key: the original string value that was decoded

        Returns: an Extraction if everything goes well

        """
        try:
            value = value.strip() if self._strip_value else value
            e = Extraction(value, self.name, start_char=0, end_char=len(str(value)))
            return e
        except Exception as e:
            print('failed to wrap dictionary extraction: ', original_key, value)
            raise ExtractorError('Exception: ' + str(e))
Example #13
    def extract(self,
                text: str,
                get_attr=['PERSON', 'ORG', 'GPE']) -> List[Extraction]:
        """
        Args:
            text (str): the text to extract from.
            get_attr (List[str]): The spaCy NER attributes we're interested in.

        Returns:
            List[Extraction]: the list of extractions, or the empty list if there are no matches.
        """
        doc = self.__nlp(text)
        attr_list = list()
        for ent in doc.ents:
            if ent.label_ in get_attr:
                attr_list.append(
                    Extraction(extractor_name=self.name,
                               start_char=int(ent.start_char),
                               end_char=int(ent.end_char),
                               value=ent.text,
                               tag=ent.label_))
        return attr_list
Example #14
    def _combiner(self, results: dict) -> List[Extraction]:
        """Converts an annotation response into Extractions, optionally
        attaching resolved attributes for each URI."""
        return_result = list()
        if "Resources" in results:
            resources_results = results["Resources"]
            for one_result in resources_results:
                types = one_result['@types'].split(',')
                values = {'surface_form': one_result['@surfaceForm'],
                          'uri': one_result['@URI'],
                          'types': types,
                          'similarity_scores': float(one_result['@similarityScore'])}
                if self._get_attr:
                    attr = self._attr_finder(one_result['@URI'])
                    values['attributes'] = attr
                return_result.append(Extraction(confidence=float(results['@confidence']),
                                                extractor_name=self.name,
                                                start_char=int(one_result['@offset']),
                                                end_char=int(one_result['@offset']) + len(
                                                    one_result['@surfaceForm']),
                                                value=values))

            return return_result
        return list()
Example #15
    def extract(self, html_text: str, threshold=0.5) -> List[Extraction]:
        """

        Args:
            html_text (str): str of the html page to be extracted
            threshold (float): if the ratio of rules that successfully extracted something over all rules \
                    is higher than or equal to the threshold, return the results, else return an empty list

        Returns:
            List[Extraction]: a list of Extractions, each extraction includes the extracted value, the rule name, the provenance etc.

        """

        result = list()
        try:
            for rule in self._rule_set.rules:
                rule.apply(html_text)
                value = rule.value
                if value is not None:
                    # note the addition of a new tag argument to Extraction
                    start_char = rule.start_char
                    end_char = rule.end_char
                    result.append(
                        Extraction(value,
                                   self.name,
                                   start_char=start_char,
                                   end_char=end_char,
                                   tag=rule.name))

            # Test whether the fraction of extractions meets the desired threshold
            if len(self._rule_set.rules) > 0 and float(len(result)) / len(
                    self._rule_set.rules) >= threshold:
                return result
            else:
                return list()
        except Exception as e:
            raise ExtractorError('Error in extracting landmark %s' % e)
Example #16
    def extract(self, text: str) -> List[Extraction]:
        """
        Splits text by sentences.

        Args:
            text (str): Input text to be extracted.

        Returns:
            List[Extraction]: the list of extractions, or the empty list if there are no matches.
        """

        doc = self._parser(text)

        extractions = list()
        for sent in doc.sents:
            this_extraction = Extraction(value=sent.text,
                                         extractor_name=self.name,
                                         start_token=sent.start,
                                         end_token=sent.end,
                                         start_char=sent.start_char,
                                         end_char=sent.end_char)
            extractions.append(this_extraction)

        return extractions
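This version reads the spans directly from spaCy's Span attributes: sent.start and sent.end are token indices (end exclusive), while sent.start_char and sent.end_char are character offsets. A quick check (the model name is an assumption; any pipeline that sets sentence boundaries works):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp('First sentence. Second one.')
    for sent in doc.sents:
        print(sent.start, sent.end, sent.start_char, sent.end_char)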
Example #17
    def _wrap_extraction(self, date_object: datetime.datetime,
                         original_text: str, start_char: int,
                         end_char: int) -> Optional[Extraction]:
        """
        Wraps the final result as an Extraction and returns it, or None on failure.

        """
        try:
            resolution = self._settings[MIN_RESOLUTION] \
                    if self._settings[DATE_VALUE_RESOLUTION] == DateResolution.ORIGINAL \
                    else self._settings[DATE_VALUE_RESOLUTION]
            e = Extraction(self._convert_to_iso_format(date_object,
                                                       resolution=resolution),
                           start_char=start_char,
                           end_char=end_char,
                           extractor_name=self._name,
                           date_object=date_object,
                           original_date=original_text)
            return e
        except Exception as e:
            warn('DateExtractor: Failed to wrap result ' + str(original_text) +
                 ' with Extraction class.\n'
                 'Caught ' + str(e))
            return None
Example #18
    def extract(self, text: str) -> List[Extraction]:
        """
        Splits text by sentences.

        Args:
            text (str):

        Returns:
            List[Extraction]
        """

        doc = self._parser(text)

        extractions = list()
        for sent in doc.sents:
            this_extraction = Extraction(value=sent.text,
                                         extractor_name=self.name,
                                         start_token=sent.start,
                                         end_token=sent.end,
                                         start_char=sent.start_char,
                                         end_char=sent.end_char)
            extractions.append(this_extraction)

        return extractions
Example #19
 def wrap_data(self, key: str, value) -> Extraction:
     """Wraps a key/value pair in an Extraction, using the key as the tag."""
     e = Extraction(value=value, extractor_name=self.name, tag=key)
     return e
Example #20
 def _wrap_extraction(self, group_idx: int, matches: object) -> Extraction:
     """Wraps one regex match group in an Extraction with its character span."""
     start, end = matches.start(group_idx), matches.end(group_idx)
     text = matches.group(group_idx)
     e = Extraction(value=text, extractor_name=self.name,
                    start_char=start, end_char=end, tag=self.general_tag)
     return e
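Here matches is a standard re.Match object, judging by the calls; group(), start() and end() give the text and character span of one capture group:

    import re

    m = re.search(r'(\d{4})-(\d{2})', 'date: 2023-07')
    m.group(1), m.start(1), m.end(1)       # ('2023', 6, 10)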
Example #21
    def extract(self, text: str) -> List[Extraction]:
        """
        Extracts and structures email message from UTF8-encoded text
        Args:
            text: str

        Returns: Extraction
        """

        content = BeautifulSoup(text, 'html5lib')
        subject = content.find('h1').text.strip()
        recip = self.mailing_list_name

        navbar = content.find(id='navbar')
        if navbar is None:
            info = self.old_format(content)
        else:
            info = self.new_format(navbar, content)
        sender = info[0]
        date = info[1]
        body = info[2]
        nxt = info[3]
        rep_to = info[4]

        msg_obj = {
            '@id': self.email_url,
            '@type': ['EmailMessage'],
            '@context': {
                '@vocab': 'schema.org'
            },
            'about': {
                '@id': subject,
                '@type': ['Thing'],
                '@context': {
                    '@vocab': 'schema.org'
                }
            },
            'recipient': {
                '@id': recip,
                '@type': ['Organization'],
                '@context': {
                    '@vocab': 'schema.org'
                }
            },
            'sender': {
                '@id': sender,
                '@type': ['Person'],
                '@context': {
                    '@vocab': 'schema.org'
                }
            }
        }
        if date != 'None':
            msg_obj['dateReceived'] = date
        if body != 'None':
            msg_obj['text'] = body
        if nxt != 'None':
            msg_obj['nextInThread'] = {
                '@id': nxt,
                '@type': ['URL'],
                '@context': {
                    '@vocab': 'schema.org'
                }
            }
        if rep_to != 'None':
            msg_obj['replyToMessage'] = {
                '@id': rep_to,
                '@type': ['URL'],
                '@context': {
                    '@vocab': 'schema.org'
                }
            }
        return [Extraction(value=msg_obj, extractor_name=self.name)]