def detect_category_names_to_spans(self, text: str, field: str = None) \
            -> Dict[str, List[Tuple[int, int, str]]]:
        if self.sklearn_model is None:
            return {}

        sentence_spans = get_sentence_span_list(text)

        res = {}

        for span in sentence_spans:
            sentence = text[span[0]:span[1]]
            predicted = self.sklearn_model.predict([sentence]).toarray()[0]

            for target_index, value in enumerate(predicted):
                if not value:
                    continue
                target_name = self.target_names[target_index]
                if target_name == SkLearnClassifierModel.EMPTY_CAT_NAME:
                    continue

                if (not field and target_name) or (field
                                                   and field == target_name):
                    res.setdefault(target_name, []).append(
                        (span[0], span[1], sentence))
        return res
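
Every example on this page leans on the same contract: get_sentence_span_list appears to return one (start, end, sentence_text) tuple per sentence, so slicing the source text with the offsets reproduces the sentence. A minimal sketch of that contract, assuming lexnlp's usual import path (adjust it to your installation):

from lexnlp.nlp.en.segments.sentences import get_sentence_span_list

text = "First sentence. Second sentence."
for start, end, sentence in get_sentence_span_list(text):
    # Each span carries absolute offsets into the source text,
    # so the slice must round-trip to the sentence itself.
    assert text[start:end] == sentence
    print(start, end, repr(sentence))
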
Example #2
    def build_dataset_on_document(document_class_name: str,
                                  document_id,
                                  retrain_model: bool = False,
                                  task_id=None):
        field_configs = DOCUMENT_FIELDS.get(document_class_name)
        if not field_configs:
            return

        document_class = BuildFieldDetectorDataset._get_doc_class(document_class_name)

        doc = document_class.objects.get(pk=document_id)

        classifier_model, _created = ClassifierModel.objects.get_or_create(
            kind=ClassifierModel.KIND_SENTENCES_RELATED_TO_FIELDS,
            document_class=document_class_name,
            document_field=None)

        # QuerySet.delete() returns (total rows deleted, per-model counts)
        deleted, _deleted_per_model = ClassifierDataSetEntry.objects.filter(
            field_detection_model=classifier_model,
            document=doc).delete()
        if deleted > 0:
            log('Deleted {0} data set entries of document {1}'.format(deleted, doc.pk),
                task=task_id)

        def add(code, sentence):
            ClassifierDataSetEntry.objects.create(field_detection_model=classifier_model,
                                                  document=doc,
                                                  category=code,
                                                  text=sentence)

        log('Extracting training data from document: {0}'.format(doc.pk), task=task_id)
        text = doc.full_text
        annotations = list(DocumentAnnotation.objects.filter(document__pk=doc.pk))
        sentence_spans = get_sentence_span_list(text)
        for span in sentence_spans:
            sentence = text[span[0]:span[1]]
            annotated_fields = set()
            added = False
            for a in annotations:
                # closed-interval overlap between the annotation and the sentence span
                if a.document_field \
                        and a.start_offset <= span[1] and span[0] <= a.end_offset:
                    field_code = a.document_field.pk
                    add(field_code, sentence)
                    annotated_fields.add(field_code)
                    added = True

            for field_config in field_configs.values():
                if field_config.field_code not in annotated_fields \
                        and field_config.sentence_matches_field_detectors(sentence):
                    add(field_config.field_code, sentence)
                    added = True
            if not added:
                add('', sentence)
        log('Processed {0} sentences of document {1}'.format(len(sentence_spans), doc.pk),
            task=task_id)

        if retrain_model:
            TrainFieldDetectorModel.train_model_for_document_class.apply_async(
                args=(document_class_name,))
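
The annotation check above is the standard closed-interval overlap test: two ranges intersect exactly when each starts at or before the other's end. A standalone sketch (the helper name is illustrative, not from the source):

def spans_overlap(a_start: int, a_end: int, b_start: int, b_end: int) -> bool:
    # Closed-interval intersection: each range starts before the other ends.
    return a_start <= b_end and b_start <= a_end

assert spans_overlap(0, 10, 5, 15)       # partial overlap
assert spans_overlap(0, 10, 2, 4)        # containment
assert not spans_overlap(0, 10, 11, 20)  # disjoint
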
Example #3
    def _create_annotator_model(self,
                                documents_gen: Generator[Document, Any, None],
                                annotations_by_doc: Callable[
                                    [Document], Generator[Tuple[int, int], Any, None]] = None):
        positive = []
        negative = []
        for doc in documents_gen:
            text = doc.full_text
            # the callback defaults to None, so guard before calling it
            annotations = list(annotations_by_doc(doc)) if annotations_by_doc else []
            sentence_spans = get_sentence_span_list(text)
            for span in sentence_spans:
                sentence = text[span[0]:span[1]]
                if self._sentence_matches_field_detectors(sentence) \
                        or self._sentence_matches_annotations(span, annotations):
                    positive.append(sentence)
                else:
                    negative.append(sentence)
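
The snippet ends after collecting the two sentence lists. One plausible continuation, not taken from the source, is to fit a binary scikit-learn pipeline on them (the pipeline choice here is an assumption):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Hypothetical follow-up: label 1 = sentence matched a field detector or annotation.
sentences = positive + negative
labels = [1] * len(positive) + [0] * len(negative)
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000)),
])
model.fit(sentences, labels)
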
Example #4
    def get_copyright_annotations(cls, text: str, return_sources=False) \
            -> Generator[CopyrightAnnotation, None, None]:
        """
        Find copyright in text.
        :param text:
        :param return_sources:
        :return:
        """
        # Iterate through sentences
        if not cls.copyright_ptn_re.search(text):
            return

        for sent_start, _, sentence in get_sentence_span_list(text):
            tagged_phrases = cls.extract_phrases_with_coords(sentence)

            for phrase, phrase_start in tagged_phrases:
                for match in cls.copyright_ptn_re.finditer(phrase):
                    cp_text, cp_sign, cp_date, cp_name = match.groups()

                    # TODO: catch in the general regex
                    if not cp_date:
                        cp_date_at_end = cls.year_ptn_re.search(cp_name)
                        if cp_date_at_end:
                            cp_date = cp_date_at_end.group()
                            cp_name = re.sub(r'{}$'.format(re.escape(cp_date)),
                                             '', cp_name)

                    start, end = match.span()
                    # shift both ends from phrase-relative to document-absolute
                    start += phrase_start + sent_start
                    end += phrase_start + sent_start
                    ant = CopyrightAnnotation(
                        coords=(start, end),
                        sign=cp_sign.strip(),
                        date=cp_date,
                        name=cp_name.strip(string.punctuation +
                                           string.whitespace))

                    if return_sources:
                        ant.text = cp_text.strip()
                    cls.split_copyright_date(ant)
                    cls.derive_company_name(ant, phrase)
                    yield ant
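
The coordinate arithmetic above stacks three frames: match offsets are relative to the phrase, the phrase start is relative to the sentence, and the sentence start is relative to the document, so both ends of the span need phrase_start + sent_start added. A toy illustration (the sample text is made up):

text = "Intro. Copyright (c) 2020 Acme Corp. Outro."
sent_start = text.find("Copyright")      # sentence offset within the document
phrase = "Copyright (c) 2020 Acme Corp"
phrase_start = 0                         # phrase offset within the sentence
match_start, match_end = 0, len(phrase)  # match offsets within the phrase

# Document-absolute coordinates: add both parent offsets to both ends.
start = match_start + phrase_start + sent_start
end = match_end + phrase_start + sent_start
assert text[start:end] == phrase
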
Example #5
    def test_title_start_end(self):
        text = self.get_text(
            'lexnlp/nlp/en/tests/test_sections/skewed_document.txt')
        sentence_spans = get_sentence_span_list(text)
        sections = list(
            get_section_spans(text,
                              use_ml=False,
                              return_text=False,
                              skip_empty_headers=True))
        self.assertGreater(len(sections), 3)
        # test title coordinates before enhancing titles ...
        for sect in sections:
            title = text[sect.title_start:sect.title_end]
            self.assertEqual(sect.title, title)

        # ... and after enhancing
        find_section_titles(sections, sentence_spans, text)
        for sect in sections:
            title = text[sect.title_start:sect.title_end]
            self.assertEqual(sect.title, title)
Example #6
    def annotate(self, text: str, field: str = None) -> Dict[str, List[Tuple[int, int, str]]]:
        if self.sklearn_model is None:
            return {}

        sentence_spans = get_sentence_span_list(text)

        res = {}

        for span in sentence_spans:
            sentence = text[span[0]:span[1]]
            target_index = self.sklearn_model.predict([sentence])[0]
            target_name = self.target_names[target_index]

            if (not field and target_name) or (field and field == target_name):
                res.setdefault(target_name, []).append((span[0], span[1], sentence))
        return res
Example #7
    def detect_category_names_to_spans(self, text: str, field: str = None) \
            -> Dict[str, List[Tuple[int, int, str]]]:
        if self.sklearn_model is None:
            return {}

        sentence_spans = get_sentence_span_list(text)

        res = {}

        for span in sentence_spans:
            sentence = text[span[0]:span[1]]
            category_names = self.detect_category_names_for_sentence(sentence)

            for target_name in category_names:
                if (not field and target_name) or (field and field == target_name):
                    res.setdefault(target_name, []).append((span[0], span[1], sentence))

        return res
Example #8
def get_company_annotations(
    text: str,
    strict: bool = False,
    use_gnp: bool = False,
    count_unique: bool = False,
    name_upper: bool = False,
) -> Generator[CompanyAnnotation, None, None]:
    """
    Find company names in text, optionally using the stricter article/prefix expression.
    :param parse_name_abbr:
    :param text:
    :param strict:
    :param use_gnp: use get_noun_phrases or NPExtractor
    :param name_upper: return company name in upper case.
    :param count_unique: return only unique companies - case insensitive.
    :return:
    """
    # skip if all text is in uppercase
    if text == text.upper():
        return
    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]

    unique_companies = {}  # type: Dict[Tuple[str, str], CompanyAnnotation]

    if COMPANY_TYPES_RE.search(text):
        # Iterate through sentences
        for s_start, s_end, sentence in get_sentence_span_list(text):
            # skip if whole phrase is in uppercase
            if sentence == sentence.upper():
                continue
            if use_gnp:
                phrases = list(
                    get_noun_phrases(sentence,
                                     strict=strict,
                                     valid_punctuation=valid_punctuation))
            else:
                phrases = list(np_extractor.get_np(sentence))
            phrase_spans = PhrasePositionFinder.find_phrase_in_source_text(
                sentence, phrases)

            for phrase, p_start, p_end in phrase_spans:
                if COMPANY_TYPES_RE.search(phrase):
                    # noinspection PyTypeChecker
                    for ant in nltk_re.get_companies(
                            phrase, use_sentence_splitter=False
                    ):  # type: CompanyAnnotation

                        if ant.name == ant.company_type or ant.name == ant.description:
                            continue
                        ant.coords = (ant.coords[0] + s_start + p_start,
                                      ant.coords[1] + s_start + p_start)

                        if name_upper:
                            ant.name = ant.name.upper()

                        if count_unique:
                            unique_key = (ant.name.lower() if ant.name else
                                          None, ant.company_type_abbr)
                            existing_result = unique_companies.get(unique_key)

                            if existing_result:
                                existing_result.counter += 1
                            else:
                                unique_companies[unique_key] = ant
                        else:
                            yield ant

        if count_unique:
            for company in unique_companies.values():
                yield company
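
A usage sketch for the extractor above; the sample text is illustrative, and the printed attributes are the ones the function itself sets:

text = "The agreement was signed by Acme Holdings LLC and later assigned to Beta Industries Inc."
for ant in get_company_annotations(text, count_unique=True):
    # coords are absolute offsets into `text`; counter tracks duplicates
    # of the same (lower-cased name, type abbreviation) pair.
    print(ant.name, ant.company_type_abbr, ant.coords, ant.counter)
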
Example #9
def get_companies(text: str,
                  use_article: bool = False,
                  use_sentence_splitter: bool = True) -> Generator[CompanyAnnotation, None, None]:
    """
    Find company names in text, optionally using the stricter article/prefix expression.
    """
    # Select regex
    re_c = RE_ARTICLE_COMPANY if use_article else RE_COMPANY

    # Iterate through sentences
    sent_list = get_sentence_span_list(text) if use_sentence_splitter else [(0, len(text), text)]
    for start, _, sentence in sent_list:
        if check_backtrack_catastrophy(sentence):
            continue

        for match in re_c.finditer(sentence):
            captures = match.capturesdict()
            company_type = captures["company_type_of"] or \
                           captures["company_type"] or \
                           captures["company_type_single"]
            company_type = "".join(company_type).strip(
                string.punctuation.replace(".", "") + string.whitespace)
            company_type = company_type or None

            company_name = "".join(captures["full_name"])
            if company_type:
                company_name = re.sub(r'%s$' % re.escape(company_type), '', company_name)
            company_name = FALSE_POS_SUB_RE.sub('', company_name)
            company_name = company_name.strip(
                string.punctuation.replace('&', '').replace(')', '') + string.whitespace)
            # pass IGNORECASE via flags=; positionally it would be read as count
            company_name = re.sub(r'^\s*(?:and|&|of)\s+|\s+(?:and|&|of)\s*$', '',
                                  company_name, flags=re.IGNORECASE)
            if not company_name:
                continue

            # catch a Delaware company
            if company_name.lower().startswith('a ') or captures.get('article') == ['a']:
                continue

            company_description = captures.get("company_description", '')
            if not company_description:
                company_description_match = DEFAULT_COMPANY_DESC_RE.findall(company_name)
                if company_description_match:
                    company_description = company_description_match[0]

            company_description = "".join(company_description).strip(
                string.punctuation + string.whitespace)

            # catch ABC & Company LLC case
            if company_description.lower() == 'company' and \
                    ('& company' in company_name.lower() or 'and company' in company_name.lower()):
                company_description = None
            company_description = company_description or None

            # catch "The Company"
            if company_description:
                _company_name = re.sub(r'[\s,]%s$' % re.escape(company_description), '', company_name)
                if not _company_name or \
                        ARTICLE_RE.fullmatch(_company_name) or \
                        re.match(r'.+?\s(?:of|in)$', _company_name.lower()):
                    continue
            if company_name in COMPANY_DESCRIPTIONS:
                continue

            abbr_name = "".join(captures["abbr_name"]) or None

            ret = CompanyAnnotation(
                (match.start() + start, match.end() + start),
                name=company_name, company_type_full=company_type)
            ret.company_type_abbr = COMPANY_TYPES[company_type.lower()]['abbr'] if company_type else None
            ret.company_type_label = COMPANY_TYPES[company_type.lower()]['label'] if company_type else None
            ret.description = company_description
            ret.name_abbr = abbr_name
            ret.text = sentence
            # no args:         = [company_name, company_type, company_description]
            # detail_type:     + [company_type_abbr, company_type_label]
            # parse_name_abbr: + [abbr_name]
            # return_source:   + [source]
            yield ret
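
A quick usage sketch for the regex-based variant (the sample text is made up for illustration):

text = "This Agreement is made by Acme Widgets Inc., a Delaware corporation."
for ant in get_companies(text):
    # coords are document-absolute: match offsets shifted by the sentence start
    print(ant.name, ant.company_type, ant.description, ant.coords)
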
Example #10
def extract_text_and_structure(pdf_fn: str,
                               pdf_password: str = None,
                               timeout_sec: int = 3600,
                               language: str = "",
                               correct_pdf: bool = False,
                               render_coords_debug: bool = False) \
        -> Generator[Tuple[str, TextAndPDFCoordinates, str, Optional[List[float]]],
                     None, None]:
    # yields exactly one (text, structure, corrected_pdf_fn, page_rotate_angles) tuple

    if render_coords_debug:
        correct_pdf = True

    java_modules_path = get_settings().java_modules_path

    # Convert language to language code
    lang_converter = LanguageConverter()
    language, locale_code = lang_converter.get_language_and_locale_code(
        language)

    temp_dir = mkdtemp(prefix='pdf_text_')
    out_fn = os.path.join(
        temp_dir,
        os.path.splitext(os.path.basename(pdf_fn))[0] + '.msgpack')
    out_pdf_fn = pdf_fn
    try:
        args = [
            'java', '-cp', f'{java_modules_path}/*',
            'com.lexpredict.textextraction.GetTextFromPDF', pdf_fn, out_fn,
            '-f', 'pages_msgpack'
        ]

        if pdf_password:
            args.append('-p')
            args.append(pdf_password)

        if correct_pdf:
            out_pdf_fn = os.path.join(
                temp_dir,
                os.path.splitext(os.path.basename(pdf_fn))[0] + '_corr.pdf')
            args.append('-corrected_output')
            args.append(out_pdf_fn)

            if render_coords_debug:
                args.append('-render_char_rects')

        completed_process: CompletedProcess = subprocess.run(
            args,
            check=False,
            timeout=timeout_sec,
            universal_newlines=True,
            stderr=PIPE,
            stdout=PIPE)
        raise_from_process(
            log,
            completed_process,
            process_title=lambda: f'Extract text and structure from {pdf_fn}')

        raise_from_pdfbox_error_messages(completed_process)

        with open(out_fn, 'rb') as pages_f:
            # see object structure in com.lexpredict.textextraction.dto.PDFPlainText
            pdfbox_res: Dict[str, Any] = msgpack.unpack(pages_f, raw=False)

        # Remove Null characters because of incompatibility with PostgreSQL
        text = pdfbox_res['text'].replace("\x00", "")
        if len(text) == 0:
            pdf_coordinates = PDFCoordinates(
                char_bboxes=pdfbox_res['charBBoxes'])
            text_struct = PlainTextStructure(
                title='',
                language=language
                or 'en',  # FastText returns English for empty strings
                pages=[],
                sentences=[],
                paragraphs=[],
                sections=[])
            yield text, \
                  TextAndPDFCoordinates(text_structure=text_struct, pdf_coordinates=pdf_coordinates), \
                  out_pdf_fn, \
                  None

            return

        page_rotate_angles: List[float] = [
            pdfpage['deskewAngle'] for pdfpage in pdfbox_res['pages']
        ]

        pages = [
            PlainTextPage(number=num,
                          start=p['location'][0],
                          end=p['location'][1],
                          bbox=p['bbox'])
            for num, p in enumerate(pdfbox_res['pages'])
        ]

        sentence_spans = get_sentence_span_list(text)

        lang = get_lang_detector()

        sentences = [
            PlainTextSentence(start=start,
                              end=end,
                              language=language or lang.predict_lang(segment))
            for start, end, segment in sentence_spans
        ]

        # There was a try-except in Contraxsuite catching some lexnlp exception.
        # Not putting it here because it should be solved on lexnlp side.
        paragraphs = [
            PlainTextParagraph(start=start,
                               end=end,
                               language=language or lang.predict_lang(segment))
            for segment, start, end in get_paragraphs(text, return_spans=True)
        ]

        sections = [
            PlainTextSection(title=sect.title,
                             start=sect.start,
                             end=sect.end,
                             title_start=sect.title_start,
                             title_end=sect.title_end,
                             level=sect.level,
                             abs_level=sect.abs_level)
            for sect in get_document_sections_with_titles(
                text, sentence_list=sentence_spans)
        ]

        try:
            title = next(get_titles(text))
        except StopIteration:
            title = None

        text_struct = PlainTextStructure(title=title,
                                         language=language
                                         or lang.predict_lang(text),
                                         pages=pages,
                                         sentences=sentences,
                                         paragraphs=paragraphs,
                                         sections=sections)

        char_bboxes = pdfbox_res['charBBoxes']
        pdf_coordinates = PDFCoordinates(char_bboxes=char_bboxes)
        yield text, TextAndPDFCoordinates(
            text_structure=text_struct,
            pdf_coordinates=pdf_coordinates), out_pdf_fn, page_rotate_angles
        return

    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
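
Because the function body yields, callers consume it as a generator even though it produces exactly one tuple. A hedged usage sketch (the PDF path is a placeholder):

for text, coords, corrected_pdf_fn, angles in extract_text_and_structure(
        '/path/to/document.pdf', language='en'):
    # angles is None when the PDF contained no extractable text
    print(len(text), corrected_pdf_fn, angles)
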
Example #11
    def get_company_annotations(
        self,
        text: str,
        strict: bool = False,
        use_gnp: bool = False,
        count_unique: bool = False,
        name_upper: bool = False,
        banlist_usage: Optional[BanListUsage] = None
    ) -> Generator[CompanyAnnotation, None, None]:
        """
        Find company names in text, optionally using the stricter article/prefix expression.
        :param text:
        :param strict:
        :param use_gnp: use get_noun_phrases or NPExtractor
        :param name_upper: return company name in upper case.
        :param count_unique: return only unique companies - case insensitive.
        :param banlist_usage: a banlist or hints on using the default BL
        :return:
        """
        # skip if all text is in uppercase
        if text == text.upper():
            return
        banlist = self.get_company_banlist(banlist_usage)
        valid_punctuation = VALID_PUNCTUATION + ["(", ")"]
        unique_companies: Dict[Tuple[str, str], CompanyAnnotation] = {}

        if not self.company_types_re.search(text):
            return
        # iterate through sentences
        for s_start, _s_end, sentence in get_sentence_span_list(text):
            # skip if whole phrase is in uppercase
            if sentence == sentence.upper():
                continue
            if use_gnp:
                phrases = list(
                    get_noun_phrases(sentence,
                                     strict=strict,
                                     valid_punctuation=valid_punctuation))
            else:
                phrases = list(self.np_extractor.get_np(sentence))
            phrase_spans = PhrasePositionFinder.find_phrase_in_source_text(
                sentence, phrases)

            for phrase, p_start, _p_end in phrase_spans:
                if self.company_types_re.search(phrase):
                    ant: CompanyAnnotation
                    for ant in self.get_companies_re(
                            phrase, use_sentence_splitter=False):
                        if ant.name == ant.company_type or ant.name == ant.description:
                            continue
                        # check against banlist
                        if banlist and EntityBanListItem.check_list(ant.name, banlist):
                            continue
                        ant.coords = (ant.coords[0] + s_start + p_start,
                                      ant.coords[1] + s_start + p_start)

                        if name_upper:
                            ant.name = ant.name.upper()

                        if count_unique:
                            unique_key = (ant.name.lower() if ant.name else
                                          None, ant.company_type_abbr)
                            existing_result = unique_companies.get(unique_key)

                            if existing_result:
                                existing_result.counter += 1
                            else:
                                unique_companies[unique_key] = ant
                        else:
                            yield ant

        if count_unique:
            for company in unique_companies.values():
                yield company