예제 #1
0
def test_title_2():
    """
    Check title extraction on the second sample document.

    The ``1100644_2016-11-21`` fixture is located relative to the path
    returned by ``get_module_path()``, read as bytes, decoded as UTF-8 and
    passed to ``get_titles``; exactly one title is expected.
    """
    expected_titles = ["VALIDIAN SOFTWARE LICENSE AGREEMENT"]
    fixture_path = os.path.join(get_module_path(), "..", "test_data", "1100644_2016-11-21")

    # The fixture is read in binary mode and decoded explicitly.
    with open(fixture_path, "rb") as fixture:
        raw_bytes = fixture.read()
        document_text = raw_bytes.decode("utf-8")
        found_titles = list(get_titles(document_text))
        assert_list_equal(found_titles, expected_titles)
예제 #2
0
 def test_title_2(self):
     """
     Check title extraction on the second sample document.

     The ``1100644_2016-11-21`` fixture is read as UTF-8 text and the
     titles produced by ``get_titles`` are compared with the single
     expected title.
     """
     expected_titles = ['VALIDIAN SOFTWARE LICENSE AGREEMENT']

     # Locate the fixture relative to the module path
     path_parts = (get_module_path(), '..', 'test_data', '1100644_2016-11-21')
     fixture_path = os.path.join(*path_parts)

     with codecs.open(fixture_path, 'r', encoding='utf-8') as fixture:
         # Read and parse
         document_text = fixture.read()
         found_titles = list(get_titles(document_text))
         self.assertEqual(expected_titles, found_titles)
예제 #3
0
    def test_title_1(self):
        """
        Test first example title.

        Downloads a sample lease agreement over HTTP and checks that
        ``get_titles`` yields exactly one title, ``'LEASE AGREEMENT'``.
        The test needs network access; a download problem surfaces as a
        ``requests`` exception instead of a misleading title mismatch.
        """
        # Setup URL
        url = "https://raw.githubusercontent.com/LexPredict/lexpredict-contraxsuite-samples/master/agreements/" + \
              "construction/1000694_2002-03-15_AGREEMENT%20OF%20LEASE-W.M.RICKMAN%20CONSTRUCTION%20CO..txt"

        # Download file. Without a timeout an unresponsive server would block
        # the test run indefinitely.
        response = requests.get(url, timeout=60)
        # Fail on 4xx/5xx here, so that an HTTP error page is never parsed
        # as if it were the agreement text.
        response.raise_for_status()
        file_text = response.text

        self.assertEqual(['LEASE AGREEMENT'], list(get_titles(file_text)))
예제 #4
0
def test_title_3():
    """
    Check that no title is reported for text that contains no document title.

    The sample is a tuple-like dump of a performance-measures table;
    ``get_titles`` is expected to yield nothing for it.
    """
    expected_titles = []
    sample_text = """
    (1279209, 'en', 'C-106 TRANSPORTATION AND PUBLIC WORKS PERFORMANCE
    MEASURES Actual Forecast FY14 FY15 FY16 FY17 FY18 Traffic Engineering
     # of Miles of Roadway Striping 20 miles 10 miles 70 miles 10 miles
     10 Miles # of Signs Replaced 1,300 1,800 2,100 1,600 1,600 # of
     Traffic Signal Upgrades 2 15 18 25 30 Engineering Average Plan Review
      Time 6.9 days 6.13 days 8.34 days 7 days 7 days % Plan Reviews
      Completed Within 14 / 7 days 94% / 59% 94% / 67% 97% / 74% 100% /
       75% 100%/ 75% # Roadway Miles Receiving Major Roadway Maintenance
       57.8 miles 47 miles 45.0 miles 45.0 miles 45 miles Streets &
       Drainage Average Response Time for Street immediate Work Requests
       1 day 1 day 1 day 1 day 1day Percent of Street immediate work
       requests completed in 3 days 95% 90% 96% 95% 95% Percentage of
       staff hours utilized on recurring work activities 30% 33% 40% 45%
       45% Stormwater Utility Bill Collection Rate 94% 98% 95% 95% 95%
       Average Response Time for...', 1, , ...)"""
    found_titles = list(get_titles(sample_text))
    assert_list_equal(found_titles, expected_titles)
예제 #5
0
def extract_text_and_structure(pdf_fn: str,
                               pdf_password: str = None,
                               timeout_sec: int = 3600,
                               language: str = "",
                               correct_pdf: bool = False,
                               render_coords_debug: bool = False) \
        -> Tuple[
            str, TextAndPDFCoordinates, str, Dict[int, float]]:  # text, structure, corrected_pdf_fn, page_rotate_angles
    """
    Extract plain text, its structure and character boxes from a PDF file.

    Runs the Java class ``com.lexpredict.textextraction.GetTextFromPDF`` in a
    subprocess, reads its msgpack output from a temporary directory and
    builds pages, sentences, paragraphs, sections and a title from the text.

    This is a generator: it yields exactly one 4-tuple and removes the
    temporary directory once it is resumed or closed. A corrected PDF, when
    requested, lives in that temporary directory and is therefore only
    available while the generator is suspended at the ``yield``.
    NOTE(review): the return annotation describes the yielded tuple, not a
    generator - presumably a context-manager decorator is applied outside
    this view; confirm. The fourth yielded item is a list (or ``None``),
    not the ``Dict[int, float]`` named in the annotation.

    :param pdf_fn: path of the PDF file to process
    :param pdf_password: passed to the Java tool with ``-p`` when truthy
    :param timeout_sec: timeout of the Java subprocess, in seconds
    :param language: language passed through ``LanguageConverter``; when the
        converted value is falsy the language is detected per text segment
    :param correct_pdf: ask the Java tool for a corrected PDF
        (``-corrected_output``) written to the temporary directory
    :param render_coords_debug: add ``-render_char_rects``; implies
        ``correct_pdf``
    :return: yields ``(text, TextAndPDFCoordinates, pdf file name,
        page rotate angles)``; the file name is ``pdf_fn`` unless a corrected
        PDF was requested, and the angles are ``None`` for empty text
    :raises subprocess.TimeoutExpired: when the Java process exceeds
        ``timeout_sec``; ``raise_from_process`` and
        ``raise_from_pdfbox_error_messages`` may raise as well
    """
    # Debug rendering is only requested together with the corrected output
    correct_pdf = True if render_coords_debug else correct_pdf

    modules_dir = get_settings().java_modules_path

    # Convert language to language code
    language, _locale_code = LanguageConverter().get_language_and_locale_code(language)

    work_dir = mkdtemp(prefix='pdf_text_')
    pdf_stem = os.path.splitext(os.path.basename(pdf_fn))[0]
    msgpack_fn = os.path.join(work_dir, pdf_stem + '.msgpack')
    out_pdf_fn = pdf_fn
    try:
        cmd = ['java', '-cp', f'{modules_dir}/*',
               'com.lexpredict.textextraction.GetTextFromPDF', pdf_fn, msgpack_fn,
               '-f', 'pages_msgpack']

        if pdf_password:
            cmd.extend(['-p', pdf_password])

        if correct_pdf:
            out_pdf_fn = os.path.join(work_dir, pdf_stem + '_corr.pdf')
            cmd.extend(['-corrected_output', out_pdf_fn])
            if render_coords_debug:
                cmd.append('-render_char_rects')

        java_run: CompletedProcess = subprocess.run(cmd,
                                                    check=False,
                                                    timeout=timeout_sec,
                                                    universal_newlines=True,
                                                    stderr=PIPE,
                                                    stdout=PIPE)
        raise_from_process(log,
                           java_run,
                           process_title=lambda: f'Extract text and structure from {pdf_fn}')
        raise_from_pdfbox_error_messages(java_run)

        with open(msgpack_fn, 'rb') as packed:
            # see object structure in com.lexpredict.textextraction.dto.PDFPlainText
            pdfbox_res: Dict[str, Any] = msgpack.unpack(packed, raw=False)

        # Remove Null characters because of incompatibility with PostgreSQL
        text = pdfbox_res['text'].replace("\x00", "")

        if not text:
            # Nothing to segment: yield an empty structure and stop early
            empty_coords = PDFCoordinates(char_bboxes=pdfbox_res['charBBoxes'])
            # FastText returns English for empty strings
            empty_struct = PlainTextStructure(title='',
                                              language=language or 'en',
                                              pages=[],
                                              sentences=[],
                                              paragraphs=[],
                                              sections=[])
            empty_result = TextAndPDFCoordinates(text_structure=empty_struct,
                                                 pdf_coordinates=empty_coords)
            yield text, empty_result, out_pdf_fn, None
            return

        page_rotate_angles: List[float] = [page['deskewAngle'] for page in pdfbox_res['pages']]

        # Pages are numbered from zero in the order reported by the Java tool
        pages = [
            PlainTextPage(number=page_no,
                          start=page['location'][0],
                          end=page['location'][1],
                          bbox=page['bbox'])
            for page_no, page in enumerate(pdfbox_res['pages'])
        ]

        sentence_spans = get_sentence_span_list(text)
        lang = get_lang_detector()

        def language_of(fragment):
            # An explicitly known language wins over detection
            return language or lang.predict_lang(fragment)

        sentences = []
        for span_start, span_end, span_text in sentence_spans:
            sentences.append(PlainTextSentence(start=span_start,
                                               end=span_end,
                                               language=language_of(span_text)))

        # There was a try-except in Contraxsuite catching some lexnlp exception.
        # Not putting it here because it should be solved on lexnlp side.
        paragraphs = []
        for par_text, par_start, par_end in get_paragraphs(text, return_spans=True):
            paragraphs.append(PlainTextParagraph(start=par_start,
                                                 end=par_end,
                                                 language=language_of(par_text)))

        found_sections = get_document_sections_with_titles(text, sentence_list=sentence_spans)
        sections = [
            PlainTextSection(title=section.title,
                             start=section.start,
                             end=section.end,
                             title_start=section.title_start,
                             title_end=section.title_end,
                             level=section.level,
                             abs_level=section.abs_level)
            for section in found_sections
        ]

        # First detected title, or None when there is none
        title = next(get_titles(text), None)

        text_struct = PlainTextStructure(title=title,
                                         language=language_of(text),
                                         pages=pages,
                                         sentences=sentences,
                                         paragraphs=paragraphs,
                                         sections=sections)
        pdf_coordinates = PDFCoordinates(char_bboxes=pdfbox_res['charBBoxes'])
        result = TextAndPDFCoordinates(text_structure=text_struct,
                                       pdf_coordinates=pdf_coordinates)
        yield text, result, out_pdf_fn, page_rotate_angles

    finally:
        # Always drop the temporary directory, including the msgpack output
        # and any corrected PDF written there
        shutil.rmtree(work_dir, ignore_errors=True)