def test_title_2():
    """
    Check title extraction on the second sample document from ``test_data``.

    The sample is read as raw bytes and decoded as UTF-8 before being
    handed to ``get_titles``; exactly one title is expected.
    """
    sample_path = os.path.join(get_module_path(), "..", "test_data", "1100644_2016-11-21")

    # Read the raw bytes first, then decode explicitly as UTF-8.
    with open(sample_path, "rb") as sample_file:
        raw_bytes = sample_file.read()
    document_text = raw_bytes.decode("utf-8")

    found_titles = list(get_titles(document_text))
    assert_list_equal(found_titles, ["VALIDIAN SOFTWARE LICENSE AGREEMENT"])
def test_title_2(self):
    """
    Verify the title found in the second sample document.

    The sample file under ``test_data`` is opened as UTF-8 text and the
    titles produced by ``get_titles`` are compared with the single
    expected title.
    """
    sample_path = os.path.join(get_module_path(), '..', 'test_data', '1100644_2016-11-21')

    # Load the whole document as UTF-8 text.
    with codecs.open(sample_path, 'r', encoding='utf-8') as sample_file:
        document_text = sample_file.read()

    expected_titles = ['VALIDIAN SOFTWARE LICENSE AGREEMENT']
    self.assertEqual(expected_titles, list(get_titles(document_text)))
def test_title_1(self):
    """
    Test first example title.

    Downloads a sample lease agreement from the LexPredict samples
    repository and checks that the only title found is 'LEASE AGREEMENT'.
    """
    # Setup URL
    url = "https://raw.githubusercontent.com/LexPredict/lexpredict-contraxsuite-samples/master/agreements/" + \
          "construction/1000694_2002-03-15_AGREEMENT%20OF%20LEASE-W.M.RICKMAN%20CONSTRUCTION%20CO..txt"

    # Download file. Bound the wait so a stalled connection cannot hang the
    # test run, and fail loudly on an HTTP error instead of feeding an
    # error page to the title extractor.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    file_text = response.text

    self.assertEqual(['LEASE AGREEMENT'], list(get_titles(file_text)))
def test_title_3():
    """
    Ensure ``get_titles`` yields no titles for a snippet of tabular report text.
    """
    sample_text = """ (1279209, 'en', 'C-106 TRANSPORTATION AND PUBLIC WORKS PERFORMANCE MEASURES Actual Forecast FY14 FY15 FY16 FY17 FY18 Traffic Engineering # of Miles of Roadway Striping 20 miles 10 miles 70 miles 10 miles 10 Miles # of Signs Replaced 1,300 1,800 2,100 1,600 1,600 # of Traffic Signal Upgrades 2 15 18 25 30 Engineering Average Plan Review Time 6.9 days 6.13 days 8.34 days 7 days 7 days % Plan Reviews Completed Within 14 / 7 days 94% / 59% 94% / 67% 97% / 74% 100% / 75% 100%/ 75% # Roadway Miles Receiving Major Roadway Maintenance 57.8 miles 47 miles 45.0 miles 45.0 miles 45 miles Streets & Drainage Average Response Time for Street immediate Work Requests 1 day 1 day 1 day 1 day 1day Percent of Street immediate work requests completed in 3 days 95% 90% 96% 95% 95% Percentage of staff hours utilized on recurring work activities 30% 33% 40% 45% 45% Stormwater Utility Bill Collection Rate 94% 98% 95% 95% 95% Average Response Time for...', 1, , ...)"""

    # Nothing in this snippet should be reported as a title.
    found_titles = list(get_titles(sample_text))
    assert_list_equal(found_titles, [])
def extract_text_and_structure(pdf_fn: str,
                               pdf_password: str = None,
                               timeout_sec: int = 3600,
                               language: str = "",
                               correct_pdf: bool = False,
                               render_coords_debug: bool = False) \
        -> Tuple[str, TextAndPDFCoordinates, str, Dict[int, float]]:
    """
    Extract plain text, its structure and character coordinates from a PDF.

    Runs the Java class ``com.lexpredict.textextraction.GetTextFromPDF`` in a
    subprocess, loads the msgpack file it writes into a temporary directory,
    and builds pages, sentences, paragraphs, sections and a title from the
    extracted text.

    This is a generator: it yields a single 4-tuple
    ``(text, TextAndPDFCoordinates, out_pdf_fn, page_rotate_angles)`` and
    removes the temporary directory in its ``finally`` clause, i.e. once the
    generator is resumed or closed after that yield.
    NOTE(review): presumably wrapped as a context manager so the corrected
    PDF stays on disk while the caller uses it - confirm at the call sites.

    :param pdf_fn: path of the PDF file to process
    :param pdf_password: passed to the extractor with ``-p`` when truthy
    :param timeout_sec: timeout, in seconds, for the extractor subprocess
    :param language: converted through ``LanguageConverter``; when the
        converted value is truthy it is used for every sentence, paragraph
        and the document itself instead of running language detection
    :param correct_pdf: ask the extractor to write a corrected PDF
        (``-corrected_output``) into the temporary directory; the yielded
        ``out_pdf_fn`` then points to that file instead of ``pdf_fn``
    :param render_coords_debug: adds ``-render_char_rects`` to the extractor
        arguments and forces ``correct_pdf`` on
    :return: yields ``(text, structure, out_pdf_fn, page_rotate_angles)``
        where ``text`` has NUL characters removed and ``page_rotate_angles``
        is the list of per-page ``deskewAngle`` values, or ``None`` when the
        extracted text is empty
    """
    # Rendering char rectangles always switches the corrected-PDF output on.
    correct_pdf = correct_pdf or render_coords_debug

    modules_dir = get_settings().java_modules_path

    # Convert language to language code (the locale code is not used here).
    language, _locale_code = LanguageConverter().get_language_and_locale_code(language)

    work_dir = mkdtemp(prefix='pdf_text_')
    pdf_stem = os.path.splitext(os.path.basename(pdf_fn))[0]
    msgpack_fn = os.path.join(work_dir, pdf_stem + '.msgpack')
    out_pdf_fn = pdf_fn
    try:
        cmd = ['java', '-cp', f'{modules_dir}/*',
               'com.lexpredict.textextraction.GetTextFromPDF',
               pdf_fn, msgpack_fn,
               '-f', 'pages_msgpack']
        if pdf_password:
            cmd += ['-p', pdf_password]
        if correct_pdf:
            out_pdf_fn = os.path.join(work_dir, pdf_stem + '_corr.pdf')
            cmd += ['-corrected_output', out_pdf_fn]
        if render_coords_debug:
            cmd.append('-render_char_rects')

        proc: CompletedProcess = subprocess.run(cmd,
                                                check=False,
                                                timeout=timeout_sec,
                                                universal_newlines=True,
                                                stderr=PIPE,
                                                stdout=PIPE)
        raise_from_process(log, proc,
                           process_title=lambda: f'Extract text and structure from {pdf_fn}')
        raise_from_pdfbox_error_messages(proc)

        with open(msgpack_fn, 'rb') as packed_f:
            # see object structure in com.lexpredict.textextraction.dto.PDFPlainText
            extracted: Dict[str, Any] = msgpack.unpack(packed_f, raw=False)

        # Remove Null characters because of incompatibility with PostgreSQL
        text = extracted['text'].replace("\x00", "")

        if not text:
            # Nothing was extracted: yield an empty structure and no angles.
            empty_coords = PDFCoordinates(char_bboxes=extracted['charBBoxes'])
            empty_struct = PlainTextStructure(
                title='',
                language=language or 'en',  # FastText returns English for empty strings
                pages=[],
                sentences=[],
                paragraphs=[],
                sections=[])
            yield (text,
                   TextAndPDFCoordinates(text_structure=empty_struct,
                                         pdf_coordinates=empty_coords),
                   out_pdf_fn,
                   None)
            return

        page_rotate_angles: List[float] = [page['deskewAngle'] for page in extracted['pages']]

        # Pages are numbered from zero in the order the extractor returned them.
        pages = [PlainTextPage(number=page_no,
                               start=page['location'][0],
                               end=page['location'][1],
                               bbox=page['bbox'])
                 for page_no, page in enumerate(extracted['pages'])]

        sentence_spans = get_sentence_span_list(text)
        detector = get_lang_detector()

        def pick_language(fragment):
            # An explicitly requested language wins; otherwise detect it.
            return language or detector.predict_lang(fragment)

        sentences = [PlainTextSentence(start=begin, end=finish, language=pick_language(chunk))
                     for begin, finish, chunk in sentence_spans]

        # There was a try-except in Contraxsuite catching some lexnlp exception.
        # Not putting it here because it should be solved on lexnlp side.
        paragraphs = [PlainTextParagraph(start=begin, end=finish, language=pick_language(chunk))
                      for chunk, begin, finish in get_paragraphs(text, return_spans=True)]

        sections = [PlainTextSection(title=sec.title,
                                     start=sec.start,
                                     end=sec.end,
                                     title_start=sec.title_start,
                                     title_end=sec.title_end,
                                     level=sec.level,
                                     abs_level=sec.abs_level)
                    for sec in get_document_sections_with_titles(text, sentence_list=sentence_spans)]

        # First detected title, or None when there is none.
        title = next(get_titles(text), None)

        full_struct = PlainTextStructure(title=title,
                                         language=pick_language(text),
                                         pages=pages,
                                         sentences=sentences,
                                         paragraphs=paragraphs,
                                         sections=sections)
        full_coords = PDFCoordinates(char_bboxes=extracted['charBBoxes'])

        yield (text,
               TextAndPDFCoordinates(text_structure=full_struct,
                                     pdf_coordinates=full_coords),
               out_pdf_fn,
               page_rotate_angles)
    finally:
        # Always drop the temporary directory (msgpack output and corrected PDF).
        shutil.rmtree(work_dir, ignore_errors=True)