def test_parse_pdf_with_debug(mocker):
    """Parse ``tests/data/test.pdf`` with the debug flag set and check the output.

    The data directory is resolved relative to the ``PYTHONPATH`` environment
    variable. The test expects ``parse_pdf`` to return something other than
    ``None`` and expects ``tests/data/test-debug.pdf`` to exist afterwards.

    Args:
        mocker: Accepted for the test signature; not used in the body.
    """
    path = Path(os.getenv('PYTHONPATH'))
    pdf_file = os.path.join(path, "tests/data/test.pdf")
    debug_file = os.path.join(path, "tests/data/test-debug.pdf")
    stats = [0, 0]
    try:
        pages = pdf_parser.parse_pdf(pdf_file, True, stats)
        assert pages is not None
        assert os.path.exists(debug_file)
    finally:
        # Remove the generated artifact even when an assertion fails, so a
        # stale debug file cannot satisfy the existence check on a later run.
        if os.path.exists(debug_file):
            os.remove(debug_file)
def test_save_pdf_pages(mocker):
    """Parse ``tests/data/test.pdf``, export the pages to CSV and check the file.

    Paths are resolved relative to the ``PYTHONPATH`` environment variable.
    The CSV written to ``tests/data/test-pages.csv`` is deleted once its
    existence has been asserted.

    Args:
        mocker: Accepted for the test signature; not used in the body.
    """
    root = Path(os.getenv('PYTHONPATH'))
    source_pdf, target_csv = (
        os.path.join(root, relative)
        for relative in ("tests/data/test.pdf", "tests/data/test-pages.csv")
    )
    counters = [0, 0]
    parsed_pages = pdf_parser.parse_pdf(source_pdf, False, counters)
    pdfUtil.save_pdf_pages_tocsv("test.pdf", parsed_pages, target_csv)
    assert os.path.exists(target_csv)
    os.remove(target_csv)
def test_parse_and_match_fields_with_debug(mocker):
    """Parse ``tests/data/train.pdf`` with the debug flag set and check the stats.

    The data directory is resolved relative to the ``PYTHONPATH`` environment
    variable. The test expects a non-``None`` result, expects
    ``tests/data/train-debug.pdf`` to exist afterwards, and expects both
    entries of the ``stats`` list passed to ``parse_pdf`` to equal 14.

    Args:
        mocker: Accepted for the test signature; not used in the body.
    """
    path = Path(os.getenv('PYTHONPATH'))
    pdf_file = os.path.join(path, "tests/data/train.pdf")
    debug_file = os.path.join(path, "tests/data/train-debug.pdf")
    stats = [0, 0]
    try:
        pages = pdf_parser.parse_pdf(pdf_file, True, stats)
        assert pages is not None
        assert os.path.exists(debug_file)
        assert stats[0] == 14
        assert stats[1] == 14
    finally:
        # Clean up even when a stats assertion fails; previously the debug
        # file was left behind in that case.
        if os.path.exists(debug_file):
            os.remove(debug_file)
def predict_task(current_task, filename, data):
    """Run the markup model over an uploaded PDF and return the marked-up PDF.

    Steps: decode the base64 payload, re-save it through pikepdf into a
    temporary file (the "remove password" step), extract line features with
    ``pdf_parser``/``pdfUtil`` into a temporary CSV, score every row with the
    pickled model, and let ``pdfUtil.markup_pdf`` write the result PDF.

    Args:
        current_task: Object exposing ``update_state(state=..., meta=...)``,
            used for progress reporting (presumably a Celery task - confirm
            against the caller).
        filename: Name of the uploaded file; used in progress messages, in
            the CSV export and as ``attachment_filename`` of the result.
        data: Base64-encoded content of the PDF.

    Returns:
        dict with keys ``'data'`` (base64-encoded bytes of the marked-up PDF),
        ``'attachment_filename'`` and ``'mimetype'``.

    All temporary files are removed before returning, including when a step
    raises.
    """
    start_time = time.time()
    temp_paths = []  # every temp file created below, removed in ``finally``

    try:
        # remove password
        current_task.update_state(state='PROGRESS',
                                  meta="Removing password on " + filename)
        print("Predict: " + filename)
        input_pdf = io.BytesIO(base64.b64decode(data))
        # Leaving the ``with`` closes (and therefore flushes) the handle, so
        # the parser below sees the complete file when it opens it by name.
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file_name = temp_file.name
            temp_paths.append(temp_file_name)
            with pikepdf.open(input_pdf) as pdf:
                pdf.save(temp_file)

        # extract features
        stats = [0, 0]
        pages = pdf_parser.parse_pdf(temp_file_name, False, stats, current_task)
        # Only the path is needed; close the handle right away so the helper
        # can reopen the file by name.
        with tempfile.NamedTemporaryFile(delete=False) as temp_csv_file:
            temp_csv_file_name = temp_csv_file.name
            temp_paths.append(temp_csv_file_name)
        pdfUtil.save_pdf_pages_tocsv(filename, pages, temp_csv_file_name)

        # prepare data
        markup_data = pd.read_csv(temp_csv_file_name)
        for column in ("HasCentLine", "HasComboLine", "IsMarkupField"):
            markup_data[column] = markup_data[column].astype(int)
        x = markup_data.drop(labels=[
            'FileName', 'PageNum', 'LineLeft', 'LineRight', 'LineTop',
            'LineBottom', 'Prefix', 'Suffix', 'FieldCode', 'FieldLeft',
            'FieldRight', 'FieldTop', 'FieldBottom', "IsMarkupField"
        ], axis=1)
        transformer = ColumnTransformer(
            [("hash", FeatureHasher(n_features=2, input_type='string'),
              'TopElement')],
            remainder="passthrough")
        transformed_x = transformer.fit_transform(x)

        # Get the model's prediction
        current_task.update_state(state='PROGRESS',
                                  meta="Get the model's prediction ")
        # NOTE: unpickling executes code embedded in the file; this path must
        # only ever hold a trusted model artifact.
        with open("/app/ml_model/markup.pkl", "rb") as model_file:
            pdf_model = pickle.load(model_file)
        markup_data['IsMarkupField'] = pdf_model.predict_proba(transformed_x)[:, 1]

        # markup the PDF
        with tempfile.NamedTemporaryFile(delete=False) as temp_output_file:
            temp_output_file_name = temp_output_file.name
            temp_paths.append(temp_output_file_name)
        pdfUtil.markup_pdf(markup_data, temp_file_name, temp_output_file_name)

        # read back the marked up PDF
        with open(temp_output_file_name, 'rb') as fo:
            marked_up_pdf = fo.read()
    finally:
        # clean up
        for temp_path in temp_paths:
            if os.path.exists(temp_path):
                os.remove(temp_path)

    total_time = "total time spent: " + str(time.time() - start_time)
    current_task.update_state(state='PROGRESS', meta=total_time)
    print(total_time)
    return {
        'data': base64.b64encode(marked_up_pdf),
        'attachment_filename': filename,
        'mimetype': 'application/pdf'
    }
delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) log_writer.writerow(["FileName", "Total", "Found", "Percentage"]) for pdf_file in Path(args.train).glob('**/*.pdf'): files.append(str(pdf_file)) with open(csvFile, mode='w', newline='') as csv_file: csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) csv_writer.writerow(["FileName", "PageNum", "LineLeft", "LineRight", "LineTop", "LineBottom", "LineWidth", \ "TopElement", "LeftPadding", "RightPadding", "HasCentLine", "HasComboLine", "FieldWidth", "FieldHeight", \ "Prefix", "PrefixGap", "Suffix", "SuffixGap", "FieldCode", "FieldLeft", "FieldRight", "FieldTop", "FieldBottom", "IsMarkupField"]) for pdfFile in files: print(pdfFile) pages = pdf_parser.parse_pdf(pdfFile, debug, stats) percent = 0 if stats[0] > 0: percent = stats[1] / stats[0] if args.train: log_writer.writerow([pdfFile, stats[0], stats[1], percent]) for i, page in enumerate(pages): for line in [x for x in page if x.IsHorizontal]: csv_writer.writerow([pdfFile, i + 1, line.Position.Left, line.Position.Right, line.Position.Top, line.Position.Bottom, \ line.LineWidth, str(line.TopElement).replace("ElementType.",""), line.LeftPadding, line.RightPadding, line.HasCentLine, line.HasComboLine, \ line.FieldPosition.Right - line.FieldPosition.Left, line.FieldPosition.Top - line.FieldPosition.Bottom, \ line.Prefix, line.PrefixGap, line.Suffix, line.SuffixGap, \ line.FieldCode, line.FieldPosition.Left, line.FieldPosition.Right, line.FieldPosition.Top, line.FieldPosition.Bottom, \ line.IsMarkupField])
def test_parse_pdf_no_debug(mocker):
    """Parse ``tests/data/test.pdf`` with the debug flag off and expect a result.

    The data directory is resolved relative to the ``PYTHONPATH`` environment
    variable.

    Args:
        mocker: Accepted for the test signature; not used in the body.
    """
    path = Path(os.getenv('PYTHONPATH'))
    pdf_file = os.path.join(path, "tests/data/test.pdf")
    stats = [0, 0]
    pages = pdf_parser.parse_pdf(pdf_file, False, stats)
    # Identity check: ``!= None`` would defer to a custom ``__ne__`` if the
    # returned object defined one.
    assert pages is not None
def test_parse_and_match_fields(mocker):
    """Parse ``tests/data/train.pdf`` with the debug flag off and expect a result.

    The data directory is resolved relative to the ``PYTHONPATH`` environment
    variable.

    Args:
        mocker: Accepted for the test signature; not used in the body.
    """
    path = Path(os.getenv('PYTHONPATH'))
    pdf_file = os.path.join(path, "tests/data/train.pdf")
    stats = [0, 0]
    pages = pdf_parser.parse_pdf(pdf_file, False, stats)
    # Identity check: ``!= None`` would defer to a custom ``__ne__`` if the
    # returned object defined one.
    assert pages is not None