def test_parse_pdf_with_debug(mocker):
    """Check that parsing test.pdf with the debug flag on writes a debug PDF.

    The data directory is resolved relative to the PYTHONPATH environment
    variable. The generated ``test-debug.pdf`` is removed afterwards, even
    when an assertion fails, so a failed run cannot leave a stale artifact
    behind.
    """
    path = Path(os.getenv('PYTHONPATH'))
    pdf_file = os.path.join(path, "tests/data/test.pdf")
    debug_file = os.path.join(path, "tests/data/test-debug.pdf")
    stats = [0, 0]
    try:
        pages = pdf_parser.parse_pdf(pdf_file, True, stats)
        assert pages is not None
        assert os.path.exists(debug_file)
    finally:
        # Guarded removal: if the file was never produced, the assertion
        # error above must surface instead of a FileNotFoundError.
        if os.path.exists(debug_file):
            os.remove(debug_file)
def test_save_pdf_pages(mocker):
    """Check that parsed pages of test.pdf can be exported to a CSV file.

    The data directory is resolved relative to the PYTHONPATH environment
    variable. The generated ``test-pages.csv`` is removed afterwards, even
    when the export or the assertion fails.
    """
    path = Path(os.getenv('PYTHONPATH'))
    pdf_file = os.path.join(path, "tests/data/test.pdf")
    csv_file = os.path.join(path, "tests/data/test-pages.csv")
    stats = [0, 0]
    try:
        pages = pdf_parser.parse_pdf(pdf_file, False, stats)
        pdfUtil.save_pdf_pages_tocsv("test.pdf", pages, csv_file)
        assert os.path.exists(csv_file)
    finally:
        # Guarded removal: if the CSV was never written, the real failure
        # must surface instead of a FileNotFoundError from cleanup.
        if os.path.exists(csv_file):
            os.remove(csv_file)
def test_parse_and_match_fields_with_debug(mocker):
    """Check parsing train.pdf with the debug flag on and verify the stats.

    After parsing, both entries of ``stats`` are expected to equal 14 and a
    ``train-debug.pdf`` file must exist. The debug file is removed
    afterwards, even when an assertion fails.
    """
    path = Path(os.getenv('PYTHONPATH'))
    pdf_file = os.path.join(path, "tests/data/train.pdf")
    debug_file = os.path.join(path, "tests/data/train-debug.pdf")
    stats = [0, 0]
    try:
        pages = pdf_parser.parse_pdf(pdf_file, True, stats)
        assert pages is not None
        assert os.path.exists(debug_file)
        assert stats[0] == 14
        assert stats[1] == 14
    finally:
        # Guarded removal: if the file was never produced, the assertion
        # error above must surface instead of a FileNotFoundError.
        if os.path.exists(debug_file):
            os.remove(debug_file)
예제 #4
0
def _reserve_temp_path():
    """Create an empty temporary file, close it, and return its path."""
    with tempfile.NamedTemporaryFile(delete=False) as handle:
        return handle.name


def predict_task(current_task, filename, data):
    """Run the markup model on a base64-encoded PDF and return the result.

    Steps: decode ``data``, re-save the PDF through pikepdf, parse it into
    page features, export those to CSV, score each row with the pickled
    model, and write a marked-up copy of the PDF.

    Args:
        current_task: Object exposing ``update_state(state=..., meta=...)``,
            used to report progress (presumably a Celery task -- confirm
            against the caller).
        filename: Name of the uploaded PDF; used in progress messages, in the
            exported CSV, and echoed back in the result.
        data: Base64-encoded content of the PDF.

    Returns:
        A dict with ``'data'`` (base64-encoded bytes of the marked-up PDF),
        ``'attachment_filename'`` (``filename``) and ``'mimetype'``
        (``'application/pdf'``).
    """
    start_time = time.time()

    # Every temporary path is registered here so the ``finally`` block can
    # delete it even if a step in the middle raises.
    temp_paths = []
    try:
        # remove password
        current_task.update_state(state='PROGRESS',
                                  meta="Removing password on " + filename)
        print("Predict: " + filename)
        input_pdf = io.BytesIO(base64.b64decode(data))
        # Leaving the ``with`` closes (and therefore flushes) the file before
        # it is reopened by name below.
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file_name = temp_file.name
            temp_paths.append(temp_file_name)
            with pikepdf.open(input_pdf) as pdf:
                pdf.save(temp_file)

        # extract features
        stats = [0, 0]
        pages = pdf_parser.parse_pdf(temp_file_name, False, stats,
                                     current_task)
        temp_csv_file_name = _reserve_temp_path()
        temp_paths.append(temp_csv_file_name)
        pdfUtil.save_pdf_pages_tocsv(filename, pages, temp_csv_file_name)

        # prepare data
        markup_data = pd.read_csv(temp_csv_file_name)
        for flag_column in ("HasCentLine", "HasComboLine", "IsMarkupField"):
            markup_data[flag_column] = markup_data[flag_column].astype(int)
        x = markup_data.drop(labels=[
            'FileName', 'PageNum', 'LineLeft', 'LineRight', 'LineTop',
            'LineBottom', 'Prefix', 'Suffix', 'FieldCode', 'FieldLeft',
            'FieldRight', 'FieldTop', 'FieldBottom', "IsMarkupField"
        ],
                             axis=1)
        transformer = ColumnTransformer(
            [("hash", FeatureHasher(n_features=2,
                                    input_type='string'), 'TopElement')],
            remainder="passthrough")
        transformed_x = transformer.fit_transform(x)

        # Get the model's prediction
        current_task.update_state(state='PROGRESS',
                                  meta="Get the model's prediction ")
        # NOTE: unpickling executes arbitrary code; the model file must only
        # ever come from a trusted source.
        with open("/app/ml_model/markup.pkl", "rb") as model_file:
            pdf_model = pickle.load(model_file)
        markup_data['IsMarkupField'] = pdf_model.predict_proba(
            transformed_x)[:, 1]

        # markup the PDF
        temp_output_file_name = _reserve_temp_path()
        temp_paths.append(temp_output_file_name)
        pdfUtil.markup_pdf(markup_data, temp_file_name, temp_output_file_name)

        # read back the marked up PDF
        with open(temp_output_file_name, 'rb') as fo:
            marked_up_pdf = fo.read()
    finally:
        # clean up
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean for this path.
                pass

    total_time = "total time spent: " + str(time.time() - start_time)
    current_task.update_state(state='PROGRESS', meta=total_time)
    print(total_time)

    return {
        'data': base64.b64encode(marked_up_pdf),
        'attachment_filename': filename,
        'mimetype': 'application/pdf'
    }
예제 #5
0
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
    log_writer.writerow(["FileName", "Total", "Found", "Percentage"])
    for pdf_file in Path(args.train).glob('**/*.pdf'):
        files.append(str(pdf_file))

# Export one CSV row per horizontal line found on every page of every PDF in
# ``files``; when ``args.train`` is truthy, also log per-file statistics.
with open(csvFile, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(
        csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = [
        "FileName", "PageNum", "LineLeft", "LineRight", "LineTop",
        "LineBottom", "LineWidth", "TopElement", "LeftPadding",
        "RightPadding", "HasCentLine", "HasComboLine", "FieldWidth",
        "FieldHeight", "Prefix", "PrefixGap", "Suffix", "SuffixGap",
        "FieldCode", "FieldLeft", "FieldRight", "FieldTop", "FieldBottom",
        "IsMarkupField",
    ]
    csv_writer.writerow(header)
    for pdfFile in files:
        print(pdfFile)
        pages = pdf_parser.parse_pdf(pdfFile, debug, stats)
        # Ratio of stats[1] to stats[0]; stays 0 when stats[0] is not
        # positive so the division can never fail.
        percent = stats[1] / stats[0] if stats[0] > 0 else 0
        if args.train:
            log_writer.writerow([pdfFile, stats[0], stats[1], percent])
        for page_num, page in enumerate(pages, start=1):
            horizontal_lines = [item for item in page if item.IsHorizontal]
            for line in horizontal_lines:
                position = line.Position
                field = line.FieldPosition
                top_element = str(line.TopElement).replace("ElementType.", "")
                csv_writer.writerow([
                    pdfFile,
                    page_num,
                    position.Left,
                    position.Right,
                    position.Top,
                    position.Bottom,
                    line.LineWidth,
                    top_element,
                    line.LeftPadding,
                    line.RightPadding,
                    line.HasCentLine,
                    line.HasComboLine,
                    field.Right - field.Left,
                    field.Top - field.Bottom,
                    line.Prefix,
                    line.PrefixGap,
                    line.Suffix,
                    line.SuffixGap,
                    line.FieldCode,
                    field.Left,
                    field.Right,
                    field.Top,
                    field.Bottom,
                    line.IsMarkupField,
                ])
def test_parse_pdf_no_debug(mocker):
    """Check that parsing test.pdf with the debug flag off returns a result.

    The data directory is resolved relative to the PYTHONPATH environment
    variable.
    """
    path = Path(os.getenv('PYTHONPATH'))
    pdf_file = os.path.join(path, "tests/data/test.pdf")
    stats = [0, 0]
    pages = pdf_parser.parse_pdf(pdf_file, False, stats)
    # Identity check: equality against None can be overridden by __ne__.
    assert pages is not None
def test_parse_and_match_fields(mocker):
    """Check that parsing train.pdf with the debug flag off returns a result.

    The data directory is resolved relative to the PYTHONPATH environment
    variable.
    """
    path = Path(os.getenv('PYTHONPATH'))
    pdf_file = os.path.join(path, "tests/data/train.pdf")
    stats = [0, 0]
    pages = pdf_parser.parse_pdf(pdf_file, False, stats)
    # Identity check: equality against None can be overridden by __ne__.
    assert pages is not None