Python parse_pdf示例，RecordLib.summary.pdf.parse_pdf Python示例

示例#1

0

显示文件

def test_init():
    """Check that parse_pdf accepts an open binary file handle without raising."""
    try:
        # Context manager so the PDF handle is closed even when parsing fails.
        with open("tests/data/CourtSummaryReport.pdf", "rb") as pdf_file:
            parse_pdf(pdf=pdf_file, tempdir="tests/data/tmp")
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as a failure.
        pytest.fail("Creating Summary object failed.")

示例#2

0

显示文件

def test_get_cases():
    """Check that a parsed summary yields at least one case, the first being a Case."""
    summary = parse_pdf(
        pdf="tests/data/CourtSummaryReport.pdf",
        tempdir="tests/data/tmp")
    # Fetch the cases once; the original repeated the same length assertion twice.
    cases = summary.get_cases()
    assert len(cases) > 0
    assert isinstance(cases[0], Case)

示例#3

0

显示文件

def test_get_defendant():
    """Check the defendant parsed from the summary has names and a post-1900 birth date."""
    pdf_path = "tests/data/CourtSummaryReport.pdf"
    parsed = parse_pdf(pdf=pdf_path, tempdir="tests/data/tmp")

    # Both name fields must be non-empty.
    assert len(parsed.get_defendant().first_name) > 0
    assert len(parsed.get_defendant().last_name) > 0

    # Sanity bound on the parsed date of birth.
    earliest_plausible = date(1900, 1, 1)
    assert parsed.get_defendant().date_of_birth > earliest_plausible

示例#4

0

显示文件

文件： test_summary.py 项目： f1nesse13/RecordLib

def test_add_summary_to_crecord():
    """Check that add_summary with override_person=True replaces the placeholder person."""
    parsed = parse_pdf(
        pdf="tests/data/CourtSummaryReport.pdf",
        tempdir="tests/data/tmp",
    )

    # Start from a record whose person is a known placeholder.
    placeholder = Person("John", "Smith", date(1998, 1, 1))
    record = CRecord(placeholder)
    record.add_summary(parsed, override_person=True)

    # The first name must be populated and must no longer be the placeholder's.
    assert len(record.person.first_name) > 0
    assert record.person.first_name != "John"

示例#5

0

显示文件

def test_get_arrest_date():
    """Check that at least one case in the parsed summary carries an arrest date."""
    # Context manager so the PDF handle is closed once parsing is done.
    with open("tests/data/CourtSummaryReport.pdf", "rb") as pdf_file:
        summary = parse_pdf(pdf=pdf_file, tempdir="tests/data/tmp")
    cases = summary.get_cases()
    # There's not a standard example summary pdf to run tests on, so can't assume much about the contents of
    # the summary being parsed here.
    # In the summary being parsed, an arrest date might be missing from a case,
    # but it's unlikely there's _no_ case with an arrest date.
    # If you're testing this on a summary that has no arrest dates ...
    # find a different summary to use for testing.
    arrest_dates = [case.arrest_date for case in cases if case.arrest_date is not None]
    # Without this assertion the test could only fail on an exception and
    # would pass even when no arrest date was ever parsed.
    assert len(arrest_dates) > 0

示例#6

0

显示文件

def test_get_sentences():
    """Check every sentence's maximum length is either a timedelta or None."""
    summary = parse_pdf(
        pdf="tests/data/CourtSummaryReport.pdf",
        tempdir="tests/data/tmp")
    for case in summary.get_cases():
        for charge in case.charges:
            for sentence in charge.sentences:
                try:
                    # The attribute lookup stays inside the try: a missing
                    # sentence_length is reported as a test failure too.
                    max_time = sentence.sentence_length.max_time
                    assert max_time is None or isinstance(max_time, timedelta)
                except Exception:
                    # `Exception` (which includes AssertionError) instead of a
                    # bare `except`, so KeyboardInterrupt/SystemExit propagate.
                    pytest.fail("Could not get sentence from charge.")

示例#7

0

显示文件

def test_bulk_parse_pdf_from_path(caplog):
    """Parse every file in tests/data/summaries and fail if any of them cannot be parsed.

    Successfully parsed file names are logged at INFO level; the names of
    files that raised during parsing are logged at ERROR level before the
    test is failed.
    """
    caplog.set_level(logging.INFO)
    summaries_dir = "tests/data/summaries"
    paths = os.listdir(summaries_dir)
    if not paths:
        pytest.fail("No summaries to parse in /tests/data/summaries.")
    fails = []
    logging.info("Successful parses:")
    for path in paths:
        try:
            # Only success or failure matters here; the parsed result is discarded.
            parse_pdf(pdf=os.path.join(summaries_dir, path), tempdir="tests/data/tmp")
            logging.info(path)
        except Exception:
            # `Exception` instead of a bare `except`, so an interrupted run
            # (KeyboardInterrupt/SystemExit) is not recorded as a parse failure.
            print(path)
            fails.append(os.path.split(path)[1])
    if fails:
        logging.error(f"{ len(fails) } / {len(paths)} summaries failed to parse:")
        for fail in fails:
            logging.error(f"  - {fail}")
        pytest.fail("Summaries failed to parse.")

示例#8

0

显示文件

    def put(self, request, *args, **kwargs):
        """
        Merge the information held in a set of SourceRecords into a CRecord.

        The request data is validated with IntegrateSourcesSerializer. Each
        listed source record is looked up by id; summary PDFs are parsed and
        added with add_summary, docket PDFs are parsed and added with
        add_docket, and any other record type is logged as an error and
        skipped.

        Responds with 200 and the serialized CRecord on success, 400 with the
        serializer errors when validation fails, and 500 with the error text
        when any exception is raised.

        TODO this should replace FileUpload view.
        """
        try:
            serializer = IntegrateSourcesSerializer(data=request.data)

            # Guard clause: reject invalid payloads before doing any work.
            if not serializer.is_valid():
                return Response({"errors": serializer.errors},
                                status=status.HTTP_400_BAD_REQUEST)

            crecord = CRecord.from_dict(serializer.validated_data["crecord"])
            for entry in serializer.validated_data["source_records"]:
                source_record = SourceRecord.objects.get(id=entry["id"])
                record_type = source_record.record_type

                if record_type == SourceRecord.RecTypes.SUMMARY_PDF:
                    parsed_summary = parse_pdf(source_record.file.path)
                    crecord.add_summary(
                        parsed_summary,
                        case_merge_strategy="overwrite_old",
                        override_person=True)
                    continue

                if record_type == SourceRecord.RecTypes.DOCKET_PDF:
                    # from_pdf yields a (docket, errors) pair; the errors are not used here.
                    parsed_docket, _errs = Docket.from_pdf(
                        source_record.file.path)
                    crecord.add_docket(parsed_docket)
                    continue

                logger.error(
                    f"Cannot parse a source record with type {source_record.record_type}"
                )

            payload = {'crecord': CRecordSerializer(crecord).data}
            return Response(payload, status=status.HTTP_200_OK)
        except Exception as err:
            return Response({"errors": [str(err)]},
                            status=status.HTTP_500_INTERNAL_SERVER_ERROR)

示例#9

0

显示文件

def dir(directory, archive, expungement_template, sealing_template, atty_name,
        atty_org, atty_org_addr, atty_org_phone, atty_bar_id, tempdir):
    """Analyze every docket and summary PDF in a directory and package the resulting petitions.

    Each regular file in `directory` is first tried as a docket and, if that
    fails, as a summary. Everything that parses is added to one CRecord, the
    expungement and sealing rules are applied, each resulting petition is
    given the attorney details and the template matching its type, and the
    rendered petitions are handed to a Compressor that is saved to `archive`.

    Args:
        directory: Directory holding the PDF files to process.
        archive: Passed to Compressor as the destination of the package.
        expungement_template: Path of the template for "Expungement" petitions.
        sealing_template: Path of the template for "Sealing" petitions.
        atty_name: Attorney name.
        atty_org: Attorney's organization.
        atty_org_addr: Organization address.
        atty_org_phone: Organization phone number.
        atty_bar_id: Attorney bar id.
        tempdir: Scratch directory passed to the parsers and the Compressor.

    Returns:
        None. Prints a message and returns early if `directory` does not exist.
    """
    if not os.path.exists(directory):
        print(f"The directory {directory} does not exist.")
        return
    files = [
        os.path.join(directory, f) for f in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, f))
    ]
    summaries = []
    dockets = []
    atty = Attorney(name=atty_name,
                    organization=atty_org,
                    organization_address=atty_org_addr,
                    organization_phone=atty_org_phone,
                    bar_id=atty_bar_id)
    for f in files:
        print(f"  Processing {f}")
        try:
            # NOTE(review): another caller unpacks `Docket.from_pdf` as a
            # `(docket, errs)` pair; confirm which return shape applies here
            # before relying on `dk` being a Docket.
            dk = Docket.from_pdf(f, tempdir=tempdir)
            print(f"    It looks like {f} is a docket.")
            dockets.append(dk)
        except Exception:
            # `Exception` instead of a bare `except`, so Ctrl-C still aborts
            # the run rather than being treated as "not a docket".
            try:
                sm = parse_pdf(f, tempdir=tempdir)
                print(f"    It looks like {f} is a summary.")
                summaries.append(sm)
            except Exception:
                print(f"    It seems {f} is neither a summary nor a docket.")

    crec = CRecord()
    # Plain loops: these calls are made for their side effect on `crec`.
    for summary in summaries:
        crec.add_summary(summary)
    for docket in dockets:
        crec.add_docket(docket)

    analysis = (Analysis(crec).rule(expunge_deceased).rule(
        expunge_over_70).rule(expunge_nonconvictions).rule(
            expunge_summary_convictions).rule(seal_convictions))

    petitions = [
        petition for decision in analysis.decisions
        for petition in decision.value
    ]
    for petition in petitions:
        petition.attorney = atty
    # NOTE(review): petitions are rendered after these template files are
    # closed; this presumably relies on set_template consuming the file
    # immediately. Verify against the Petition implementation.
    with open(sealing_template, "rb") as doc:
        for petition in petitions:
            if petition.petition_type == "Sealing":
                petition.set_template(doc)

    with open(expungement_template, "rb") as doc:
        for petition in petitions:
            if petition.petition_type == "Expungement":
                petition.set_template(doc)

    petition_tuples = [(pt.file_name(), pt.render()) for pt in petitions]
    pkg = Compressor(archive, petition_tuples, tempdir=tempdir)
    pkg.save()
    print("*********************************")
    print("****** COMPLETE *****************")
    print("*********************************")

示例#10

0

显示文件

def test_parse_pdf_from_path():
    """Check that parse_pdf accepts a path string and yields at least one case."""
    pdf_path = "tests/data/CourtSummaryReport.pdf"
    parsed = parse_pdf(pdf=pdf_path, tempdir="tests/data/tmp")
    parsed_cases = parsed.get_cases()
    assert len(parsed_cases) > 0