Example #1
def hash_item(item):
    """Hash an RSS item. Item should be a dict at this stage"""
    # Stringify, normalizing dates to strings.
    item_j = json.dumps(item, sort_keys=True, default=str)
    item_hash = sha1(item_j)
    return item_hash
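The sha1 call above is a project-specific helper that byte-encodes its argument before hashing. A minimal standalone sketch of the same idea, using only the standard library (the sample item below is made up for illustration):

import hashlib
import json
from datetime import datetime


def hash_dict(item):
    """Hash a dict deterministically via canonical JSON."""
    # sort_keys gives a stable key order; default=str stringifies dates.
    item_j = json.dumps(item, sort_keys=True, default=str)
    # hashlib only accepts bytes, so encode the JSON string first.
    return hashlib.sha1(item_j.encode("utf-8")).hexdigest()


if __name__ == "__main__":
    item = {"title": "Example v. Example", "pub_date": datetime(2020, 1, 1)}
    print(hash_dict(item))  # the same input always yields the same digest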
Example #2
        "tennworkcompapp": Court.objects.get(pk="tennworkcompapp"),
    }

    for case in tn_corpus:
        if case["label"] == skip_until:
            ready = True
        if not ready:
            continue
        logging.info("Processing label:%s for case:%s", case["label"],
                     case["title"])
        pdf_path = glob("%s/%s/*.pdf" %
                        (os.path.dirname(filepath.name), case["label"]))[0]
        with open(pdf_path, "rb") as p:
            pdf_data = p.read()

        sha1_hash = sha1(force_bytes(pdf_data))
        ops = Opinion.objects.filter(sha1=sha1_hash)
        if len(ops) > 0:
            op = ops[0]
            logging.warning("Document already in database. See: %s at %s" %
                            (op.get_absolute_url(), op.cluster.case_name))

        docket, opinion, cluster, citations, error = make_objects(
            make_item(case),
            courts[case["court"]],
            sha1_hash,
            pdf_data,
        )

        save_everything(
            items={
                "docket": docket,
                "opinion": opinion,
                "cluster": cluster,
                "citations": citations,
            },
            index=False,
        )
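This example reads each PDF fully into memory before hashing. For very large files, an alternative is to hash in chunks; a small sketch using only hashlib (the file path is hypothetical):

import hashlib


def sha1_of_file(path, chunk_size=8192):
    """Compute a file's SHA-1 hex digest without loading it all at once."""
    digest = hashlib.sha1()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


# Usage sketch:
# print(sha1_of_file("some_case.pdf"))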
Example #3
    def scrape_court(
        self,
        site,
        full_crawl: bool = False,
        backscrape: bool = False,
    ) -> None:
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if abort:
            return

        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i, item in enumerate(site):
            msg, r = get_binary_content(
                item["download_urls"],
                site.cookies,
                method=site.method,
            )
            if msg:
                logger.warning(msg)
                ErrorLog(log_level="WARNING", court=court, message=msg).save()
                continue

            content = site.cleanup_content(r.content)

            current_date = item["case_dates"]
            try:
                next_date = site[i + 1]["case_dates"]
            except IndexError:
                next_date = None

            # request.content is sometimes a str, sometimes unicode, so
            # force it all to be bytes, pleasing hashlib.
            sha1_hash = sha1(force_bytes(content))
            onwards = dup_checker.press_on(
                Audio,
                current_date,
                next_date,
                lookup_value=sha1_hash,
                lookup_by="sha1",
            )
            if dup_checker.emulate_break:
                break

            if onwards:
                # Not a duplicate, carry on
                logger.info(
                    "Adding new document found at: %s"
                    % item["download_urls"]
                )
                dup_checker.reset()

                docket, audio_file = make_objects(
                    item, court, sha1_hash, content
                )

                save_everything(
                    items={"docket": docket, "audio_file": audio_file},
                    index=False,
                    backscrape=backscrape,
                )
                process_audio_file.apply_async(
                    (audio_file.pk,), countdown=random.randint(0, 3600)
                )

                logger.info(
                    "Successfully added audio file {pk}: {name}".format(
                        pk=audio_file.pk,
                        name=item["case_names"],
                    )
                )

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled oral arguments." % site.court_id)
        if not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
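DupChecker and press_on are CourtListener internals; stripped of the Django models, the deduplication step boils down to comparing SHA-1 digests of the downloaded bytes. A minimal sketch, with an in-memory set standing in for the database lookup:

import hashlib


def is_duplicate(content, seen_hashes):
    """Return True if this content's SHA-1 digest was already recorded."""
    digest = hashlib.sha1(content).hexdigest()
    if digest in seen_hashes:
        return True
    seen_hashes.add(digest)
    return False


seen = set()
print(is_duplicate(b"same bytes", seen))  # False: first time seen
print(is_duplicate(b"same bytes", seen))  # True: digest already recorded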
Example #4
    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        if dup_checker.abort_by_url_hash(site.url, site.hash):
            return

        if site.cookies:
            logger.info("Using cookies: %s" % site.cookies)
        for i, item in enumerate(site):
            msg, r = get_binary_content(
                item["download_urls"],
                site.cookies,
                site._get_adapter_instance(),
                method=site.method,
            )
            if msg:
                logger.warning(msg)
                ErrorLog(log_level="WARNING", court=court, message=msg).save()
                continue

            content = site.cleanup_content(r.content)

            current_date = item["case_dates"]
            try:
                next_date = site[i + 1]["case_dates"]
            except IndexError:
                next_date = None

            # request.content is sometimes a str, sometimes unicode, so
            # force it all to be bytes, pleasing hashlib.
            sha1_hash = sha1(force_bytes(content))
            if (court_str == "nev"
                    and item["precedential_statuses"] == "Unpublished"):
                # Nevada's non-precedential cases have different SHA1 sums
                # every time.
                lookup_params = {
                    "lookup_value": item["download_urls"],
                    "lookup_by": "download_url",
                }
            else:
                lookup_params = {
                    "lookup_value": sha1_hash,
                    "lookup_by": "sha1",
                }

            proceed = dup_checker.press_on(Opinion, current_date, next_date,
                                           **lookup_params)
            if dup_checker.emulate_break:
                break
            if not proceed:
                continue

            # Not a duplicate, carry on
            logger.info("Adding new document found at: %s" %
                        item["download_urls"].encode("utf-8"))
            dup_checker.reset()

            docket, opinion, cluster, citations, error = self.make_objects(
                item, court, sha1_hash, content)

            if error:
                download_error = True
                continue

            self.save_everything(
                items={
                    "docket": docket,
                    "opinion": opinion,
                    "cluster": cluster,
                    "citations": citations,
                },
                index=False,
            )
            extract_doc_content.delay(
                opinion.pk,
                do_ocr=True,
                citation_jitter=True,
            )

            logger.info("Successfully added doc {pk}: {name}".format(
                pk=opinion.pk,
                name=item["case_names"],
            ))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled opinions." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
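Both scrapers wrap the response body in force_bytes before hashing because hashlib rejects str input. A simplified stand-in for Django's django.utils.encoding.force_bytes (ignoring its lazy-string and error-handling options) looks roughly like this:

def force_bytes(content, encoding="utf-8"):
    """Simplified stand-in for django.utils.encoding.force_bytes."""
    # hashlib only accepts bytes, so normalize str (or other) input first.
    if isinstance(content, bytes):
        return content
    return str(content).encode(encoding)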
Example #5
def import_disclosure(self, data: Dict[str, Union[str, int, list]]) -> None:
    """Import disclosures into Courtlistener

    :param data: The disclosure information to process
    :return: None
    """
    # Check download_filepath to see if it has been processed before.
    if has_been_extracted(data):
        logger.info(f"Document already extracted and saved: {data['id']}.")
        return

    interface = make_redis_interface("CACHE")
    disclosure_key = make_disclosure_key(data["id"])
    newly_enqueued = create_redis_semaphore(
        interface,
        disclosure_key,
        ttl=60 * 60 * 12,
    )

    if not newly_enqueued:
        logger.info(f"Process is already running {data['id']}.")
        return

    # Generate PDF content from our three paths
    year = int(data["year"])
    person_id = data["person_id"]

    logger.info(
        f"Processing row {data['id']} for person {person_id} "
        f"in year {year}"
    )

    # Check if we've already extracted
    disclosure_url = get_aws_url(data)
    was_previously_pdfed = has_been_pdfed(disclosure_url)
    pdf_response = generate_or_download_disclosure_as_pdf(
        data, was_previously_pdfed
    )
    pdf_bytes = pdf_response.content

    if pdf_response.status_code != 200:
        logger.info("PDF generation failed.")
        return

    if was_previously_pdfed:
        disclosure = get_disclosure_from_pdf_path(disclosure_url)
    else:
        logger.info("PDF generated successfully.")

        # Sha1 hash - Check for duplicates
        sha1_hash = sha1(pdf_bytes)
        in_system = check_if_in_system(sha1_hash)
        if in_system:
            logger.info("PDF already in system.")
            interface.delete(disclosure_key)
            return

        # Get the page count - 0 indicates a failure of some kind, e.g. the
        # PDF is not actually present on AWS.
        pg_count = get_page_count(pdf_bytes)
        if not pg_count:
            logger.info(f"PDF failed for disclosure {data['id']}.")
            interface.delete(disclosure_key)
            return

        # Save Financial Disclosure here to AWS and move onward
        disclosure = FinancialDisclosure(
            year=year,
            page_count=pg_count,
            person=Person.objects.get(id=person_id),
            sha1=sha1_hash,
            has_been_extracted=False,
            download_filepath=data.get("url")
            if data.get("url")
            else data.get("urls")[0],
        )
        # Save and upload PDF
        disclosure.filepath.save(
            f"{disclosure.person.slug}-disclosure.{year}.pdf",
            ContentFile(pdf_bytes),
        )
        logger.info(
            f"Uploaded to https://{settings.AWS_S3_CUSTOM_DOMAIN}/"
            f"{disclosure.filepath}"
        )
    # Extract content from PDF
    content = extract_content(
        pdf_bytes=pdf_bytes, disclosure_type=data["disclosure_type"]
    )
    if not content:
        logger.info("Failed extraction!")
        interface.delete(disclosure_key)
        return

    # Save PDF content
    save_disclosure(extracted_data=content, disclosure=disclosure)
    # Remove disclosure ID in redis for completed disclosure
    interface.delete(disclosure_key)
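make_redis_interface and create_redis_semaphore are CourtListener helpers; the underlying pattern is an atomic SET NX with a TTL so that only one worker processes a given disclosure at a time. A minimal sketch using redis-py (connection details and the key name are assumptions):

import redis


def create_semaphore(interface, key, ttl):
    """Try to acquire a short-lived lock; returns True if we got it first."""
    # SET with nx=True only succeeds if the key does not exist yet; ex=ttl
    # expires it so a crashed worker cannot hold the lock forever.
    return bool(interface.set(key, 1, nx=True, ex=ttl))


# Usage sketch:
# r = redis.Redis(host="localhost", port=6379, db=0)
# if create_semaphore(r, "disclosure:123", ttl=60 * 60 * 12):
#     ...  # process the disclosure
#     r.delete("disclosure:123")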