Example #1
def get_next_date_range(court_id, span=7):
    """Get the next start and end query dates for a court.

    Check the DB for the last date for a court that was completed. Return the
    day after that date + span days into the future as the range to query for
    the requested court.

    If the court is still in progress, return (None, None).

    :param court_id: A PACER Court ID
    :param span: The number of days to go forward from the last completed date
    """
    court_id = map_pacer_to_cl_id(court_id)
    try:
        last_completion_log = (
            PACERFreeDocumentLog.objects.filter(court_id=court_id)
            .exclude(status=PACERFreeDocumentLog.SCRAPE_FAILED)
            .latest("date_queried")
        )
    except PACERFreeDocumentLog.DoesNotExist:
        logger.warning("FAILED ON: %s" % court_id)
        raise

    if last_completion_log.status == PACERFreeDocumentLog.SCRAPE_IN_PROGRESS:
        return None, None

    # Always start at least five days back, even if the last successful
    # scrape was more recent than that.
    last_complete_date = min(now().date() - timedelta(days=5),
                             last_completion_log.date_queried)
    next_end_date = min(now().date(),
                        last_complete_date + timedelta(days=span))
    return last_complete_date, next_end_date
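
The date arithmetic above clamps the query window: the start date is never more recent than five days ago, and the end date never runs past today. A minimal standalone sketch of that clamping using only the standard library (the function name and sample dates are illustrative, not part of the project):

from datetime import date, timedelta

def next_date_range(last_queried: date, today: date, span: int = 7):
    """Start at least five days back; end no later than today."""
    start = min(today - timedelta(days=5), last_queried)
    end = min(today, start + timedelta(days=span))
    return start, end

# Even though the last success was yesterday, the window starts five days back.
print(next_date_range(date(2023, 5, 9), today=date(2023, 5, 10)))
# (datetime.date(2023, 5, 5), datetime.date(2023, 5, 10))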
Example #2
def get_docket_ids(last_x_days: int) -> Set[int]:
    """Get docket IDs to update via iquery

    :param last_x_days: How many of the last days relative to today should we
    inspect? E.g. 1 means just today, 2 means today and yesterday, etc.
    :return: docket IDs for which we should crawl iquery
    """
    docket_ids = set()
    if hasattr(settings, "MATOMO_TOKEN"):
        try:
            r = requests.get(
                settings.MATOMO_REPORT_URL,
                timeout=10,
                params={
                    "idSite": settings.MATOMO_SITE_ID,
                    "module": "API",
                    "method": "Live.getLastVisitsDetails",
                    "period": "day",
                    "format": "json",
                    "date": "last%s" % last_x_days,
                    "token_auth": settings.MATOMO_TOKEN,
                },
            )
            r.raise_for_status()
            j = r.json()
        except (
                ConnectionRefusedError,
                JSONDecodeError,
                RequestException,
        ) as e:
            logger.warning(
                "iQuery scraper was unable to get results from Matomo. Got "
                "exception: %s" % e)
        else:
            for item in j:
                for actiondetail in item["actionDetails"]:
                    url = actiondetail.get("url")
                    if url is None:
                        continue
                    match = re.search(
                        r"^https://www\.courtlistener\.com/docket/([0-9]+)/",
                        url,
                    )
                    if match is None:
                        continue
                    docket_ids.add(int(match.group(1)))

    # Add in docket IDs that have docket alerts or are favorited
    docket_ids.update(DocketAlert.objects.values_list("docket", flat=True))
    docket_ids.update(
        Favorite.objects.exclude(docket_id=None).values_list(
            "docket_id", flat=True
        )
    )
    docket_ids.update(
        Docket.objects.filter(
            case_name__isnull=True, source__in=Docket.RECAP_SOURCES
        )
        .order_by("?")
        .values_list("pk", flat=True)
    )
    return docket_ids
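
The Matomo loop above pulls docket IDs out of visited URLs with a regular expression. A self-contained sketch of just that extraction step, casting the captured group to an int as the Set[int] return type requires (the sample URLs are made up):

import re
from typing import Set

DOCKET_URL = re.compile(r"^https://www\.courtlistener\.com/docket/([0-9]+)/")

def extract_docket_ids(urls) -> Set[int]:
    """Collect the numeric docket ID from any URL that matches the pattern."""
    ids = set()
    for url in urls:
        match = DOCKET_URL.search(url)
        if match:
            ids.add(int(match.group(1)))
    return ids

print(extract_docket_ids([
    "https://www.courtlistener.com/docket/4214664/usa-v-smith/",
    "https://www.courtlistener.com/about/",  # no match, skipped
]))  # {4214664}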
Example #3
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options["queue"]

    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    results = si.query().add_extra(**main_query).execute()
    si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info(
            "Doing item %s w/rd: %s, d: %s",
            i,
            result["id"],
            result["docket_id"],
        )

        try:
            rd = RECAPDocument.objects.get(pk=result["id"])
        except RECAPDocument.DoesNotExist:
            logger.warning(
                "Unable to find RECAP Document with id %s", result["id"]
            )
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG).set(
                queue=q
            ),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
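
The offset/limit handling uses a chained comparison: i >= options["limit"] > 0 only breaks when a positive limit was given and the counter has reached it. A tiny standalone illustration of the same pattern (the helper name is hypothetical):

def take_slice(items, offset=0, limit=0):
    """Yield items[offset:limit]; a limit of 0 means no upper bound."""
    for i, item in enumerate(items):
        if i < offset:
            continue
        if i >= limit > 0:  # chained comparison: limit > 0 and i >= limit
            break
        yield item

print(list(take_slice(range(10), offset=2, limit=5)))  # [2, 3, 4]
print(list(take_slice(range(10), offset=8)))           # [8, 9]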
Example #4
    def set_if_falsy(obj, attribute, new_value):
        """Check if the value passed in is Falsy. If so, set it to the value of
        new_value.

        return ok: Whether the item was set successfully
        """
        current_value = getattr(obj, attribute)
        if current_value is not None and isinstance(current_value, str):
            current_value = current_value.strip()

        does_not_currently_have_a_value = not current_value
        current_value_not_zero = current_value != 0
        new_value_not_blank = new_value.strip() != ""
        ok = True
        if all([
                does_not_currently_have_a_value,
                current_value_not_zero,
                new_value_not_blank,
        ]):
            logger.info("Updating %s with %s." %
                        (attribute, new_value.encode()))
            setattr(obj, attribute, new_value)
        else:
            # Report if there's a difference -- that might spell trouble.
            values_differ = False
            if (
                isinstance(current_value, str)
                and isinstance(new_value, str)
                and "".join(current_value.split()) != "".join(new_value.split())
            ):
                # Handles strings and normalizes them for comparison.
                values_differ = True
            elif isinstance(current_value, int) and current_value != int(new_value):
                # Handles ints, which need no normalization for comparison.
                values_differ = True

            if values_differ:
                logger.warning(
                    "WARNING: Didn't set '{attr}' attribute on obj {obj_id} "
                    "because it already had a value, but the new value "
                    "('{new}') differs from current value ('{current}')".format(
                        attr=attribute,
                        obj_id=obj.pk,
                        new=new_value,
                        current=force_bytes(current_value),
                    )
                )
                ok = False
            else:
                # The values were the same.
                logger.info("'%s' field unchanged -- old and new values were "
                            "the same: %s" % (attribute, new_value))
        return ok
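
set_if_falsy only overwrites an attribute when the current value is empty and the incoming value is non-blank; otherwise it compares the old and new values with whitespace normalized and warns when they differ. A minimal sketch of that comparison logic on plain values, without the ORM (the helper name is illustrative):

def values_effectively_differ(current, new) -> bool:
    """True when two values disagree after whitespace normalization."""
    if isinstance(current, str) and isinstance(new, str):
        return "".join(current.split()) != "".join(new.split())
    if isinstance(current, int):
        return current != int(new)
    return False

print(values_effectively_differ("John  Smith", "John Smith"))  # False
print(values_effectively_differ("John Smith", "Jane Smith"))   # True
print(values_effectively_differ(42, "43"))                     # True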
Example #5
    def do_citations(cluster, scdb_info):
        """
        Handle the citation fields.

        :param cluster: The Cluster to be changed.
        :param scdb_info: A dict with the SCDB information.
        """
        fields = {
            "usCite": ("U.S.", Citation.FEDERAL),
            "sctCite": ("S. Ct.", Citation.FEDERAL),
            "ledCite": ("L. Ed.", Citation.FEDERAL),
            "lexisCite": ("U.S. LEXIS", Citation.LEXIS),
        }
        for scdb_field, reporter_info in fields.items():
            if not scdb_info[scdb_field]:
                continue
            try:
                citation_obj = get_citations(
                    scdb_info[scdb_field],
                    html=False,
                    do_post_citation=False,
                    do_defendant=False,
                    disambiguate=False,
                )[0]
            except IndexError:
                logger.warning("Unable to parse citation for: %s",
                               scdb_info[scdb_field])
            else:
                cites = cluster.citations.filter(reporter=reporter_info[0])
                if cites.count() == 1:
                    # Update the existing citation.
                    cite = cites[0]
                    cite.volume = citation_obj.volume
                    cite.reporter = citation_obj.reporter
                    cite.page = citation_obj.page
                    cite.save()
                else:
                    try:
                        # Create a new citation
                        Citation.objects.create(
                            cluster=cluster,
                            volume=citation_obj.volume,
                            reporter=citation_obj.reporter,
                            page=citation_obj.page,
                            type=reporter_info[1],
                        )
                    except IntegrityError:
                        # Violated unique_together constraint. Fine.
                        pass
Example #6
    def do_first_pass(options):
        idb_rows = FjcIntegratedDatabase.objects.filter(
            dataset_source=CV_2017,
        ).order_by("pk")
        q = options["queue"]
        throttle = CeleryThrottle(queue_name=q)
        for i, idb_row in enumerate(queryset_generator(idb_rows)):
            # Iterate over all items in the IDB and find them in the Docket
            # table. If they're not there, create a new item.
            if i < options["offset"]:
                continue
            if i >= options["limit"] > 0:
                break

            throttle.maybe_wait()
            # TODO: See conversation in #courtlistener channel from 2019-07-11,
            # in which it appears we matched a criminal case with a civil one.
            # The code below doesn't protect against that, but it should (and I
            # think it does in the `do_second_pass` code, below).
            ds = Docket.objects.filter(
                docket_number_core=idb_row.docket_number,
                court=idb_row.district,
            )
            count = ds.count()
            if count == 0:
                logger.info(
                    "%s: Creating new docket for IDB row: %s", i, idb_row
                )
                create_new_docket_from_idb.apply_async(
                    args=(idb_row.pk,), queue=q,
                )

            elif count == 1:
                d = ds[0]
                logger.info(
                    "%s: Merging Docket %s with IDB row: %s", i, d, idb_row
                )
                merge_docket_with_idb.apply_async(
                    args=(d.pk, idb_row.pk), queue=q
                )
            elif count > 1:
                logger.warning(
                    "%s: Unable to merge. Got %s dockets for row: %s",
                    i,
                    count,
                    idb_row,
                )
Example #7
def find_missing_or_incorrect_docket_numbers(options):
    """Iterate over tax cases to verify which docket numbers are correct.

    :param options:
    :return: Nothing
    """

    should_fix = options["fix"]
    ocs = OpinionCluster.objects.filter(docket__court="tax").exclude(
        sub_opinions__plain_text="")

    logger.info("%s clusters found", ocs.count())

    for oc in ocs:
        logger.info("Analyzing cluster %s", oc.id)
        ops = oc.sub_opinions.all()
        assert ops.count() == 1
        for op in ops:
            logger.warning(
                "Reference url: https://www.courtlistener.com/opinion/%s/x",
                oc.id,
            )
            # Only loop over the first opinion because these cases should only
            # have one, since they were extracted from the tax courts.
            dockets_in_db = oc.docket.docket_number.strip()
            found_dockets = get_tax_docket_numbers(op.plain_text)
            if found_dockets == dockets_in_db:
                if dockets_in_db == "":
                    logger.info("No docket numbers found in db or text.")
                else:
                    logger.info("Docket numbers appear correct.")
                continue
            else:
                if dockets_in_db == "":
                    logger.warning(
                        "Docket No(s). found for the first time: %s",
                        found_dockets,
                    )
                elif found_dockets == "":
                    logger.warning(
                        "Docket No(s). not found in text but Docket No(s). %s in db",
                        dockets_in_db,
                    )
                else:
                    logger.warning(
                        "Dockets in db (%s) != (%s) docket parsed from text",
                        dockets_in_db,
                        found_dockets,
                    )
                if should_fix:
                    oc.docket.docket_number = found_dockets
                    oc.docket.save()
Example #8
def process_citations(data, debug):
    """Walk through the citations and add them one at a time.
    """
    updated_ids = set()
    for index, item in data.iterrows():
        logger.info("\nAdding citation from %s to %s" %
                    (item["citing"], item["cited"]))
        try:
            cite = OpinionsCited.objects.get(
                citing_opinion_id=item["citing"],
                cited_opinion_id=item["cited"],
            )
            msg = "Citation already exists. Doing nothing:\n"
        except OpinionsCited.DoesNotExist:
            cite = OpinionsCited(
                citing_opinion_id=item["citing"],
                cited_opinion_id=item["cited"],
            )
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                updated_ids.add(cite.citing_opinion.pk)
        try:
            logger.info(
                "  %s"
                "    %s: %s\n"
                "    From: %s\n"
                "    To:   %s\n" %
                (msg, cite.pk, cite, cite.citing_opinion, cite.cited_opinion))
        except Opinion.DoesNotExist:
            logger.warning(
                "  Unable to create citation. Underlying Opinion doesn't "
                "exist.")

    logger.info("\nUpdating Solr...")
    if not debug:
        add_items_to_solr(updated_ids, "search.Opinion")
    logger.info("Done.")
Example #9
def create_or_update_row(values: Dict[str, str]) -> FjcIntegratedDatabase:
    fjc_filters = [
        {
            "district": values["district"],
            "docket_number": values["docket_number"],
            "origin": values["origin"],
            "date_filed": values["date_filed"],
        },
        # Match on defendant (that'll work better on criminal cases). It can
        # change over time, but if we find a match that's a very strong
        # indicator and we should use it.
        {
            "defendant": values["defendant"]
        },
    ]
    existing_rows = FjcIntegratedDatabase.objects.all()
    for fjc_filter in fjc_filters:
        existing_rows = existing_rows.filter(**fjc_filter)
        existing_row_count = existing_rows.count()
        if existing_row_count == 0:
            fjc_row = FjcIntegratedDatabase.objects.create(**values)
            logger.info("Added row: %s", fjc_row)
            break
        elif existing_row_count == 1:
            existing_rows.update(date_modified=now(), **values)
            fjc_row = existing_rows[0]
            logger.info("Updated row: %s" % fjc_row)
            break
    else:
        # Didn't hit a break b/c too many matches.
        logger.warning(
            "Got %s results when looking up row by filters: %s",
            existing_row_count,
            fjc_filter,
        )
        fjc_row = None

    return fjc_row
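
create_or_update_row leans on Python's for/else: the else branch runs only when the loop never hit break, i.e. when no filter produced a usable match. A self-contained illustration of that control flow (the data and helper are made up):

def first_single_match(candidates, filters):
    """Stop at the first filter that narrows the candidates to exactly one item."""
    result = None
    for f in filters:
        matches = [c for c in candidates if f(c)]
        if len(matches) == 1:
            result = matches[0]
            break
    else:
        # Only reached when the loop never hit `break`.
        print("No filter produced a single match.")
    return result

rows = ["smith v. jones", "smith v. doe", "acme v. jones"]
print(first_single_match(rows, [
    lambda r: "smith" in r,       # two matches -- keep going
    lambda r: r.endswith("doe"),  # exactly one -- break
]))  # smith v. doe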
Example #10
    def map_judges_to_photos(self):
        """Identify which of the judges in the DB have photos.

        We iterate over the entire collection of judges, identifying which have
        photos. We could instead iterate over the photos, but that increases
        the risk of duplicate issues.
        """
        # Create a dict of judge paths, mapping paths to empty lists.
        judge_paths = os.listdir(os.path.join(judge_root, "orig"))
        judge_map = {}
        for path in judge_paths:
            judge_map[path] = []

        # Iterate over the people, attempting to look them up in the list
        people = Person.objects.filter(is_alias_of=None)
        for person in people:
            for name in self.make_slugs(person):
                if name in judge_map:
                    # If there's a hit, add the path to the dict of judge paths.
                    judge_map[name].append(person)
                    break

        # After iterating, set all people to not have photos.
        if not self.debug:
            people.update(has_photo=False)

        found = 0
        missed = 0
        multi = 0
        for path, people in judge_map.items():
            if len(people) == 0:
                logger.warning("Did not find a judge for %s" % path)
                missed += 1
            elif len(people) == 1:
                person = people[0]
                found += 1
                if not self.debug:
                    logger.info("Updating judge %s" % person)
                    person.has_photo = True
                    person.save()
            elif len(people) > 1:
                logger.warning("Found more than one match for %s:" % path)
                for person in people:
                    logger.warning("Found: %s - %s" % (
                        person,
                        granular_date(
                            person,
                            "date_dob",
                            iso=True,
                        ),
                    ))
                multi += 1

        logger.info("\n\n%s Matches\n%s Missed\n%s Multiple results" %
                    (found, missed, multi))
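
The photo mapper builds a dict keyed by filename, appends every person whose slug matches, and then tallies the empty, single, and multiple buckets. A compact standalone sketch of that bucketing and tallying (all names and filenames below are invented):

judge_paths = ["breyer-stephen.jpg", "ginsburg-ruth.jpg", "smith-john.jpg"]
people_slugs = {
    "Stephen Breyer": ["breyer-stephen.jpg"],
    "Ruth Bader Ginsburg": ["ginsburg-ruth.jpg"],
    "John Smith": ["smith-john.jpg"],
    "John A. Smith": ["smith-john.jpg"],  # collides with John Smith
}

# Bucket each photo path with every person whose slug matches it.
judge_map = {path: [] for path in judge_paths}
for person, slugs in people_slugs.items():
    for slug in slugs:
        if slug in judge_map:
            judge_map[slug].append(person)
            break

found = sum(1 for v in judge_map.values() if len(v) == 1)
missed = sum(1 for v in judge_map.values() if not v)
multi = sum(1 for v in judge_map.values() if len(v) > 1)
print(found, missed, multi)  # 2 0 1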
Example #11
    def scrape_court(self, site, full_crawl=False, ocr_available=True):
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        if dup_checker.abort_by_url_hash(site.url, site.hash):
            return

        if site.cookies:
            logger.info(f"Using cookies: {site.cookies}")
        for i, item in enumerate(site):
            msg, r = get_binary_content(
                item["download_urls"],
                site.cookies,
                method=site.method,
            )
            if msg:
                logger.warning(msg)
                ErrorLog(log_level="WARNING", court=court, message=msg).save()
                continue

            content = site.cleanup_content(r.content)

            current_date = item["case_dates"]
            try:
                next_date = site[i + 1]["case_dates"]
            except IndexError:
                next_date = None

            # request.content is sometimes a str, sometimes unicode, so
            # force it all to be bytes, pleasing hashlib.
            sha1_hash = sha1(force_bytes(content))
            if (
                court_str == "nev"
                and item["precedential_statuses"] == "Unpublished"
            ):
                # Nevada's non-precedential cases have different SHA1 sums
                # every time.
                lookup_params = {
                    "lookup_value": item["download_urls"],
                    "lookup_by": "download_url",
                }
            else:
                lookup_params = {
                    "lookup_value": sha1_hash,
                    "lookup_by": "sha1",
                }

            proceed = dup_checker.press_on(
                Opinion, current_date, next_date, **lookup_params
            )
            if dup_checker.emulate_break:
                break
            if not proceed:
                continue

            # Not a duplicate, carry on
            logger.info(
                f"Adding new document found at: {item['download_urls'].encode()}"
            )
            dup_checker.reset()

            docket, opinion, cluster, citations = make_objects(
                item, court, sha1_hash, content
            )

            save_everything(
                items={
                    "docket": docket,
                    "opinion": opinion,
                    "cluster": cluster,
                    "citations": citations,
                },
                index=False,
            )
            extract_doc_content.delay(
                opinion.pk, ocr_available=ocr_available, citation_jitter=True
            )

            logger.info(
                f"Successfully added doc {opinion.pk}: {item['case_names'].encode()}"
            )

        # Update the hash if everything finishes properly.
        logger.info(f"{site.court_id}: Successfully crawled opinions.")
        if not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
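
The duplicate check above hashes the downloaded content after forcing it to bytes. A minimal sketch of that hashing step with the standard library; Django's force_bytes is replaced here by a plain encode for strings:

import hashlib

def sha1_of_content(content) -> str:
    """Hex SHA-1 of response content, which may arrive as str or bytes."""
    if isinstance(content, str):
        content = content.encode("utf-8")
    return hashlib.sha1(content).hexdigest()

print(sha1_of_content("An opinion body"))
print(sha1_of_content(b"An opinion body"))  # same digest either way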
Example #12
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume of the reporter (optional), e.g. 10
    :param reporter: Reporter abbreviation, slugified (optional), e.g. "tc" for T.C.
    :param make_searchable: Whether to add the parsed opinions to Solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
                else:
                    # Anything other than a too-long docket number is unexpected.
                    raise
            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifying Per Curiam.
                for elem in [op.find("author")]:
                    if elem is not None:
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
Example #13
def lookup_row(row):
    """Lookup the row provided in the FJC DB.

    :param row: A row dict as pulled from the CSV using the csv DictReader
    :return: The FjcIntegratedDatabase row that matched, or None if no single
    match was found.
    """
    try:
        plaintiff, defendant = row["Case Name"].lower().split(" v. ", 1)
    except ValueError:
        # With maxsplit=1, a ValueError means ' v. ' wasn't in the case name.
        logger.warning("Unable to find ' v. ' in case name.")
        return
    opinion_date = datetime.strptime(row["Date"], "%m/%d/%Y")
    orig_query = (
        FjcIntegratedDatabase.objects.filter(
            # All of these are civil.
            dataset_source=CV_2017,
            # Ensure the correct court.
            district__fjc_court_id=row["AO ID"],
            # The docket must have been filed *before* the date of the opinion.
            date_filed__lte=opinion_date,
            # But not more than five years prior to the opinion.
            date_filed__gte=opinion_date - timedelta(days=365 * 5),
        ).exclude(
            # FJC Ids are duplicated across bankruptcy and district. Since we only
            # know the FJC court ID, just exclude bankruptcy cases as a rule. That
            # will ensure we limit ourselves to the correct jurisdiction.
            district__jurisdiction=Court.FEDERAL_BANKRUPTCY,
        ).order_by("-date_filed")
    )

    # Start with the strictest, then broaden when you fail. Truncate at 30
    # chars (that's all the field can contain).
    filter_tuples = [
        (
            # Try an exact match on case name.
            (),
            {
                "plaintiff__iexact": plaintiff[:30],
                "defendant__iexact": defendant[:30],
            },
        ),
        (
            # Try a starts with match on case name.
            (),
            {
                "plaintiff__istartswith": plaintiff[:30],
                "defendant__istartswith": defendant[:30],
            },
        ),
        (
            # Try to find a match that contains the first three words from the
            # plaintiff and defendant (in any order). Note Q objects are args, not
            # kwargs, hence different format here.
            (
                make_party_q(defendant, "defendant", slice(None, 3)),
                make_party_q(plaintiff, "plaintiff", slice(None, 3)),
            ),
            {},
        ),
        (
            # Broaden. Try just the first word from plaintiff & defendant matching.
            (
                make_party_q(defendant, "defendant", slice(None, 1)),
                make_party_q(plaintiff, "plaintiff", slice(None, 1)),
            ),
            {},
        ),
        (
            # Explore. Try the second word of the plaintiff instead. It's often a
            # last name and worth a try.
            (
                make_party_q(plaintiff, "plaintiff", slice(1, 2)),
                make_party_q(defendant, "defendant", slice(None, 1)),
            ),
            {},
        ),
    ]

    for args, kwargs in filter_tuples:
        results = orig_query.filter(*args, **kwargs)
        count = results.count()
        if count == 0:
            logger.warning("Unable to find result (args: %s, kwargs: %s). "
                           "Broadening if possible." % (args, kwargs))
            continue
        if count == 1:
            logger.info("Got one result. Bingo (args: %s, kwargs: %s)." %
                        (args, kwargs))
            return results[0]
        elif 5 > count > 1:
            logger.info("Got %s results. Choosing closest to document date." %
                        count)
            return results[0]
        else:
            logger.warning(
                "Got too many results. Cannot identify correct case "
                "(args: %s, kwargs: %s)." % (args, kwargs))
            return
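
Each entry in filter_tuples carries positional Q objects and keyword lookups separately so the loop can apply them uniformly with filter(*args, **kwargs). A tiny illustration of that star-unpacking convention, with an ordinary function standing in for the queryset:

def fake_filter(*args, **kwargs):
    """Stand-in for QuerySet.filter: just report how it was called."""
    return "filter(args=%r, kwargs=%r)" % (args, kwargs)

filter_tuples = [
    # Keyword lookups travel in the dict...
    ((), {"plaintiff__iexact": "smith", "defendant__iexact": "jones"}),
    # ...while Q-object-style filters travel positionally in the tuple.
    (("Q(plaintiff__icontains='smith')",), {}),
]

for args, kwargs in filter_tuples:
    print(fake_filter(*args, **kwargs))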
Example #14
def download_documents(options):
    """We've got good values in the new columns, so just need to look those up,
    and get the documents from PACER.
    """
    f = open(options["input_file"], "r")
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect)
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()
    for i, row in enumerate(reader):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info("Doing row %s: %s", i, row)

        docket_number = (row["cl_d_docket_number"]
                         or row["cl_d_docket_number (student)"] or None)

        if not docket_number:
            logger.warning("No docket number found for row: %s", i)
            continue
        court = Court.federal_courts.district_courts().get(
            fjc_court_id=row["AO ID"].rjust(2, "0"),
        )

        try:
            d = Docket.objects.get(docket_number=docket_number, court=court)
        except Docket.MultipleObjectsReturned:
            logger.warning("Multiple objects returned for row: %s", i)
            continue
        except Docket.DoesNotExist:
            logger.warning("Could not find docket for row: %s", i)
            continue

        # Got the docket, now get the documents from it, tag & OCR them.
        document_date = datetime.strptime(row["Date"], "%m/%d/%Y").date()
        des = d.docket_entries.filter(date_filed=document_date)
        count = des.count()
        if count == 0:
            logger.warning("No docket entries found for row: %s", i)
            continue
        elif count == 1:
            good_des = [des[0]]
        else:
            # More than one item. Apply filtering rules.
            good_des = filter_des(des)

        # We've got our des, now download them.
        for de in good_des:
            rds = de.recap_documents.filter(
                document_type=RECAPDocument.PACER_DOCUMENT)
            for rd in rds:
                if not rd.pacer_doc_id:
                    logger.warning(
                        "Unable to get pacer_doc_id for item with "
                        "rd_pk: %s. Restricted document?",
                        rd.pk,
                    )
                    continue
                if options["task"] == "add_extra_tags":
                    # Wherein I belatedly realize we need a tag specifically
                    # for this part of the project.
                    add_tags(rd, TAG_NAME_OPINIONS)
                else:
                    # Otherwise, do the normal download thing.
                    chain(
                        get_pacer_doc_by_rd.s(
                            rd.pk, session.cookies, tag=TAG_NAME
                        ).set(queue=q),
                        extract_recap_pdf.si(rd.pk).set(queue=q),
                        add_items_to_solr.si(
                            [rd.pk], "search.RECAPDocument"
                        ).set(queue=q),
                    ).apply_async()
    f.close()
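
The command above sniffs the CSV dialect from the first kilobyte before handing the file to DictReader, then rewinds the file. A self-contained sketch of that setup using an in-memory file and invented sample data:

import csv
import io

sample = "Case Name,Date,AO ID\nSmith v. Jones,01/05/1996,42\n"
f = io.StringIO(sample)

dialect = csv.Sniffer().sniff(f.read(1024))  # detects the comma delimiter
f.seek(0)  # rewind so DictReader sees the header row again
for row in csv.DictReader(f, dialect=dialect):
    print(row["Case Name"], row["Date"])  # Smith v. Jones 01/05/1996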
Example #15
def parse_harvard_opinions(reporter, volume):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume of the reporter (optional), e.g. 10
    :param reporter: Reporter abbreviation, slugified (optional), e.g. "tc" for T.C.
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            list(set(itertools.chain.from_iterable(judge_list + author_list)))
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            # Iterate over the other XML fields in the Harvard data set and
            # save each as a pipe-joined string for further processing later.
            json_fields = [
                "attorneys",
                "disposition",
                "syllabus",
                "summary",
                "history",
                "otherdate",
                "seealso",
                "headnotes",
                "correction",
            ]
            data_set = {}
            for key in json_fields:
                data_set[key] = "|".join([x.text for x in soup.find_all(key)])

            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            for op in soup.find_all("opinion"):
                joined_by_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(judge_list)))
                    )
                )
                author_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(author_list)))
                    )
                )

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())
Example #16
    def scrape_court(self, site, full_crawl=False, backscrape=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i, item in enumerate(site):
                msg, r = get_binary_content(
                    item["download_urls"],
                    site.cookies,
                    site._get_adapter_instance(),
                    method=site.method,
                )
                if msg:
                    logger.warning(msg)
                    ErrorLog(
                        log_level="WARNING", court=court, message=msg
                    ).save()
                    continue

                content = site.cleanup_content(r.content)

                current_date = item["case_dates"]
                try:
                    next_date = site[i + 1]["case_dates"]
                except IndexError:
                    next_date = None

                # request.content is sometimes a str, sometimes unicode, so
                # force it all to be bytes, pleasing hashlib.
                sha1_hash = sha1(force_bytes(content))
                onwards = dup_checker.press_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by="sha1",
                )
                if dup_checker.emulate_break:
                    break

                if onwards:
                    # Not a duplicate, carry on
                    logger.info(
                        "Adding new document found at: %s"
                        % item["download_urls"].encode("utf-8")
                    )
                    dup_checker.reset()

                    docket, audio_file, error = make_objects(
                        item, court, sha1_hash, content
                    )

                    if error:
                        download_error = True
                        continue

                    save_everything(
                        items={"docket": docket, "audio_file": audio_file},
                        index=False,
                        backscrape=backscrape,
                    )
                    process_audio_file.apply_async(
                        (audio_file.pk,), countdown=random.randint(0, 3600)
                    )

                    logger.info(
                        "Successfully added audio file {pk}: {name}".format(
                            pk=audio_file.pk,
                            name=item["case_names"].encode("utf-8"),
                        )
                    )

            # Update the hash if everything finishes properly.
            logger.info(
                "%s: Successfully crawled oral arguments." % site.court_id
            )
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)
Example #17
def find_missing_or_incorrect_citations(options):
    """Iterate over tax cases to verify which citations are correctly parsed

    This code should pull back all the cases with plaintext tax courts to parse.
    Iterate over those cases extracting the citation if any

    :param options:
    :return:
    """
    should_fix = options["fix"]

    ocs = OpinionCluster.objects.filter(docket__court="tax").exclude(
        sub_opinions__plain_text="")
    logger.info("%s clusters found", ocs.count())

    for oc in ocs:
        logger.warning(
            "Reference url: https://www.courtlistener.com/opinion/%s/x",
            oc.id,
        )
        cites = oc.citations.all()

        logger.info("Found %s cite(s) for case in db", cites.count())

        if cites.count() > 0:
            if should_fix:
                logger.warning("Deleting cites in cluster %s", oc.id)
                cites.delete()

        ops = oc.sub_opinions.all()
        assert ops.count() == 1
        for op in ops:
            # Only loop over the first opinion because these cases should
            # only have one opinion.
            found_cite = find_tax_court_citation(op.plain_text)
            if found_cite is not None:
                found_cite_str = found_cite.base_citation()
                logger.info("Found citation in plain text as %s",
                            found_cite_str)
                if should_fix:
                    logger.warning("Creating citation: %s", found_cite_str)
                    Citation.objects.create(
                        volume=found_cite.volume,
                        reporter=found_cite.reporter,
                        page=found_cite.page,
                        type=found_cite.type,
                        cluster_id=oc.id,
                    )
                else:
                    if cites.count() > 0:
                        for cite in cites:
                            if str(cite) != found_cite_str:
                                logger.warning(
                                    "Have (%s), Expect (%s)",
                                    cite,
                                    found_cite_str,
                                )
                    else:
                        logger.warning("Add %s to db", found_cite_str)

            else:
                if cites.count() > 0:
                    for cite in cites:
                        logger.warning("Have (%s), Expect None", cite)
                        logger.warning("%s should be removed", cite)
                else:
                    logger.info("No citation in db or text: %s", oc.id)