def import_anon_2020_db(
    import_dir: str,
    skip_until: Optional[str],
    make_searchable: Optional[bool],
) -> None:
    """Import data from anon 2020 DB into our system.

    Iterate over thousands of directories, each containing a tax case's
    JSON data and a preprocessed HTML file. Check whether we already have
    a copy of this opinion in our system and either add the opinion to an
    existing case or create a new docket, cluster, citations, and opinion
    in our database.

    :param import_dir: Location of the directory of import data.
    :param skip_until: ID of the case at which we should begin processing, if any.
    :param make_searchable: Whether to add the content to Solr.
    :return: None.
    """
    directories = iglob(f"{import_dir}/*/????-*.json")
    for case_path in directories:
        logger.info(f"Importing case id: {case_path.split('/')[-2]}")
        if skip_until:
            # Skip everything up to and including the case matching skip_until.
            if skip_until in case_path:
                skip_until = None
            continue

        # Prepare data and html
        with open(case_path, "rb") as f:
            data = json.load(f)
        with open(case_path.replace("json", "html"), "rb") as f:
            soup = bs4(f.read(), "html.parser")

        case_names = do_case_name(soup, data)
        court_id = find_court_id(data["court"])
        date_argued, date_filed = process_dates(data)
        docket_number = do_docket_number(data)
        html_str = soup.find("div", {"class": "container"}).decode_contents()
        found_cites = find_cites(data)
        status = check_publication_status(found_cites)

        cluster_id = None
        if found_cites:
            cluster_id = attempt_cluster_lookup(found_cites, docket_number)

        if cluster_id is not None:
            # Matching citations. Merge.
            docket = merge_or_add_opinions(
                cluster_id,
                html_str,
                data,
                date_argued,
                date_filed,
                case_names,
                status,
                docket_number,
                found_cites,
            )
        else:
            # No matching citations. Create new records.
            docket = add_new_records(
                html_str,
                data,
                date_argued,
                date_filed,
                case_names,
                status,
                docket_number,
                found_cites,
                court_id,
            )

        if make_searchable and docket:
            add_items_to_solr.delay([docket.pk], "search.Docket")
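
The examples in this listing all end by calling add_items_to_solr.delay(...), but the task itself never appears here. Below is a minimal, hypothetical sketch of the shape such a Celery task could take; add_solr_documents and item_to_solr_doc are stand-in helpers, not CourtListener's actual implementation.

from celery import shared_task
from django.apps import apps


@shared_task
def add_items_to_solr(item_pks, app_label):
    """Hypothetical sketch: index the given records in Solr.

    ``item_pks`` is a list of primary keys and ``app_label`` a dotted
    "app.Model" string such as "search.Opinion".
    """
    model = apps.get_model(app_label)
    items = model.objects.filter(pk__in=item_pks)
    # Assumed helpers: turn each instance into a Solr document and POST
    # the batch to the index.
    add_solr_documents([item_to_solr_doc(item) for item in items])

Calling .delay(...) queues the work on a Celery worker instead of indexing inline, which is why the import scripts and admin hooks below can return immediately.
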
Example #2
def find_citations_for_opinion_by_pks(self, opinion_pks, index=True):
    """Find citations for search.Opinion objects.

    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    opinions = Opinion.objects.filter(pk__in=opinion_pks)
    for opinion in opinions:
        # Returns a list of Citation objects, i.e., something like
        # [FullCitation, FullCitation, ShortformCitation, FullCitation,
        #   SupraCitation, SupraCitation, ShortformCitation, FullCitation]
        citations = get_document_citations(opinion)

        # If no citations are found, continue
        if not citations:
            continue

        # Match all those different Citation objects to Opinion objects, using
        # a variety of heuristics.
        try:
            citation_matches = match_citations.get_citation_matches(
                opinion, citations)
        except ResponseNotReady as e:
            # Threading problem in httplib, which is used in the Solr query.
            raise self.retry(exc=e, countdown=2)

        # Consolidate duplicate matches, keeping a counter of how often each
        # match appears (so we know how many times an opinion cites another).
        # keys = cited opinion
        # values = number of times that opinion is cited
        grouped_matches = Counter(citation_matches)

        # Increase the citation count for the cluster of each matched opinion
        # if that cluster has not already been cited by this opinion. First,
        # calculate a list of the IDs of every opinion whose cluster will need
        # updating.
        all_cited_opinions = opinion.opinions_cited.all().values_list(
            "pk", flat=True)
        opinion_ids_to_update = set()
        for matched_opinion in grouped_matches:
            if matched_opinion.pk not in all_cited_opinions:
                opinion_ids_to_update.add(matched_opinion.pk)

        # Then, increment the citation_count fields for those matched clusters
        # all at once. Trigger a single Solr update as well, if required.
        opinion_clusters_to_update = OpinionCluster.objects.filter(
            sub_opinions__pk__in=opinion_ids_to_update)
        opinion_clusters_to_update.update(
            citation_count=F("citation_count") + 1
        )
        if index:
            add_items_to_solr.delay(
                opinion_clusters_to_update.values_list("pk", flat=True),
                "search.OpinionCluster",
            )

        # Generate the citing opinion's new HTML (with inline citation links)
        opinion.html_with_citations = create_cited_html(opinion, citations)

        # Nuke existing citations
        OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

        # Create the new ones.
        OpinionsCited.objects.bulk_create([
            OpinionsCited(
                citing_opinion_id=opinion.pk,
                cited_opinion_id=matched_opinion.pk,
                depth=grouped_matches[matched_opinion],
            ) for matched_opinion in grouped_matches
        ])

        # Save all the changes to the citing opinion
        opinion.save()

    # If a Solr update was requested, do a single one at the end with all the
    # pks of the passed opinions
    if index:
        add_items_to_solr.delay(opinion_pks, "search.Opinion")
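
The Counter step above collapses repeated citation matches into per-opinion depth counts. A small, self-contained illustration with string stand-ins for the matched Opinion objects:

from collections import Counter

# Stand-ins for the Opinion objects returned by get_citation_matches();
# repeats mean the citing opinion cited that opinion more than once.
citation_matches = ["op_12", "op_7", "op_12", "op_12", "op_3"]

grouped_matches = Counter(citation_matches)
# Counter({'op_12': 3, 'op_7': 1, 'op_3': 1})
for cited_opinion, depth in grouped_matches.items():
    print(cited_opinion, depth)  # depth becomes OpinionsCited.depth above
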
Example #3
    def save_model(self, request, obj, form, change):
        obj.save()
        from cl.search.tasks import add_items_to_solr

        add_items_to_solr.delay([obj.citing_opinion_id], "search.Opinion")
Example #4
    def save_model(self, request, obj, form, change):
        obj.save()
        from cl.search.tasks import add_items_to_solr

        add_items_to_solr.delay([obj.pk], "search.OpinionCluster")
Example #5
    def save_model(self, request, obj, form, change):
        obj.save()
        from cl.search.tasks import add_items_to_solr

        add_items_to_solr.delay([obj.person_id], "people_db.Person")
Example #6
    def delete_model(self, request, obj):
        obj.delete()
        from cl.search.tasks import add_items_to_solr

        add_items_to_solr.delay([obj.person_id], "people_db.Person")
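
Examples #3 through #6 are all the same pattern: a Django ModelAdmin hook that re-indexes the affected record after an admin edit or deletion. A hedged sketch of an admin class with both hooks in one place; the Position model and its import path are assumptions, and the hook bodies simply mirror the snippets above:

from django.contrib import admin

from cl.people_db.models import Position  # assumed model and location


@admin.register(Position)
class PositionAdmin(admin.ModelAdmin):
    def save_model(self, request, obj, form, change):
        obj.save()
        # Imported lazily, as in the examples above, presumably to avoid a
        # circular import between the admin and task modules.
        from cl.search.tasks import add_items_to_solr

        add_items_to_solr.delay([obj.person_id], "people_db.Person")

    def delete_model(self, request, obj):
        obj.delete()
        from cl.search.tasks import add_items_to_solr

        add_items_to_solr.delay([obj.person_id], "people_db.Person")
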
Example #7
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse the downloaded CaseLaw corpus from the Internet Archive and add
    the cases to our database.

    Optionally uses a reporter abbreviation, as used by IA, to identify
    which cases to process (e.g. T.C. => tc).

    Optionally uses a volume integer.

    If neither is provided, the code will cycle through all downloaded files.

    :param reporter: Slugified reporter abbreviation (optional), e.g. "tc" for T.C.
    :param volume: The volume (int) of the reporter (optional), e.g. 10.
    :param make_searchable: Whether to add the items to Solr.
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # Clean the author tag for processing; this is particularly
                # useful for identifying Per Curiam opinions.
                author_elem = op.find("author")
                if author_elem is not None:
                    # Strip page-number tags nested inside the author tag.
                    for page_number in author_elem.find_all("page-number"):
                        page_number.extract()

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
Example #8
def find_citations_for_opinion_by_pks(
    self,
    opinion_pks: List[int],
    index: bool = True,
) -> None:
    """Find citations for search.Opinion objects.

    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    opinions: List[Opinion] = Opinion.objects.filter(pk__in=opinion_pks)
    for opinion in opinions:
        # Memoize parsed versions of the opinion's text
        get_and_clean_opinion_text(opinion)

        # Extract the citations from the opinion's text
        citations: List[CitationBase] = get_citations(opinion.cleaned_text)

        # If no citations are found, continue
        if not citations:
            continue

        # Resolve all those different citation objects to Opinion objects,
        # using a variety of heuristics.
        try:
            citation_resolutions: Dict[
                MatchedResourceType,
                List[SupportedCitationType]] = do_resolve_citations(
                    citations, opinion)
        except ResponseNotReady as e:
            # Threading problem in httplib, which is used in the Solr query.
            raise self.retry(exc=e, countdown=2)

        # Generate the citing opinion's new HTML with inline citation links
        opinion.html_with_citations = create_cited_html(
            opinion, citation_resolutions)

        # Delete the unmatched citations
        citation_resolutions.pop(NO_MATCH_RESOURCE, None)

        # Increase the citation count for the cluster of each matched opinion
        # if that cluster has not already been cited by this opinion. First,
        # calculate a list of the IDs of every opinion whose cluster will need
        # updating.
        all_cited_opinions = opinion.opinions_cited.all().values_list(
            "pk", flat=True)
        opinion_ids_to_update = set()
        for _opinion in citation_resolutions.keys():
            if _opinion.pk not in all_cited_opinions:
                opinion_ids_to_update.add(_opinion.pk)

        # Finally, commit these changes to the database in a single
        # transaction block. Trigger a single Solr update as well, if
        # required.
        with transaction.atomic():
            opinion_clusters_to_update = OpinionCluster.objects.filter(
                sub_opinions__pk__in=opinion_ids_to_update)
            opinion_clusters_to_update.update(
                citation_count=F("citation_count") + 1)
            if index:
                add_items_to_solr.delay(
                    opinion_clusters_to_update.values_list("pk", flat=True),
                    "search.OpinionCluster",
                )

            # Nuke existing citations
            OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

            # Create the new ones.
            OpinionsCited.objects.bulk_create([
                OpinionsCited(
                    citing_opinion_id=opinion.pk,
                    cited_opinion_id=_opinion.pk,
                    depth=len(_citations),
                ) for _opinion, _citations in citation_resolutions.items()
            ])

            # Save all the changes to the citing opinion (send to solr later)
            opinion.save(index=False)

    # If a Solr update was requested, do a single one at the end with all the
    # pks of the passed opinions
    if index:
        add_items_to_solr.delay(opinion_pks, "search.Opinion")
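
Both citation tasks bump citation_count with an F() expression rather than in Python. A short illustration of why, assuming OpinionCluster is imported as in the tasks above:

from django.db.models import F

# Read-modify-write in Python: two workers can read the same value
# concurrently and one of the increments is silently lost.
cluster = OpinionCluster.objects.get(pk=1)
cluster.citation_count += 1
cluster.save()

# What the tasks above do instead: the arithmetic is compiled into the
# UPDATE statement, so the database applies each increment atomically.
OpinionCluster.objects.filter(pk=1).update(
    citation_count=F("citation_count") + 1
)
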
Example #9
    def fix_fjc_positions(self, infile=None):
        """
        Addresses issue #624.

        We had some errant regexes in the district court assignments. This code
        reassigns the court fields for these judges where the new regexes
        differ from the old ones.

        :param infile: The import file with fjc-data.xlsx
        :return: None
        """

        if infile is None:
            self.ensure_input_file()
            infile = self.options["input_file"]
        textfields = [
            "firstname",
            "midname",
            "lastname",
            "gender",
            "Place of Birth (City)",
            "Place of Birth (State)",
            "Place of Death (City)",
            "Place of Death (State)",
        ]
        df = pd.read_excel(infile, 0)
        for x in textfields:
            df[x] = df[x].replace(np.nan, "", regex=True)
        df["Employment text field"].replace(
            to_replace=r";\sno", value=r", no", inplace=True, regex=True
        )
        for i, item in df.iterrows():
            fjc_id = item["Judge Identification Number"]
            p = Person.objects.get(fjc_id=fjc_id)
            logger.info(
                "Doing person with FJC ID: %s, "
                "https://courtlistener.com%s" % (fjc_id, p.get_absolute_url())
            )

            exclusions = []
            for posnum in range(1, 7):
                if posnum > 1:
                    pos_str = " (%s)" % posnum
                else:
                    pos_str = ""

                if pd.isnull(item["Court Name" + pos_str]):
                    continue
                courtid = match_court_string(
                    item["Court Name" + pos_str], federal_district=True
                )
                if courtid is None:
                    raise Exception(
                        "Unable to match court string: %s"
                        % item["Court Name" + pos_str]
                    )
                date_termination = process_date_string(
                    item["Date of Termination" + pos_str]
                )
                date_start = process_date_string(
                    item["Commission Date" + pos_str]
                )
                date_recess_appointment = process_date_string(
                    item["Recess Appointment date" + pos_str]
                )
                if pd.isnull(date_start) and not pd.isnull(
                    date_recess_appointment
                ):
                    date_start = date_recess_appointment
                if pd.isnull(date_start):
                    # if still no start date, skip
                    continue
                positions = Position.objects.filter(
                    person=p,
                    date_start=date_start,
                    date_termination=date_termination,
                    position_type="jud",
                ).exclude(pk__in=exclusions)
                position_count = positions.count()
                if position_count < 1:
                    logger.info(
                        "Couldn't find position to match '%s' on '%s' "
                        "with exclusions: %s" % (p, date_start, exclusions)
                    )
                    add_positions_from_row(
                        item, p, self.debug, fix_nums=[posnum]
                    )
                    if not self.debug:
                        add_items_to_solr.delay([p.pk], "people_db.Person")
                    continue
                elif position_count == 1:
                    # Good case. Press on!
                    position = positions[0]
                    exclusions.append(position.pk)
                elif position_count > 1:
                    logger.info(
                        "Got too many results for '%s' on '%s'. Got %s"
                        % (p, date_start, position_count)
                    )
                    continue

                if position.court.pk == courtid:
                    logger.info(
                        "Court IDs are both '%s'. No changes made." % courtid
                    )
                else:
                    logger.info(
                        "Court IDs are different! Old: %s, New: %s"
                        % (position.court.pk, courtid)
                    )
                    court = Court.objects.get(pk=courtid)
                    position.court = court

                    if not self.debug:
                        position.save()
                        add_items_to_solr.delay([p.pk], "people_db.Person")
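
The exclusions list above implements a simple one-to-one matching heuristic: each spreadsheet row may claim at most one database Position, so primary keys claimed by earlier rows are excluded from later lookups. A stripped-down sketch of the idea; spreadsheet_rows, person, and the row fields are illustrative names, not the FJC spreadsheet's real columns:

exclusions = []
for row in spreadsheet_rows:  # hypothetical iterable of position rows
    candidates = Position.objects.filter(
        person=person,
        date_start=row.date_start,
    ).exclude(pk__in=exclusions)
    if candidates.count() == 1:
        position = candidates[0]
        exclusions.append(position.pk)  # claimed; later rows can't match it
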
Example #11
    def delete_model(self, request, obj):
        obj.delete()
        from cl.search.tasks import add_items_to_solr
        add_items_to_solr.delay([obj.person_id], 'people_db.Person')
Example #12
    def save_model(self, request, obj, form, change):
        obj.save()
        from cl.search.tasks import add_items_to_solr
        add_items_to_solr.delay([obj.person_id], 'people_db.Person')
Example #13
    def save_model(self, request, obj, form, change):
        obj.save()
        from cl.search.tasks import add_items_to_solr
        add_items_to_solr.delay([obj.pk], 'search.Opinion')
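
All of these snippets queue the indexing with .delay() so the admin request or import loop returns immediately. In a test or a one-off shell session, the same Celery task object can also be called directly, which runs it eagerly in the current process; opinion here is assumed to be a saved search.Opinion instance:

from cl.search.tasks import add_items_to_solr

# Queue the indexing on a worker (what the snippets above do):
add_items_to_solr.delay([opinion.pk], "search.Opinion")

# Run the same task synchronously in-process, with no broker round-trip:
add_items_to_solr([opinion.pk], "search.Opinion")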