Example No. 1
0
    def test_get_appellate_court_object_from_string(self) -> None:
        """Can we get the correct federal appellate courts?"""

        test_cases = (
            ("U. S. Court of Appeals for the Ninth Circuit", "ca9"),
            # FJC data does not appear to have a space between U. and S.
            ("U.S. Court of Appeals for the Ninth Circuit", "ca9"),
            ("U. S. Circuit Court for the Ninth Circuit", "ca9"),
            ("U.S. Circuit Court for the Ninth Circuit", "ca9"),
        )
        for query, expected in test_cases:
            print(f"Testing: {query}, expecting: {expected}")
            self.assertEqual(
                expected, match_court_string(query, federal_appeals=True)
            )
Example No. 2
0
    def test_get_appellate_court_object_from_string(self):
        """Can we get the correct federal appellate courts?"""

        test_cases = (
            ('U. S. Court of Appeals for the Ninth Circuit', 'ca9'),
            # FJC data does not appear to have a space between U. and S.
            ('U.S. Court of Appeals for the Ninth Circuit', 'ca9'),
            ('U. S. Circuit Court for the Ninth Circuit', 'ca9'),
            ('U.S. Circuit Court for the Ninth Circuit', 'ca9'),
        )
        for query, expected in test_cases:
            print("Testing: %s, expecting: %s" % (query, expected))
            self.assertEqual(
                expected, match_court_string(query, federal_appeals=True)
            )
Example No. 3
0
def update_bankruptcy_and_magistrate(testing=False):
    """Assign court IDs and position types to bankruptcy and magistrate
    judge positions.

    Bankruptcy positions are matched against bankruptcy courts; magistrate
    positions against federal district courts.

    :param testing: When True, compute everything but skip saving.
    :return: None
    """
    # Update bankruptcy positions.
    # NOTE(review): original read ``Position.object`` (missing "s"), which
    # raises AttributeError on Django models; fixed to the default manager.
    positions = Position.objects.filter(job_title__icontains='Bankruptcy')
    for position in positions:
        location = position.location
        bcourt = match_court_string(location, bankruptcy=True)
        if bcourt is None:
            # No matching bankruptcy court; leave this position untouched.
            continue
        position.court_id = bcourt
        position.position_type = 'jud'
        if not testing:
            position.save()

    # Update magistrate positions. This section was erroneously nested
    # inside the bankruptcy loop (re-running once per bankruptcy position
    # and clobbering its loop variables); it now runs exactly once.
    positions = Position.objects.filter(job_title__icontains='Magistrate')
    for position in positions:
        location = position.location
        mcourt = match_court_string(location, federal_district=True)
        position.court_id = mcourt
        position.position_type = 'm-jud'
        if not testing:
            position.save()
def update_bankruptcy_and_magistrate(testing=False):
    """Assign court IDs and position types to bankruptcy and magistrate
    judge positions.

    Bankruptcy positions are matched against bankruptcy courts; magistrate
    positions against federal district courts.

    :param testing: When True, compute everything but skip saving.
    :return: None
    """
    # Update bankruptcy positions.
    # NOTE(review): original read ``Position.object`` (missing "s"), which
    # raises AttributeError on Django models; fixed to the default manager.
    positions = Position.objects.filter(job_title__icontains='Bankruptcy')
    for position in positions:
        location = position.location
        bcourt = match_court_string(location, bankruptcy=True)
        if bcourt is None:
            # No matching bankruptcy court; leave this position untouched.
            continue
        position.court_id = bcourt
        position.position_type = 'jud'
        if not testing:
            position.save()

    # Update magistrate positions. This section was erroneously nested
    # inside the bankruptcy loop (re-running once per bankruptcy position
    # and clobbering its loop variables); it now runs exactly once.
    positions = Position.objects.filter(job_title__icontains='Magistrate')
    for position in positions:
        location = position.location
        mcourt = match_court_string(location, federal_district=True)
        position.court_id = mcourt
        position.position_type = 'm-jud'
        if not testing:
            position.save()
Example No. 5
0
    def test_get_fed_court_object_from_string(self):
        """Can we get the correct federal courts?"""

        test_cases = (
            ('Eastern District of New York', 'nyed'),
            ('Northern District of New York', 'nynd'),
            ('Southern District of New York', 'nysd'),
            # When we have unknown first word, we assume it's errant.
            ('Nathan District of New York', 'nyd'),
            ('Nate District of New York', 'nyd'),
            ('Middle District of Pennsylvania', 'pamd'),
            ('Middle Dist. of Pennsylvania', 'pamd'),
            ('M.D. of Pennsylvania', 'pamd'),
        )
        for query, expected in test_cases:
            print("Testing: %s, expecting: %s" % (query, expected))
            got = match_court_string(query, federal_district=True)
            self.assertEqual(expected, got)
Example No. 6
0
    def test_get_fed_court_object_from_string(self):
        """Can we get the correct federal courts?"""

        test_cases = (
            ("Eastern District of New York", "nyed"),
            ("Northern District of New York", "nynd"),
            ("Southern District of New York", "nysd"),
            # When we have unknown first word, we assume it's errant.
            ("Nathan District of New York", "nyd"),
            ("Nate District of New York", "nyd"),
            ("Middle District of Pennsylvania", "pamd"),
            ("Middle Dist. of Pennsylvania", "pamd"),
            ("M.D. of Pennsylvania", "pamd"),
        )
        for query, expected in test_cases:
            print("Testing: %s, expecting: %s" % (query, expected))
            got = match_court_string(query, federal_district=True)
            self.assertEqual(expected, got)
Example No. 7
0
    def test_get_fed_court_object_from_string(self):
        """Can we get the correct federal courts?"""

        test_cases = (
            ('Eastern District of New York', 'nyed'),
            ('Northern District of New York', 'nynd'),
            ('Southern District of New York', 'nysd'),
            # When we have unknown first word, we assume it's errant.
            ('Nathan District of New York', 'nyd'),
            ('Nate District of New York', 'nyd'),
            ('Middle District of Pennsylvania', 'pamd'),
            ('Middle Dist. of Pennsylvania', 'pamd'),
            ('M.D. of Pennsylvania', 'pamd'),
        )
        for query, expected in test_cases:
            print("Testing: %s, expecting: %s" % (query, expected))
            got = match_court_string(query, federal_district=True)
            self.assertEqual(expected, got)
def add_positions_from_row(item, person, testing, fix_nums=None):
    """Create judicial Position rows (plus political affiliation, ABA
    rating, and Source records) for one FJC spreadsheet row.

    A row describes up to six positions, with columns suffixed " (1)"
    through " (6)".

    :param item: A dict-like row (e.g. pandas Series) of FJC judge data.
    :param person: The Person these positions belong to.
    :param testing: When True, build objects but never save them.
    :param fix_nums: Optional collection of position numbers to persist;
        when given, other positions are processed but not saved.
    :return: None
    :raises Exception: When a court name cannot be matched to a court ID.
    """
    # add position items (up to 6 of them)
    prev_politics = None
    for posnum in range(1, 7):
        # Save the position if we're running all positions or specifically
        # fixing this one.
        save_this_position = fix_nums is None or posnum in fix_nums
        pos_str = " (%s)" % posnum

        if pd.isnull(item["Court Name" + pos_str]):
            continue

        # Match the court name to a court ID. Initialized to None so that
        # a name matching neither pattern triggers the explicit Exception
        # below instead of a NameError (courtid was previously unbound on
        # that path).
        courtid = None
        if re.search("appeal", item["Court Name" + pos_str], re.I):
            courtid = match_court_string(item["Court Name" + pos_str],
                                         federal_appeals=True)
        elif re.search("district|trade", item["Court Name" + pos_str], re.I):
            courtid = match_court_string(item["Court Name" + pos_str],
                                         federal_district=True)

        if courtid is None:
            raise Exception(
                "Unable to match court string: %s"
                % item["Court Name" + pos_str]
            )

        # Parse the various milestone dates for the nomination process.
        date_nominated = process_date_string(item["Nomination Date" + pos_str])
        date_recess_appointment = process_date_string(
            item["Recess Appointment Date" + pos_str])
        date_referred_to_judicial_committee = process_date_string(
            item["Committee Referral Date" + pos_str])
        date_judicial_committee_action = process_date_string(
            item["Committee Action Date" + pos_str])
        date_hearing = process_date_string(item["Hearing Date" + pos_str])
        date_confirmation = process_date_string(item["Confirmation Date" +
                                                     pos_str])

        # Assign start date: commission date, falling back to a recess
        # appointment date when the commission date is missing.
        date_start = process_date_string(item["Commission Date" + pos_str])
        if pd.isnull(date_start) and not pd.isnull(date_recess_appointment):
            date_start = date_recess_appointment
        if pd.isnull(date_start):
            # Still no start date; record None. NOTE(review): related
            # importers skip the row here instead — confirm intended.
            date_start = None
        date_termination = process_date_string(item["Termination Date" +
                                                    pos_str])
        termination = item["Termination" + pos_str]

        if date_termination is None:
            date_granularity_termination = ""
        else:
            date_granularity_termination = GRANULARITY_DAY

        # Skip positions that already exist with identical key fields.
        dupe_search = Position.objects.filter(
            person=person,
            position_type="jud",
            date_start=date_start,
            date_termination=date_termination,
            termination_reason=termination,
            court_id=courtid,
        )
        if len(dupe_search) > 0:
            print("Duplicate position:", dupe_search)
            continue

        # Assign appointing president, preferring a reappointment entry.
        if not pd.isnull(item["Reappointing President" + pos_str]):
            appointstr = item["Reappointing President" + pos_str]
        else:
            appointstr = item["Appointing President" + pos_str]
        appointer = None
        if appointstr not in ["Assignment", "Reassignment"]:
            names = appointstr.split()

            if len(names) == 3:
                first, mid, last = names
            else:
                first, last = names[0], names[-1]
                mid = ""
            # Progressively narrow the search until exactly one
            # presidential position matches the appointer's name and dates.
            appoint_search = Position.objects.filter(
                person__name_first__iexact=first,
                person__name_last__iexact=last,
            )
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type="pres",
                )
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type="pres",
                    date_start__lte=date_nominated,
                    date_termination__gte=date_nominated,
                )
            if len(appoint_search) == 0:
                print(names, appoint_search)
            if len(appoint_search) > 1:
                print(names, appoint_search)
            if len(appoint_search) == 1:
                appointer = appoint_search[0]

        # Senate votes data.
        # NOTE(review): votes_yes/votes_no stay as strings from split("/")
        # — confirm the model fields coerce them.
        votes = item["Ayes/Nays" + pos_str]
        if not pd.isnull(votes):
            votes_yes, votes_no = votes.split("/")
        else:
            votes_yes = None
            votes_no = None
        voice_vote = item["Senate Vote Type" + pos_str] == "Yes"

        # Map FJC termination descriptions to our internal codes.
        termdict = {
            "Abolition of Court": "abolished",
            "Death": "ded",
            "Reassignment": "other_pos",
            "Appointment to Another Judicial Position": "other_pos",
            "Impeachment & Conviction": "bad_judge",
            "Recess Appointment-Not Confirmed": "recess_not_confirmed",
            "Resignation": "resign",
            "Retirement": "retire_vol",
        }
        term_reason = item["Termination" + pos_str]
        if pd.isnull(term_reason):
            term_reason = ""
        else:
            term_reason = termdict[term_reason]

        position = Position(
            person=person,
            court_id=courtid,
            position_type="jud",
            date_nominated=date_nominated,
            date_recess_appointment=date_recess_appointment,
            date_referred_to_judicial_committee=
            date_referred_to_judicial_committee,
            date_judicial_committee_action=date_judicial_committee_action,
            date_hearing=date_hearing,
            date_confirmation=date_confirmation,
            date_start=date_start,
            date_granularity_start=GRANULARITY_DAY,
            date_termination=date_termination,
            date_granularity_termination=date_granularity_termination,
            appointer=appointer,
            voice_vote=voice_vote,
            votes_yes=votes_yes,
            votes_no=votes_no,
            vote_type="s",
            how_selected="a_pres",
            termination_reason=term_reason,
        )

        if not testing and save_this_position:
            position.save()

        # Record political affiliation of the appointing president.
        p = item["Party of Appointing President" + pos_str]
        if not pd.isnull(p) and p not in ["Assignment", "Reassignment"]:
            party = get_party(item["Party of Appointing President" + pos_str])
            if prev_politics is None:
                # First affiliation seen for this person.
                if pd.isnull(date_nominated):
                    politicsgran = ""
                else:
                    politicsgran = GRANULARITY_DAY
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=politicsgran,
                    source="a",
                )
                if not testing and save_this_position:
                    politics.save()
                prev_politics = party
            elif party != prev_politics:
                # Account for changing political affiliation: close out the
                # previous record and open a new one.
                # NOTE(review): prev_politics is not updated here, so a
                # third distinct party would still compare against the
                # first one — confirm intended.
                politics.date_end = date_nominated
                politics.date_granularity_end = GRANULARITY_DAY
                if not testing and save_this_position:
                    politics.save()
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=GRANULARITY_DAY,
                    source="a",
                )
                if not testing and save_this_position:
                    politics.save()
        rating = get_aba(item["ABA Rating" + pos_str])
        if rating is not None:
            # NOTE(review): this assumes date_nominated is present whenever
            # an ABA rating exists — .year would fail on None; confirm.
            nom_year = date_nominated.year
            aba = ABARating(person=person, rating=rating, year_rated=nom_year)
            if not testing and save_this_position:
                aba.save()

        # Add URL and date accessed.
        sources = Source(
            person=person,
            url="https://www.fjc.gov/sites/default/files/history/judges.csv",
            date_accessed=str(date.today()),
        )
        if not testing:
            sources.save()
Example No. 9
0
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    # A volume without a reporter is ambiguous; bail out early.
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        # Rebuild the archive.org URL from the tail of the local path.
        # NOTE(review): split("/", 9)[-1] assumes the IA identifier begins
        # after the ninth slash of file_path — confirm against filepath_list.
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        # Skip cases already imported from this exact source file.
        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            # Empty or malformed JSON; log and move to the next file.
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            # Best-effort import: record anything unexpected and continue.
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        # Only the first listed citation is used to key the case.
        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        # Strip the "Docket No(s)." prefix so only the number remains.
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        # Metadata fields extracted from the casebody XML: short ones are
        # stored as-is, long ones get fuller processing downstream.
        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        # Create docket, cluster, citation, and opinions atomically so a
        # failure partway through leaves nothing half-imported.
        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                # Inner atomic block isolates the save so the outer
                # transaction survives a recoverable failure here.
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                # Docket numbers that overflow the DB column get truncated;
                # the full value is preserved in the correction field.
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            # Don't index in solr yet; that happens after the transaction.
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            # Collect opinion PKs so they can be indexed after commit.
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifiying Per Curiam
                for elem in [op.find("author")]:
                    if elem is not None:
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = True if author_tag_str == "Per Curiam" else False
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        # Queue solr indexing only after the transaction has committed.
        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
Example No. 10
0
    def fix_fjc_positions(self, infile=None):
        """
        Addresses issue #624.

        We had some errant regexes in the district court assignments. This code
        reassigns the court fields for these judges where the new regexes
        differs from the old ones.

        :param infile: The import file with fjc-data.xslx
        :return: None
        """

        if infile is None:
            self.ensure_input_file()
            infile = self.options["input_file"]
        # Text columns that must be empty strings, not NaN, downstream.
        textfields = [
            "firstname",
            "midname",
            "lastname",
            "gender",
            "Place of Birth (City)",
            "Place of Birth (State)",
            "Place of Death (City)",
            "Place of Death (State)",
        ]
        df = pd.read_excel(infile, 0)
        for x in textfields:
            df[x] = df[x].replace(np.nan, "", regex=True)
        # Normalize "; no" to ", no" in the employment text field.
        df["Employment text field"].replace(
            to_replace=r";\sno", value=r", no", inplace=True, regex=True
        )
        for i, item in df.iterrows():
            fjc_id = item["Judge Identification Number"]
            p = Person.objects.get(fjc_id=fjc_id)
            logger.info(
                "Doing person with FJC ID: %s, "
                "https://courtlistener.com%s" % (fjc_id, p.get_absolute_url())
            )

            # Track positions already matched so each spreadsheet position
            # pairs with a distinct DB record.
            exclusions = []
            for posnum in range(1, 7):
                # Columns for positions beyond the first carry a " (N)"
                # suffix; position 1 has none.
                if posnum > 1:
                    pos_str = " (%s)" % posnum
                else:
                    pos_str = ""

                if pd.isnull(item["Court Name" + pos_str]):
                    continue
                courtid = match_court_string(
                    item["Court Name" + pos_str], federal_district=True
                )
                if courtid is None:
                    raise Exception
                date_termination = process_date_string(
                    item["Date of Termination" + pos_str]
                )
                date_start = process_date_string(
                    item["Commission Date" + pos_str]
                )
                date_recess_appointment = process_date_string(
                    item["Recess Appointment date" + pos_str]
                )
                # Fall back to the recess appointment date when there is
                # no commission date.
                if pd.isnull(date_start) and not pd.isnull(
                    date_recess_appointment
                ):
                    date_start = date_recess_appointment
                if pd.isnull(date_start):
                    # if still no start date, skip
                    continue
                # Find the existing DB position matching this row's dates,
                # excluding ones already claimed by earlier rows.
                positions = Position.objects.filter(
                    person=p,
                    date_start=date_start,
                    date_termination=date_termination,
                    position_type="jud",
                ).exclude(pk__in=exclusions)
                position_count = positions.count()
                if position_count < 1:
                    # Missing from the DB entirely: (re)create it from the
                    # row and re-index the person in solr.
                    logger.info(
                        "Couldn't find position to match '%s' on '%s' "
                        "with exclusions: %s" % (p, date_start, exclusions)
                    )
                    add_positions_from_row(
                        item, p, self.debug, fix_nums=[posnum]
                    )
                    if not self.debug:
                        add_items_to_solr.delay([p.pk], "people_db.Person")
                    continue
                elif position_count == 1:
                    # Good case. Press on!
                    position = positions[0]
                    exclusions.append(position.pk)
                elif position_count > 1:
                    # Ambiguous match; don't guess which one to fix.
                    logger.info(
                        "Got too many results for '%s' on '%s'. Got %s"
                        % (p, date_start, position_count)
                    )
                    continue

                # Reassign the court only when the new regex match differs
                # from what is stored.
                if position.court.pk == courtid:
                    logger.info(
                        "Court IDs are both '%s'. No changes made." % courtid
                    )
                else:
                    logger.info(
                        "Court IDs are different! Old: %s, New: %s"
                        % (position.court.pk, courtid)
                    )
                    court = Court.objects.get(pk=courtid)
                    position.court = court

                    if not self.debug:
                        position.save()
                        add_items_to_solr.delay([p.pk], "people_db.Person")
Example No. 11
0
def add_positions_from_row(item, person, testing, fix_nums=None):
    """Create judicial Position records (and related data) for one FJC row.

    The FJC spreadsheet stores up to six positions per judge in repeated
    column groups ('Court Name', 'Court Name (2)', ...), so position
    numbers 1 through 6 are processed in turn.

    :param item: A row (dict-like / pandas Series) from the FJC spreadsheet.
    :param person: The Person to attach the positions to.
    :param testing: If True, build objects but never save them.
    :param fix_nums: Optional list of position numbers to save; when given,
        positions with other numbers are parsed but not persisted.
    :return: None
    """
    prev_politics = None
    for posnum in range(1, 7):
        # Save the position if we're running all positions or specifically
        # fixing this one.
        save_this_position = fix_nums is None or posnum in fix_nums
        # Column names for positions beyond the first carry a ' (n)' suffix.
        if posnum > 1:
            pos_str = ' (%s)' % posnum
        else:
            pos_str = ''

        if pd.isnull(item['Court Name' + pos_str]):
            continue
        courtid = match_court_string(item['Court Name' + pos_str],
                                     federal_district=True)
        if courtid is None:
            # Include the unmatched string so failures are debuggable
            # (previously this was a bare ``raise Exception``).
            raise Exception("Unable to match court string: %s" %
                            item['Court Name' + pos_str])

        date_nominated = process_date_string(
            item['Nomination Date Senate Executive Journal' + pos_str])
        date_recess_appointment = process_date_string(
            item['Recess Appointment date' + pos_str])
        date_referred_to_judicial_committee = process_date_string(
            item['Referral date (referral to Judicial Committee)' + pos_str])
        date_judicial_committee_action = process_date_string(
            item['Committee action date' + pos_str])
        date_hearing = process_date_string(item['Hearings' + pos_str])
        date_confirmation = process_date_string(
            item['Senate Vote Date (Confirmation Date)' + pos_str])

        # Assign the start date, falling back on the recess appointment
        # date when no commission date is available.
        date_start = process_date_string(item['Commission Date' + pos_str])
        if pd.isnull(date_start) and not pd.isnull(date_recess_appointment):
            date_start = date_recess_appointment
        if pd.isnull(date_start):
            # Still no start date; skip this position.
            continue
        date_termination = process_date_string(
            item['Date of Termination' + pos_str])
        date_retirement = process_date_string(
            item['Retirement from Active Service' + pos_str])

        if date_termination is None:
            date_granularity_termination = ''
        else:
            date_granularity_termination = GRANULARITY_DAY

        # Skip positions we already have (same person/court/dates).
        dupe_search = Position.objects.filter(
            person=person,
            position_type='jud',
            date_start=date_start,
            date_termination=date_termination,
            court_id=courtid,
        )
        if dupe_search.exists():
            print('Duplicate position:', dupe_search)
            continue

        # Assign the appointing president, preferring the renominating
        # president when one is recorded.
        if not pd.isnull(item['Renominating President name' + pos_str]):
            appointstr = item['Renominating President name' + pos_str]
        else:
            appointstr = item['President name' + pos_str]
        appointer = None
        if appointstr not in ['Assignment', 'Reassignment']:
            names = appointstr.split()

            if len(names) == 3:
                first, mid, last = names
            else:
                first, last = names[0], names[-1]
                mid = ''
            # Progressively narrow the lookup until we (hopefully) land on
            # a single presidential position.
            appoint_search = Position.objects.filter(
                person__name_first__iexact=first,
                person__name_last__iexact=last)
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type='pres',
                )
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type='pres',
                    date_start__lte=date_nominated,
                    date_termination__gte=date_nominated)
            if len(appoint_search) == 0:
                print(names, appoint_search)
            if len(appoint_search) > 1:
                print(names, appoint_search)
            if len(appoint_search) == 1:
                appointer = appoint_search[0]

        # Senate votes data, e.g. "96/3" meaning 96 ayes and 3 nays.
        votes = item['Senate vote Ayes/Nays' + pos_str]
        if not pd.isnull(votes):
            votes_yes, votes_no = votes.split('/')
        else:
            votes_yes = None
            votes_no = None
        # NaN == "Yes" is False, so missing values yield voice_vote=False,
        # matching the prior if/else behavior.
        voice_vote = item['Senate voice vote' + pos_str] == "Yes"

        # Map FJC termination descriptions to termination_reason choices.
        termdict = {
            'Abolition of Court': 'abolished',
            'Death': 'ded',
            'Reassignment': 'other_pos',
            'Appointment to Another Judicial Position': 'other_pos',
            'Impeachment & Conviction': 'bad_judge',
            'Recess Appointment-Not Confirmed': 'recess_not_confirmed',
            'Resignation': 'resign',
            'Retirement': 'retire_vol',
        }
        term_reason = item['Termination specific reason' + pos_str]
        if pd.isnull(term_reason):
            term_reason = ''
        else:
            term_reason = termdict[term_reason]

        position = Position(
            person=person,
            court_id=courtid,
            position_type='jud',
            date_nominated=date_nominated,
            date_recess_appointment=date_recess_appointment,
            date_referred_to_judicial_committee=(
                date_referred_to_judicial_committee),
            date_judicial_committee_action=date_judicial_committee_action,
            date_hearing=date_hearing,
            date_confirmation=date_confirmation,
            date_start=date_start,
            date_granularity_start=GRANULARITY_DAY,
            date_termination=date_termination,
            date_granularity_termination=date_granularity_termination,
            date_retirement=date_retirement,
            appointer=appointer,
            voice_vote=voice_vote,
            votes_yes=votes_yes,
            votes_no=votes_no,
            vote_type='s',
            how_selected='a_pres',
            termination_reason=term_reason)

        if not testing and save_this_position:
            position.save()

        # Record the appointing president's party as a PoliticalAffiliation,
        # tracking party changes across this judge's successive positions.
        p = item['Party Affiliation of President' + pos_str]
        if not pd.isnull(p) and p not in ['Assignment', 'Reassignment']:
            party = get_party(item['Party Affiliation of President' + pos_str])
            if prev_politics is None:
                if pd.isnull(date_nominated):
                    politicsgran = ''
                else:
                    politicsgran = GRANULARITY_DAY
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=politicsgran,
                    source='a',
                )
                if not testing and save_this_position:
                    politics.save()
                prev_politics = party
            elif party != prev_politics:
                # The party changed: close out the prior affiliation and
                # open a new one. ``politics`` is always bound here because
                # ``prev_politics`` is only set after creating it above.
                politics.date_end = date_nominated
                politics.date_granularity_end = GRANULARITY_DAY
                if not testing and save_this_position:
                    politics.save()
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=GRANULARITY_DAY,
                    source='a')
                if not testing and save_this_position:
                    politics.save()

        rating = get_aba(item['ABA Rating' + pos_str])
        if rating is not None:
            if pd.isnull(date_nominated):
                # Fix: previously ``date_nominated.year`` raised
                # AttributeError when the nomination date was missing;
                # skip recording the rating instead of crashing.
                continue
            nom_year = date_nominated.year
            aba = ABARating(person=person, rating=rating, year_rated=nom_year)
            if not testing and save_this_position:
                aba.save()
# Example no. 12
# (scraper artifact removed: stray vote-count line)
def parse_harvard_opinions(reporter, volume):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            list(set(itertools.chain.from_iterable(judge_list + author_list)))
        )
        judges = titlecase(judges)
        # Fix: strip the plural "Docket Nos." BEFORE the singular
        # "Docket No." -- the singular is a substring of the plural, so
        # replacing it first would leave a stray "s." in the docket number.
        docket_string = (
            data["docket_number"]
            .replace("Docket Nos.", "")
            .replace("Docket No.", "")
            .strip()
        )

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            # Collect the other XML fields in the Harvard data set as
            # pipe-joined string lists for further processing later.
            json_fields = [
                "attorneys",
                "disposition",
                "syllabus",
                "summary",
                "history",
                "otherdate",
                "seealso",
                "headnotes",
                "correction",
            ]
            data_set = {
                field: "|".join(x.text for x in soup.find_all(field))
                for field in json_fields
            }

            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            for op in soup.find_all("opinion"):
                joined_by_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(judge_list)))
                    )
                )
                author_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(author_list)))
                    )
                )

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())
def add_positions_from_row(item, person, testing, fix_nums=None):
    """Create judicial Position records (and related data) for one FJC row.

    The FJC spreadsheet stores up to six positions per judge in repeated
    column groups ('Court Name', 'Court Name (2)', ...), so position
    numbers 1 through 6 are processed in turn.

    :param item: A row (dict-like / pandas Series) from the FJC spreadsheet.
    :param person: The Person to attach the positions to.
    :param testing: If True, build objects but never save them.
    :param fix_nums: Optional list of position numbers to save; when given,
        positions with other numbers are parsed but not persisted.
    :return: None
    """
    # add position items (up to 6 of them)
    prev_politics = None
    for posnum in range(1, 7):
        # Save the position if we're running all positions or specifically
        # fixing this one.
        save_this_position = (fix_nums is None or posnum in fix_nums)
        # Column names for positions beyond the first carry a ' (n)' suffix.
        if posnum > 1:
            pos_str = ' (%s)' % posnum
        else:
            pos_str = ''

        if pd.isnull(item['Court Name' + pos_str]):
            continue
        courtid = match_court_string(item['Court Name' + pos_str],
                                     federal_district=True)
        if courtid is None:
            # NOTE(review): bare Exception with no message -- hard to debug
            # when a court string fails to match.
            raise Exception

        date_nominated = process_date_string(
            item['Nomination Date Senate Executive Journal' + pos_str])
        date_recess_appointment = process_date_string(
            item['Recess Appointment date' + pos_str])
        date_referred_to_judicial_committee = process_date_string(
            item['Referral date (referral to Judicial Committee)' + pos_str])
        date_judicial_committee_action = process_date_string(
            item['Committee action date' + pos_str])
        date_hearing = process_date_string(item['Hearings' + pos_str])
        date_confirmation = process_date_string(
            item['Senate Vote Date (Confirmation Date)' + pos_str])

        # assign start date, falling back on the recess appointment date
        # when no commission date is available
        date_start = process_date_string(item['Commission Date' + pos_str])
        if pd.isnull(date_start) and not pd.isnull(date_recess_appointment):
            date_start = date_recess_appointment
        if pd.isnull(date_start):
            # if still no start date, skip
            continue
        date_termination = process_date_string(
            item['Date of Termination' + pos_str])
        date_retirement = process_date_string(
            item['Retirement from Active Service' + pos_str])

        if date_termination is None:
            date_granularity_termination = ''
        else:
            date_granularity_termination = GRANULARITY_DAY

        # check duplicate position (same person/court/dates already stored)
        dupe_search = Position.objects.filter(
            person=person,
            position_type='jud',
            date_start=date_start,
            date_termination=date_termination,
            court_id=courtid,
        )
        if len(dupe_search) > 0:
            print('Duplicate position:', dupe_search)
            continue

        # assign appointing president, preferring the renominating
        # president when one is recorded
        if not pd.isnull(item['Renominating President name' + pos_str]):
            appointstr = item['Renominating President name' + pos_str]
        else:
            appointstr = item['President name' + pos_str]
        appointer = None
        if appointstr not in ['Assignment', 'Reassignment']:
            names = appointstr.split()

            if len(names) == 3:
                first, mid, last = names
            else:
                first, last = names[0], names[-1]
                mid = ''
            # Progressively narrow the lookup until we (hopefully) land on
            # a single presidential position.
            appoint_search = Position.objects.filter(
                person__name_first__iexact=first,
                person__name_last__iexact=last)
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type='pres',
                )
            if len(appoint_search) > 1:
                appoint_search = Position.objects.filter(
                    person__name_first__iexact=first,
                    person__name_last__iexact=last,
                    person__name_middle__iexact=mid,
                    position_type='pres',
                    date_start__lte=date_nominated,
                    date_termination__gte=date_nominated
                )
            if len(appoint_search) == 0:
                print(names, appoint_search)
            if len(appoint_search) > 1:
                print(names, appoint_search)
            if len(appoint_search) == 1:
                appointer = appoint_search[0]

        # senate votes data, e.g. "96/3" meaning 96 ayes and 3 nays
        votes = item['Senate vote Ayes/Nays' + pos_str]
        if not pd.isnull(votes):
            votes_yes, votes_no = votes.split('/')
        else:
            votes_yes = None
            votes_no = None
        if item['Senate voice vote' + pos_str] == "Yes":
            voice_vote = True
        else:
            voice_vote = False

        # Map FJC termination descriptions to termination_reason choices.
        termdict = {'Abolition of Court': 'abolished',
                    'Death': 'ded',
                    'Reassignment': 'other_pos',
                    'Appointment to Another Judicial Position': 'other_pos',
                    'Impeachment & Conviction': 'bad_judge',
                    'Recess Appointment-Not Confirmed': 'recess_not_confirmed',
                    'Resignation': 'resign',
                    'Retirement': 'retire_vol'
                    }
        term_reason = item['Termination specific reason' + pos_str]
        if pd.isnull(term_reason):
            term_reason = ''
        else:
            term_reason = termdict[term_reason]

        position = Position(
            person=person,
            court_id=courtid,
            position_type='jud',

            date_nominated=date_nominated,
            date_recess_appointment=date_recess_appointment,
            date_referred_to_judicial_committee=date_referred_to_judicial_committee,
            date_judicial_committee_action=date_judicial_committee_action,
            date_hearing=date_hearing,
            date_confirmation=date_confirmation,
            date_start=date_start,
            date_granularity_start=GRANULARITY_DAY,
            date_termination=date_termination,
            date_granularity_termination=date_granularity_termination,
            date_retirement=date_retirement,

            appointer=appointer,

            voice_vote=voice_vote,
            votes_yes=votes_yes,
            votes_no=votes_no,
            vote_type='s',
            how_selected='a_pres',
            termination_reason=term_reason
        )

        if not testing and save_this_position:
            position.save()

        # set party: record the appointing president's party as a
        # PoliticalAffiliation, tracking party changes across positions
        p = item['Party Affiliation of President' + pos_str]
        if not pd.isnull(p) and p not in ['Assignment', 'Reassignment']:
            party = get_party(item['Party Affiliation of President' + pos_str])
            if prev_politics is None:
                if pd.isnull(date_nominated):
                    politicsgran = ''
                else:
                    politicsgran = GRANULARITY_DAY
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=politicsgran,
                    source='a',
                )
                if not testing and save_this_position:
                    politics.save()
                prev_politics = party
            elif party != prev_politics:
                # account for changing political affiliation
                # (``politics`` is always bound here because
                # ``prev_politics`` is only set after creating it above)
                politics.date_end = date_nominated
                politics.date_granularity_end = GRANULARITY_DAY
                if not testing and save_this_position:
                    politics.save()
                politics = PoliticalAffiliation(
                    person=person,
                    political_party=party,
                    date_start=date_nominated,
                    date_granularity_start=GRANULARITY_DAY,
                    source='a'
                )
                if not testing and save_this_position:
                    politics.save()
        rating = get_aba(item['ABA Rating' + pos_str])
        if rating is not None:
            # NOTE(review): if date_nominated is missing here this raises
            # AttributeError on ``.year`` -- confirm FJC data always has a
            # nomination date when an ABA rating is present.
            nom_year = date_nominated.year
            aba = ABARating(
                person=person,
                rating=rating,
                year_rated=nom_year
            )
            if not testing and save_this_position:
                aba.save()
    def fix_fjc_positions(self, infile=None):
        """Reassign Position courts mangled by errant regexes (issue #624).

        Re-runs the (since corrected) court-matching regexes over the FJC
        spreadsheet and, for each judicial position whose stored court
        differs from the newly matched court, updates the position.

        :param infile: The import file with fjc-data.xslx
        :return: None
        """
        if infile is None:
            self.ensure_input_file()
            infile = self.options['input_file']

        # Normalize NaN to '' in the free-text columns before processing.
        text_columns = [
            'firstname',
            'midname',
            'lastname',
            'gender',
            'Place of Birth (City)',
            'Place of Birth (State)',
            'Place of Death (City)',
            'Place of Death (State)',
        ]
        df = pd.read_excel(infile, 0)
        for column in text_columns:
            df[column] = df[column].replace(np.nan, '', regex=True)
        df['Employment text field'].replace(
            to_replace=r';\sno',
            value=r', no',
            inplace=True,
            regex=True,
        )

        for _, row in df.iterrows():
            fjc_id = row['Judge Identification Number']
            person = Person.objects.get(fjc_id=fjc_id)
            logger.info(
                "Doing person with FJC ID: %s, https://courtlistener.com%s"
                % (fjc_id, person.get_absolute_url())
            )

            # PKs already matched for this judge; excluded from later
            # lookups so each spreadsheet position maps to a distinct row.
            seen_pks = []
            for posnum in range(1, 7):
                # Columns beyond the first position carry a ' (n)' suffix.
                pos_str = ' (%s)' % posnum if posnum > 1 else ''

                if pd.isnull(row['Court Name' + pos_str]):
                    continue
                courtid = match_court_string(
                    row['Court Name' + pos_str], federal_district=True
                )
                if courtid is None:
                    raise Exception
                date_termination = process_date_string(
                    row['Date of Termination' + pos_str]
                )
                date_start = process_date_string(
                    row['Commission Date' + pos_str]
                )
                date_recess_appointment = process_date_string(
                    row['Recess Appointment date' + pos_str]
                )
                if pd.isnull(date_start) and not pd.isnull(
                        date_recess_appointment):
                    date_start = date_recess_appointment
                if pd.isnull(date_start):
                    # Still no start date; nothing to match on.
                    continue
                matches = Position.objects.filter(
                    person=person,
                    date_start=date_start,
                    date_termination=date_termination,
                    position_type='jud',
                ).exclude(pk__in=seen_pks)
                match_count = matches.count()
                if match_count < 1:
                    logger.info(
                        "Couldn't find position to match '%s' on '%s' "
                        "with exclusions: %s"
                        % (person, date_start, seen_pks)
                    )
                    add_positions_from_row(
                        row, person, self.debug, fix_nums=[posnum]
                    )
                    if not self.debug:
                        add_items_to_solr.delay(
                            [person.pk], 'people_db.Person'
                        )
                    continue
                elif match_count == 1:
                    # Exactly one hit -- the good case.
                    position = matches[0]
                    seen_pks.append(position.pk)
                else:
                    logger.info(
                        "Got too many results for '%s' on '%s'. Got %s"
                        % (person, date_start, match_count)
                    )
                    continue

                if position.court.pk == courtid:
                    logger.info(
                        "Court IDs are both '%s'. No changes made." % courtid
                    )
                else:
                    logger.info(
                        "Court IDs are different! Old: %s, New: %s"
                        % (position.court.pk, courtid)
                    )
                    position.court = Court.objects.get(pk=courtid)

                    if not self.debug:
                        position.save()
                        add_items_to_solr.delay(
                            [person.pk], 'people_db.Person'
                        )