Example #1
    def test_citation_matching_issue621(self) -> None:
        """Make sure that a citation like 1 Wheat 9 doesn't match 9 Wheat 1"""
        # The fixture contains a reference to 9 F. 1, so we expect no results.
        citation_str = "1 F. 9 (1795)"
        citation = get_citations(citation_str)[0]
        results = match_citation(citation)
        self.assertEqual([], results)
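
A quick sketch of the object this test drives through match_citation, reusing get_citations from the example above (the attribute names follow the other examples in this collection; treat the exact output as an assumption):

# Hypothetical illustration, not part of the test suite.
citation = get_citations("1 F. 9 (1795)")[0]
# The (volume, reporter, page) triple is what match_citation keys on, so
# "1 F. 9" must never be confused with the fixture's "9 F. 1".
print(citation.volume, citation.reporter, citation.page)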
Example #2
def get_document_citations(
    opinion: Opinion,
) -> List[Union[NonopinionCitation, Citation]]:
    """Identify and return citations from the html or plain text of the
    opinion.
    """
    if opinion.html_anon_2020:
        citations = get_citations(
            text=opinion.html_anon_2020,
            clean=(
                "html",
                "whitespace",
            ),
        )
    elif opinion.html_columbia:
        citations = get_citations(
            text=opinion.html_columbia,
            clean=(
                "html",
                "whitespace",
            ),
        )
    elif opinion.html_lawbox:
        citations = get_citations(
            text=opinion.html_lawbox,
            clean=(
                "html",
                "whitespace",
            ),
        )
    elif opinion.html:
        citations = get_citations(
            text=opinion.html,
            clean=(
                "html",
                "whitespace",
            ),
        )
    elif opinion.plain_text:
        citations = get_citations(
            text=opinion.plain_text,
            clean=("whitespace",),
        )
    else:
        citations = []
    return citations
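
The if/elif cascade above just returns citations from the first populated text field, in priority order. A behavior-equivalent sketch that encodes that order as data (the getattr-based access is an assumption about the Opinion model; the type names are as in the signature above):

HTML_FIELDS = ("html_anon_2020", "html_columbia", "html_lawbox", "html")

def get_document_citations_alt(
    opinion: Opinion,
) -> List[Union[NonopinionCitation, Citation]]:
    # Try the HTML sources in priority order, then fall back to plain text.
    for field in HTML_FIELDS:
        text = getattr(opinion, field)
        if text:
            return get_citations(text=text, clean=("html", "whitespace"))
    if opinion.plain_text:
        return get_citations(text=opinion.plain_text, clean=("whitespace",))
    return []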
Example #3
def make_citation(
    cite_str: str,
    cluster: OpinionCluster,
    cite_type: int,
) -> Citation:
    """Create and return a citation object for the input values."""
    citation_obj = get_citations(cite_str)[0]
    return Citation(
        cluster=cluster,
        volume=citation_obj.volume,
        reporter=citation_obj.reporter,
        page=citation_obj.page,
        type=cite_type,
    )
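
A hypothetical call site, assuming a saved OpinionCluster instance and the Citation.FEDERAL type constant used elsewhere in this collection:

# cluster is assumed to exist already; the values are illustrative.
citation = make_citation("22 U.S. 33", cluster, Citation.FEDERAL)
citation.save()  # make_citation only builds the object; saving is the caller's job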
Example #4
def find_cites(case_data: Dict[str, str]) -> List[FoundCitation]:
    """Extract citations from raw string.

    :param case_data: Case information from the anon 2020 db.
    :return: Citation objects found in the raw string.
    """
    found_citations = []
    cites = re.findall(r"\"(.*?)\"", case_data["lexis_ids_normalized"],
                       re.DOTALL)
    for cite in cites:
        fc = get_citations(clean_text(cite, ["html", "inline_whitespace"]))
        if len(fc) > 0:
            found_citations.append(fc[0])
    return found_citations
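
The regex pulls every double-quoted substring out of the raw field before citation extraction. A self-contained illustration with fabricated input:

import re

# Fabricated stand-in for case_data["lexis_ids_normalized"].
raw = '["1 U.S. 1", "2 F. 2d 3"]'
print(re.findall(r"\"(.*?)\"", raw, re.DOTALL))  # ['1 U.S. 1', '2 F. 2d 3']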
Example #5
def find_tax_court_citation(opinion_text):
    """
    Returns a dictionary representation of our
    Citation object.

    Return the citation object or nothing.
    Iterates over lines of text beacuse we assume our citations won't wrap.

    :param opinion_text: The plain_text of our opinion from the scrape.
    :return: citation object or None
    """
    for line_of_text in opinion_text.split("\n")[:250]:
        cites = get_citations(line_of_text)
        if not cites:
            continue

        if "UNITED STATES TAX COURT REPORT" in opinion_text:
            for cite in cites:
                if "UNITED STATES TAX COURT REPORT" in cite.reporter_found:
                    cite.type = Citation.SPECIALTY
                    return cite
        else:
            for cite in cites:
                if ("T.C." not in cite.reporter
                        and "T. C." not in cite.reporter):
                    # If not the first cite - Skip
                    return None

                if cite.reporter_index > 2:
                    # If reporter not in first or second term in the line we skip.
                    return None

                alt_cite = line_of_text.replace(cite.reporter_found,
                                                "").strip()
                other_words = alt_cite.split(" ")

                if len([x for x in other_words if x != ""]) > 3:
                    # If line has more than three non reporter components skip.
                    return None

                if "T.C." == cite.reporter:
                    cite_type = Citation.SPECIALTY
                elif "T.C. No." == cite.reporter:
                    cite_type = Citation.SPECIALTY
                else:
                    cite_type = Citation.NEUTRAL

                cite.type = cite_type
                return cite
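
A sketch of how a scraper might call this, with a fabricated line of opinion text (whether get_citations recognizes this exact string depends on the reporter database, so treat the outcome as an assumption):

# "1 T.C. 170" sits alone near the start of a line, so the reporter-position
# and word-count guards above should pass.
text = "1 T.C. 170\nSOME CASE NAME\nOpinion of the court follows."
cite = find_tax_court_citation(text)
if cite is not None:
    print(cite.reporter, cite.type)  # expect a T.C. specialty citation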
Example #6
    def do_citations(cluster, scdb_info):
        """
        Handle the citation fields.

        :param cluster: The Cluster to be changed.
        :param scdb_info: A dict with the SCDB information.
        """
        fields = {
            "usCite": ("U.S.", Citation.FEDERAL),
            "sctCite": ("S. Ct.", Citation.FEDERAL),
            "ledCite": ("L. Ed.", Citation.FEDERAL),
            "lexisCite": ("U.S. LEXIS", Citation.LEXIS),
        }
        for scdb_field, reporter_info in fields.items():
            if not scdb_info[scdb_field]:
                continue
            try:
                citation_obj = get_citations(
                    scdb_info[scdb_field],
                    do_post_citation=False,
                    do_defendant=False,
                    disambiguate=False,
                )[0]
            except IndexError:
                logger.warning("Unable to parse citation for: %s",
                               scdb_info[scdb_field])
            else:
                cites = cluster.citations.filter(reporter=reporter_info[0])
                if cites.count() == 1:
                    # Update the existing citation.
                    cite = cites[0]
                    cite.volume = citation_obj.volume
                    cite.reporter = citation_obj.reporter
                    cite.page = citation_obj.page
                    cite.save()
                else:
                    try:
                        # Create a new citation
                        Citation.objects.create(
                            cluster=cluster,
                            volume=citation_obj.volume,
                            reporter=citation_obj.reporter,
                            page=citation_obj.page,
                            type=reporter_info[1],
                        )
                    except IntegrityError:
                        # Violated unique_together constraint. Fine.
                        pass
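
The filter/count/update-else-create dance above predates Django's built-in helper. A condensed sketch of the same intent using update_or_create, reusing the names from the function above; note the edge behavior differs (the original only updates when exactly one row matches, while update_or_create raises MultipleObjectsReturned if several do):

Citation.objects.update_or_create(
    cluster=cluster,
    reporter=reporter_info[0],
    defaults={
        "volume": citation_obj.volume,
        "page": citation_obj.page,
        "type": reporter_info[1],
    },
)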
Example #7
    def test_make_html_from_html(self) -> None:
        """Can we convert the HTML of an opinion into modified HTML?"""
        # fmt: off

        test_pairs = [
            # Id. citation with HTML tags
            ('<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> '
             'at 73.</p>\n<p>Nathaniel Gorham of Massachusetts</p></div>',
             '<div><p>the improper views of the Legislature." 2<span class="'
             'citation no-link"> <i><span class="id_token">id.,</span></i> at '
             '73.</span></p>\n<p>Nathaniel Gorham of Massachusetts</p></div>'),

            # Id. citation with an intervening HTML tag
            #  (We expect the HTML to be unchanged, since it's too risky to
            #   modify with another tag in the way)
            ('<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> '
             'at <b>73, bolded</b>.</p>\n<p>Nathaniel Gorham of Massachusetts'
             '</p></div>',
             '<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> '
             'at <b>73, bolded</b>.</p>\n<p>Nathaniel Gorham of Massachusetts'
             '</p></div>'),

            # Ibid. citation with HTML tags
            ('<div><p>possess any peculiar knowledge of the mere policy of '
             'public measures.\" <i>Ibid.</i> Gerry of Massachusetts '
             'like</p></div>',
             '<div><p>possess any peculiar knowledge of the mere policy of '
             'public measures."<span class="citation no-link"> <i><span class='
             '"id_token">Ibid.</span></i> Gerry of Massachusetts </span>like'
             '</p></div>'),
        ]

        # fmt: on
        for s, expected_html in test_pairs:
            print("Testing html to html conversion for %s..." % s, end=" ")
            opinion = Opinion(html=s)
            citations = get_citations(s, clean=("html", "whitespace"))
            created_html = create_cited_html(opinion, citations)
            self.assertEqual(
                created_html,
                expected_html,
                msg="\n%s\n\n    !=\n\n%s" % (created_html, expected_html),
            )
            print("✓")
Example #8
    def test_identifying_parallel_citations(self) -> None:
        """Given a string, can we identify parallel citations"""
        tests = (
            # Each entry is a triple: a test string, the number of citation
            # groups expected in it, and the number of parallel citations
            # expected in the first group.
            # Simple case
            ("1 U.S. 1 (22 U.S. 33)", 1, 2),
            # Too far apart
            ("1 U.S. 1 too many words 22 U.S. 33", 0, 0),
            # Three citations
            ("1 U.S. 1, (44 U.S. 33, 99 U.S. 100)", 1, 3),
            # Parallel citation after a valid citation too early on
            ("1 U.S. 1 too many words, then 22 U.S. 33, 13 WL 33223", 1, 2),
        )
        for q, citation_group_count, expected_num_parallel_citations in tests:
            print(
                "Testing parallel citation identification for: %s..." % q,
                end=" ",
            )
            citations = get_citations(q)
            citation_groups = identify_parallel_citations(citations)
            computed_num_citation_groups = len(citation_groups)
            self.assertEqual(
                computed_num_citation_groups,
                citation_group_count,
                msg="Did not have correct number of citation groups. Got %s, "
                    "not %s." %
                    (computed_num_citation_groups, citation_group_count),
            )
            if not citation_groups:
                # Add an empty list to make testing easier.
                citation_groups = [[]]
            computed_num_parallel_citation = len(list(citation_groups)[0])
            self.assertEqual(
                computed_num_parallel_citation,
                expected_num_parallel_citations,
                msg="Did not identify correct number of parallel citations in "
                    "the group. Got %s, not %s" % (
                        computed_num_parallel_citation,
                        expected_num_parallel_citations,
                    ),
            )
            print("✓")
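
A minimal run of the first test case above, reusing get_citations and identify_parallel_citations as imported by the test module:

citations = get_citations("1 U.S. 1 (22 U.S. 33)")
groups = identify_parallel_citations(citations)
print(len(groups))           # 1 group of parallel citations...
print(len(list(groups)[0]))  # ...containing 2 citations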
Example #9
def get_query_citation(cd: Dict[str, Any]) -> Optional[List[Citation]]:
    """Extract citations from the query string and return them, or return
    None
    """
    if not cd.get("q"):
        return None
    citations = get_citations(cd["q"],
                              do_post_citation=False,
                              do_defendant=False)

    citations = [c for c in citations if isinstance(c, Citation)]

    matches = None
    if len(citations) == 1:
        # The user only gets special help when the query contains exactly
        # one citation.
        matches = match_citation(citations[0])
        if len(matches) == 1:
            # Only show the tip when the citation matches exactly one
            # document.
            return matches.result.docs[0]

    return matches
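
A sketch of the intended call from a search view, with a fabricated cleaned-data dict:

cd = {"q": "410 U.S. 113"}  # fabricated form data; any single citation works
match = get_query_citation(cd)
if match:
    # Exactly one citation that matched exactly one document, so the view
    # can show a "did you mean" style tip.
    print(match)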
Example #10
def make_and_save(item,
                  skipdupes=False,
                  min_dates=None,
                  start_dates=None,
                  testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    min_date: if not none, will skip cases after min_date
    """
    date_filed = date_argued = date_reargued = None
    date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item["dates"]:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued
                 or date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied
                  or date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item["file"])

    # special rule for Kentucky
    if item["court_id"] == "kycourtapp" and main_date <= date(1975, 12, 31):
        item["court_id"] = "kycourtapphigh"

    if min_dates is not None:
        if min_dates.get(item["court_id"]) is not None:
            if main_date >= min_dates[item["court_id"]]:
                print(
                    main_date,
                    "after",
                    min_dates[item["court_id"]],
                    " -- skipping.",
                )
                return
    if start_dates is not None:
        if start_dates.get(item["court_id"]) is not None:
            if main_date <= start_dates[item["court_id"]]:
                print(
                    main_date,
                    "before court founding:",
                    start_dates[item["court_id"]],
                    " -- skipping.",
                )
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item["court_id"],
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        docket_number=item["docket"] or "",
    )

    # get citation objects in a list for addition to the cluster
    found_citations = []
    for c in item["citations"]:
        found = get_citations(clean_text(c, ["html", "inline_whitespace"]))
        if not found:
            # if the docket number --is-- the citation string, we're likely
            # dealing with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item["docket"]:
                docket_no = item["docket"].lower()
                if "claim no." in docket_no:
                    docket_no = docket_no.split("claim no.")[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, "")
                docket_no = docket_no.strip(".").strip()
                if docket_no and docket_no in c.lower():
                    continue

            # if there are only a trivial number of letters (excepting
            # months and a few trivial words) in the citation, then it's
            # not a citation at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, "")
            num_letters = sum(
                non_trivial.count(letter) for letter in string.ascii_lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate
            # a bad citation, then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'." %
                            (c, item["court_id"], item["docket"]))
        else:
            found_citations.extend(found.to_model())

    cluster = OpinionCluster(
        judges=item.get("judges", "") or "",
        precedential_status=("Unpublished"
                             if item["unpublished"] else "Published"),
        date_filed=main_date,
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        source="Z",
        attorneys=item["attorneys"] or "",
        posture=item["posture"] or "",
    )
    panel = lookup_judges_by_last_name_list(item["panel"], item["court_id"],
                                            panel_date)

    opinions = []
    for i, opinion_info in enumerate(item["opinions"]):
        if opinion_info["author"] is None:
            author = None
        else:
            author = lookup_judge_by_last_name(opinion_info["author"],
                                               item["court_id"], panel_date)

        converted_text = convert_columbia_html(opinion_info["opinion"])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info["type"]]
        if opinion_type == Opinion.LEAD and i > 0:
            opinion_type = Opinion.ADDENDUM

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info["per_curiam"],
            type=opinion_type,
            # type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=converted_text,
            sha1=opinion_info["sha1"],
            # This is surely not updated for the new S3 world. If you're
            # reading this, you'll need to update this code.
            local_path=opinion_info["local_path"],
        )
        joined_by = lookup_judges_by_last_name_list(item["joining"],
                                                    item["court_id"],
                                                    panel_date)
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print("Duplicate. skipping.")
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for citation in found_citations:
                citation.cluster = cluster
                citation.save()
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" %
                  (domain, cluster.get_absolute_url()))
        except:
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except:
                pass
            raise
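
The save block above hand-rolls rollback by deleting the docket on failure. Under Django the same all-or-nothing behavior can be expressed with a transaction; a minimal sketch, assuming the index=False save path tolerates being wrapped (transaction.atomic is the standard Django API):

from django.db import transaction

with transaction.atomic():
    docket.save()
    cluster.docket = docket
    cluster.save(index=False)
    # ... save citations, panel members, and opinions as above ...
# Any exception inside the block rolls the whole batch back automatically.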
Example #11
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param reporter: Reporter string as slugify'd (optional) (e.g., tc for T.C.)
    :param volume: The volume (int) of the reporter (optional) (e.g., 10)
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifying Per Curiam
                for elem in [op.find("author")]:
                    if elem is not None:
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If this is a per curiam opinion, set the author string
                # to Per Curiam.
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
    def handle(self, *args, **options):
        """Identify parallel citations and save them as requested.

        This process proceeds in two phases. The first phase is to work through
        the entire corpus, identifying citations that occur very near to each
        other. These are considered parallel citations, and they are built into
        a graph data structure where citations are nodes and each parallel
        citation is an edge. The weight of each edge is determined by the
        number of times a parallel citation has been identified between two
        citations. This should solve problems like typos or other issues with
        our heuristic approach.

        The second phase of this process is to update the database with the
        high quality citations. This can only be done by matching the citations
        with actual items in the database and then updating them with parallel
        citations that are sufficiently likely to be good.
        """
        super(Command, self).handle(*args, **options)
        no_option = not any([options.get("doc_id"), options.get("all")])
        if no_option:
            raise CommandError(
                "Please specify if you want all items or a specific item.")
        if not options["update_database"]:
            logger.info(
                "--update_database is not set. No changes will be made to the "
                "database.")

        logger.info("## Entering phase one: Building a network object of "
                    "all citations.\n")
        q = Opinion.objects.all()
        if options.get("doc_id"):
            q = q.filter(pk__in=options["doc_id"])
        count = q.count()
        opinions = queryset_generator(q, chunksize=10000)

        node_count = edge_count = completed = 0
        subtasks = []
        for o in opinions:
            subtasks.append(
                identify_parallel_citations.s(
                    get_citations(get_and_clean_opinion_text(o).cleaned_text)))
            last_item = count == completed + 1
            if (completed % 50 == 0) or last_item:
                job = group(subtasks)
                result = job.apply_async().join()
                [
                    self.add_groups_to_network(citation_groups)
                    for citation_groups in result
                ]
                subtasks = []

            completed += 1
            if completed % 250 == 0 or last_item:
                # Only do this once in a while.
                node_count = len(self.g.nodes())
                edge_count = len(self.g.edges())
            sys.stdout.write("\r  Completed %s of %s. (%s nodes, %s edges)" %
                             (completed, count, node_count, edge_count))
            sys.stdout.flush()

        logger.info("\n\n## Entering phase two: Saving the best edges to "
                    "the database.\n\n")
        for sub_graph in nx.connected_component_subgraphs(self.g):
            self.handle_subgraph(sub_graph, options)

        logger.info(f"\n\n## Done. Added {self.update_count} new citations.")

        self.do_solr(options)
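
The docstring's phase one builds a weighted graph of parallel citations. A minimal sketch of that bookkeeping with networkx, assuming self.g is a networkx Graph (this standalone helper is hypothetical; add_groups_to_network is the command's real method):

import networkx as nx

def add_group_to_network(g: nx.Graph, group) -> None:
    # Every pair of citations in a parallel group gets an edge; seeing the
    # same pair again bumps the edge weight, which phase two uses to judge
    # how trustworthy the pairing is.
    for i, a in enumerate(group):
        for b in group[i + 1:]:
            if g.has_edge(a, b):
                g[a][b]["weight"] += 1
            else:
                g.add_edge(a, b, weight=1)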
Example #13
    def test_make_html_from_plain_text(self) -> None:
        """Can we convert the plain text of an opinion into HTML?"""
        # fmt: off

        full_citation_html = ('<pre class="inline">asdf </pre><span class="'
                              'citation no-link"><span class="volume">22'
                              '</span> <span class="reporter">U.S.</span> '
                              '<span class="page">33</span> </span><pre class='
                              '"inline">asdf</pre>')
        test_pairs = [
            # Simple example for full citations
            ('asdf 22 U.S. 33 asdf', full_citation_html),

            # Using a variant format for U.S. (Issue #409)
            ('asdf 22 U. S. 33 asdf', full_citation_html),

            # Full citation across line break
            ('asdf John v. Doe, 123\nU.S. 456, upholding foo bar',
             '<pre class="inline">asdf John v. Doe, </pre><span class="'
             'citation no-link"><span class="volume">123</span>\n<span class='
             '"reporter">U.S.</span> <span class="page">456</span></span><pre'
             ' class="inline">, upholding foo bar</pre>'),

            # Basic short form citation
            ('existing text asdf, 515 U.S., at 240. foobar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> <span '
             'class="volume">515</span> <span class="reporter">U.S.</span>, '
             'at <span class="page">240</span></span><pre class="inline">. '
             'foobar</pre>'),

            # Short form citation with no comma after reporter in original
            ('existing text asdf, 1 U. S. at 2. foobar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> <span class'
             '="volume">1</span> <span class="reporter">U.S.</span> at <span '
             'class="page">2</span></span><pre class="inline">. foobar</pre>'),

            # Short form citation across line break
            ('asdf.’ ” 123 \n U.S., at 456. Foo bar foobar',
             '<pre class="inline">asdf.’ </pre><span class="'
             'citation no-link"><span class="antecedent_guess">”'
             '</span> <span class="volume">123</span> \n <span class='
             '"reporter">U.S.</span>, at <span class="page">456</span></span>'
             '<pre class="inline">. Foo bar foobar</pre>'),

            # First kind of supra citation (standard kind)
            ('existing text asdf, supra, at 2. foobar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> supra, at '
             '<span class="page">2</span></span><pre class="inline">. foobar'
             '</pre>'),

            # Second kind of supra citation (with volume)
            ('existing text asdf, 123 supra, at 2. foo bar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> <span '
             'class="volume">123</span> supra, at <span class="page">2</span>'
             '</span><pre class="inline">. foo bar</pre>'),

            # Third kind of supra citation (sans page)
            ('existing text asdf, supra, foo bar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> supra'
             '</span><pre class="inline">, foo bar</pre>'),

            # Fourth kind of supra citation (with period)
            ('existing text asdf, supra. foo bar',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> supra'
             '</span><pre class="inline">. foo bar</pre>'),

            # Supra citation across line break
            ('existing text asdf, supra, at\n99 (quoting foo)',
             '<pre class="inline">existing text </pre><span class="citation '
             'no-link"><span class="antecedent_guess">asdf,</span> supra, '
             'at\n<span class="page">99</span> </span><pre class="inline">'
             '(quoting foo)</pre>'),

            # Id. citation ("Id., at 123")
            ('asdf, id., at 123. Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf</pre><span class="citation no-link">, '
             '<span class="id_token">id.,</span> at 123. </span><pre class="'
             'inline">Lorem ipsum dolor sit amet</pre>'),

            # Duplicate Id. citation
            ('asd, id., at 123. Lo rem ip sum. asdf, id., at 123. Lo rem ip.',
             '<pre class="inline">asd</pre><span class="citation no-link">, '
             '<span class="id_token">id.,</span> at 123. </span><pre class="'
             'inline">Lo rem ip sum. asdf</pre><span class="citation '
             'no-link">, <span class="id_token">id.,</span> at 123. </span>'
             '<pre class="inline">Lo rem ip.</pre>'),

            # Id. citation across line break
            ('asdf." Id., at 315.\n       Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf."</pre><span class="citation no-link"> '
             '<span class="id_token">Id.,</span> at 315.\n</span><pre class="'
             'inline">       Lorem ipsum dolor sit amet</pre>'),

            # Ibid. citation ("... Ibid.")
            ('asdf, Ibid. Lorem ipsum dolor sit amet',
             '<pre class="inline">asdf</pre><span class="citation no-link">, '
             '<span class="id_token">Ibid.</span> Lorem ipsum dolor </span>'
             '<pre class="inline">sit amet</pre>'),

            # NonopinionCitation (currently nothing should happen here)
            ('Lorem ipsum dolor sit amet. U.S. Code §3617. Foo bar.',
             '<pre class="inline">Lorem ipsum dolor sit amet. U.S. Code '
             '§3617. Foo bar.</pre>'),
        ]

        # fmt: on
        for s, expected_html in test_pairs:
            print("Testing plain text to html conversion for %s..." % s,
                  end=" ")
            opinion = Opinion(plain_text=s)
            citations = get_citations(s)
            created_html = create_cited_html(opinion, citations)
            self.assertEqual(
                created_html,
                expected_html,
                msg="\n%s\n\n    !=\n\n%s" % (created_html, expected_html),
            )
            print("✓")