Python strip_punct 예제들, microscope.utils.strip_punct Python 예제들

예제 #1

0

파일 보기

파일: helpers.py 프로젝트: jcushman/eyecite

def get_court_by_paren(paren_string: str, citation: Citation) -> Optional[str]:
    """Map the court text of a citation parenthetical to a court code.

    The parenthetical usually looks something like "2d Cir" (optionally
    followed by the year). Everything before the citation's year is taken as
    the court string and compared against the ``citation_string`` of each
    entry in ``courts``.

    Does not work on SCOTUS, since that court lacks parentheticals, and
    needs to be handled after disambiguation has been completed.

    :param paren_string: The raw text found inside the parenthetical.
    :param citation: The citation the parenthetical belongs to; only its
        ``year`` attribute is read.
    :return: The ``id`` of the first matching court, or None if the court
        string is empty or matches no court.
    """
    court_text = paren_string
    if citation.year is not None:
        year_index = paren_string.find(str(citation.year))
        # str.find returns -1 when the year is absent. Slicing with -1 would
        # silently drop the final character of the court string, so only
        # trim when the year was actually located.
        if year_index != -1:
            court_text = paren_string[:year_index]
    court_str = strip_punct(court_text)

    if court_str == "":
        return None

    # Map the string to a court, if possible.
    for court in courts:
        # Use startswith because citations are often missing final period,
        # e.g. "2d Cir"
        if court["citation_string"].startswith(court_str):
            return court["id"]

    return None

예제 #2

0

파일 보기

파일: find_citations.py 프로젝트: jcushman/eyecite

def extract_full_citation(
    words: List[str],
    reporter_index: int,
) -> Optional[FullCitation]:
    """Given a list of words and the index of a federal reporter, look before
    and after for volume and page. If found, construct and return a
    FullCitation object.

    Example: Adarand Constructors, Inc. v. Peña, 515 U.S. 200, 240

    If we are given neutral, tax court opinions we treat them differently.
    The formats often follow {REPORTER} {YEAR}-{ITERATIVE_NUMBER}
    ex. T.C. Memo. 2019-13

    :param words: The tokenized text.
    :param reporter_index: Index in ``words`` of the reporter token.
    :return: A FullCitation, or None when no token follows the reporter, the
        volume is not numeric, the page cannot be parsed, or a tax court
        token is not of the form ``{YEAR}-{NUMBER}``.
    """
    # Get reporter
    reporter = words[reporter_index]

    # Variables to extract
    volume: Union[int, str, None]
    page: Union[int, str, None]

    is_tax_citation = is_neutral_tc_reporter(reporter)

    # Every supported format needs a token right after the reporter; without
    # one this cannot be a citation (and indexing would raise IndexError).
    if reporter_index + 1 >= len(words):
        return None
    following_token = words[reporter_index + 1]

    # Handle tax citations
    if is_tax_citation:
        # Normalize an en dash to a hyphen before splitting. Anything other
        # than exactly two parts is not a {YEAR}-{NUMBER} token.
        parts = following_token.replace("–", "-").split("-")
        if len(parts) != 2:
            return None
        volume, page = parts

    # Handle "normal" citations, e.g., XX F.2d YY
    else:
        # Don't check if we are at the beginning of a string
        if reporter_index == 0:
            return None
        volume = strip_punct(words[reporter_index - 1])
        page = strip_punct(following_token)

    # Get volume
    if volume.isdigit():
        volume = int(volume)
    else:
        # No volume, therefore not a valid citation
        return None

    # Get page
    page = parse_page(page)
    if not page:
        return None

    # Return FullCitation
    return FullCitation(
        reporter,
        page,
        volume,
        reporter_found=reporter,
        reporter_index=reporter_index,
    )

예제 #3

0

파일 보기

파일: helpers.py 프로젝트: jcushman/eyecite

def parse_page(page: Union[str, int]) -> Optional[str]:
    """Return the page portion of ``page`` if it looks like a valid page.

    The input is stringified and passed through ``strip_punct``. A purely
    numeric result is returned as is. Otherwise the following forms are
    tried in order (descending likelihood) and the matched text of the first
    hit is returned:

    1) A numerical page range. E.g., "123-124"
    2) A roman numeral. E.g., "250 Neb. xxiv (1996)"
    3) A special Connecticut or Illinois number. E.g., "13301-M"
    4) A page with a weird suffix. E.g., "559 N.W.2d 826|N.D."
    5) A page with a ¶ symbol, star, and/or colon. E.g., "¶ 119:12-14"

    Returns None when nothing matches.
    """
    candidate = strip_punct(str(page))

    # Most pages are plain digits, so handle that first.
    if candidate.isdigit():
        return candidate

    # Each fallback only runs if every earlier one failed.
    found = re.match(r"\d{1,6}-\d{1,6}", candidate)  # Simple page range
    if not found:
        # NOTE(review): is_roman is assumed to return a match-like object
        # (it must expose .group) — confirm against its definition.
        found = is_roman(candidate)  # Roman numeral
    if not found:
        found = re.match(r"\d{1,6}[-]?[a-zA-Z]{1,6}", candidate)  # CT/IL page
    if not found:
        found = re.match(r"\d{1,6}", candidate)  # Weird suffix
    if not found:
        found = re.match(r"[*\u00b6\ ]*[0-9:\-]+", candidate)  # ¶, star, colon

    return str(found.group(0)) if found else None

예제 #4

0

파일 보기

파일: find_citations.py 프로젝트: jcushman/eyecite

def extract_shortform_citation(
    words: List[str],
    reporter_index: int,
) -> Optional[ShortformCitation]:
    """Try to read a short form citation around the reporter at
    ``reporter_index`` and return it as a ShortformCitation.

    Shortform 1: Adarand, 515 U.S., at 241
    Shortform 2: 515 U.S., at 241

    Returns None when the reporter sits at index 2 or lower, when the token
    before it is not a numeric volume, or when no page can be parsed from
    the second or third token after it.
    """
    # Too close to the start of the text to have a volume and antecedent
    if reporter_index <= 2:
        return None

    # Locals filled in below
    volume: Union[int, str, None]
    page: Union[int, str, None]
    antecedent_guess: str

    # Volume is the token immediately before the reporter
    volume = strip_punct(words[reporter_index - 1])
    if not volume.isdigit():
        # No volume, therefore not a valid citation
        return None
    volume = int(volume)

    # Page normally sits two tokens after the reporter; a comma may push it
    # one token further, so fall back to the next index.
    try:
        page = parse_page(words[reporter_index + 2]) or parse_page(
            words[reporter_index + 3]
        )
    except IndexError:
        return None
    if not page:
        # No page, therefore not a valid citation
        return None

    # Antecedent is two tokens back; if that token is a bare comma, take the
    # token before it and re-attach the comma.
    two_back = words[reporter_index - 2]
    antecedent_guess = (
        words[reporter_index - 3] + "," if two_back == "," else two_back
    )

    reporter = words[reporter_index]

    return ShortformCitation(
        reporter,
        page,
        volume,
        antecedent_guess,
        reporter_found=reporter,
        reporter_index=reporter_index,
    )

예제 #5

0

파일 보기

파일: helpers.py 프로젝트: jcushman/eyecite

def get_year(token: str) -> Optional[int]:
    """Extract a plausible 4-digit year from ``token``.

    The token is first passed through ``strip_punct``. If the result is not
    all digits, anything following the first run of four digits is dropped
    and the check is repeated. Returns the year as an int, or None when the
    cleaned token is not exactly four digits or is earlier than 1754.
    """
    cleaned = strip_punct(token)
    if not cleaned.isdigit():
        # Sometimes funny stuff happens? Keep the four digits and drop the
        # rest of the line.
        cleaned = re.sub(r"(\d{4}).*", r"\1", cleaned)

    if not cleaned.isdigit() or len(cleaned) != 4:
        return None

    year = int(cleaned)
    # 1754 is the earliest case in the database
    return year if year >= 1754 else None

예제 #6

0

파일 보기

파일: helpers.py 프로젝트: jcushman/eyecite

def add_defendant(citation: Citation, words: List[str]) -> None:
    """Walk backwards from the token just before the reporter looking for a
    stop token (v., in re, etc.) and store what follows it as the defendant.

    At most BACKWARD_SEEK tokens are inspected and the scan never reaches
    index 0. When the stop token is exactly "v.", the token before it is
    stored as ``citation.plaintiff``. A token ending in ";" (a string
    citation) ends the scan. If no stop token is found, no defendant name is
    stored. In the future, this could be improved.
    """
    reporter_pos = citation.reporter_index
    lower_bound = max(reporter_pos - BACKWARD_SEEK, 0)

    defendant_start = None
    # lower_bound itself is excluded by range(), so pos - 1 below is always
    # a valid non-negative index.
    for pos in range(reporter_pos - 1, lower_bound, -1):
        token = words[pos]
        if token == ",":
            # Bare commas carry no information; skip them
            continue
        if strip_punct(token).lower() in STOP_TOKENS:
            if token == "v.":
                citation.plaintiff = words[pos - 1]
            defendant_start = pos + 1
            break
        if token.endswith(";"):
            # String citation
            break

    if defendant_start:
        # Everything between the stop token and the token just before the
        # reporter is treated as the defendant's name.
        name_tokens = words[defendant_start:reporter_pos - 1]
        citation.defendant = " ".join(name_tokens)

예제 #7

0

파일 보기

파일: find_citations.py 프로젝트: jcushman/eyecite

def get_citations(
    text: str,
    html: bool = True,
    do_post_citation: bool = True,
    do_defendant: bool = True,
    disambiguate: bool = True,
) -> List[Union[NonopinionCitation, Citation]]:
    """Find the citations in ``text`` and return them in document order.

    :param text: The text to scan.
    :param html: If True, ``text`` is first passed through
        ``get_visible_text``.
    :param do_post_citation: If True, ``add_post_citation`` is called on each
        full citation found.
    :param do_defendant: If True, ``add_defendant`` is called on each full
        citation found.
    :param disambiguate: If True, the collected citations are passed through
        ``disambiguate_reporters``.
    :return: A list of citations ordered in the sequence that they appear in
        the document.
    """
    if html:
        text = get_visible_text(text)
    words = tokenize(text)
    citations: List[Union[Citation, NonopinionCitation]] = []

    # The final token is never treated as a citation token, so index i + 1
    # always exists for the extractors.
    for i in range(0, len(words) - 1):
        citation_token = words[i]
        citation: Union[Citation, NonopinionCitation, None] = None

        # CASE 1: Citation token is a reporter (e.g., "U. S.").
        # In this case, first try extracting it as a standard, full citation,
        # and if that fails try extracting it as a short form citation.
        # Membership is tested directly on the mappings: this checks the same
        # keys as concatenating both key lists, without rebuilding and
        # linearly scanning a list for every token.
        if citation_token in EDITIONS or citation_token in VARIATIONS_ONLY:
            citation = extract_full_citation(words, i)
            if citation:
                # CASE 1A: Standard citation found, try to add additional data
                if do_post_citation:
                    add_post_citation(citation, words)
                if do_defendant:
                    add_defendant(citation, words)
            else:
                # CASE 1B: Standard citation not found, so see if this
                # reference to a reporter is a short form citation instead
                citation = extract_shortform_citation(words, i)

                if not citation:
                    # Neither a full nor short form citation
                    continue

        # CASE 2: Citation token is an "Id." or "Ibid." reference.
        # In this case, the citation should simply be to the item cited
        # immediately prior, but for safety we will leave that resolution up
        # to the user.
        elif citation_token.lower() in {"id.", "id.,", "ibid."}:
            citation = extract_id_citation(words, i)

        # CASE 3: Citation token is a "supra" reference.
        # In this case, we're not sure yet what the citation's antecedent is.
        # It could be any of the previous citations above. Thus, like an Id.
        # citation, for safety we won't resolve this reference yet.
        elif strip_punct(citation_token.lower()) == "supra":
            citation = extract_supra_citation(words, i)

        # CASE 4: Citation token is a section marker.
        # In this case, it's likely that this is a reference to a non-
        # opinion document. So we record this marker in order to keep
        # an accurate list of the possible antecedents for id citations.
        elif "§" in citation_token:
            citation = NonopinionCitation(match_token=citation_token)

        # CASE 5: The token is not a citation.
        else:
            continue

        if citation is not None:
            citations.append(citation)

    # Disambiguate each citation's reporter
    if disambiguate:
        citations = disambiguate_reporters(citations)

    citations = remove_address_citations(citations)

    # Set each citation's court property to "scotus" by default
    for citation in citations:
        if (isinstance(citation, Citation) and not citation.court
                and is_scotus_reporter(citation)):
            citation.court = "scotus"

    # Returns a list of citations ordered in the sequence that they appear in
    # the document. The ordering of this list is important for reconstructing
    # the references of the ShortformCitation, SupraCitation, and
    # IdCitation objects.
    return citations