def extract_base_citation(words, reporter_index):
    """Construct and return a citation object from a list of "words".

    Given a list of words and the index of a federal reporter, look at the
    word before the reporter for a volume and at the word after it for a
    page. If both are usable, construct and return a Citation object.

    :param words: Sequence of string tokens.
    :param reporter_index: Index in ``words`` of the reporter token.
    :return: A Citation built from the reporter, page and volume, or None
        when the reporter has no neighbor on either side, the volume is not
        all digits, or the page is not a recognized page format.
    """
    # A citation needs a word on each side of the reporter. Without this
    # guard, index 0 would silently read the volume from words[-1] (the last
    # word), and a reporter in last position would raise IndexError.
    if reporter_index == 0 or reporter_index + 1 >= len(words):
        return None

    volume = strip_punct(words[reporter_index - 1])
    if not volume.isdigit():
        # No volume, therefore not a valid citation.
        return None
    volume = int(volume)

    page = strip_punct(words[reporter_index + 1])
    if page.isdigit():
        # Most page numbers will be digits.
        page = int(page)
    elif isroman(page):
        # Some places like Nebraska have Roman numerals, e.g. in
        # '250 Neb. xxiv (1996)'. No processing needed.
        pass
    elif re.match(r"\d{1,6}[-]?[a-zA-Z]{1,6}", page):
        # Some places, like Connecticut, have pages like "13301-M".
        # Other places, like Illinois have "pages" like "110311-B".
        pass
    else:
        # Not Roman, and not a weird Connecticut page number.
        return None

    reporter = words[reporter_index]
    return Citation(
        reporter,
        page,
        volume,
        reporter_found=reporter,
        reporter_index=reporter_index,
    )
def parse_page(page):
    """Normalize a raw page token taken from a citation.

    The token is first passed through ``strip_punct``. A token made only of
    digits is returned as an int. Otherwise the token is tried against the
    following formats, in descending order of likelihood, and the text
    matched by the first one that succeeds is returned:

    1) A numerical page range. E.g., "123-124"
    2) A roman numeral. E.g., "250 Neb. xxiv (1996)"
    3) A special Connecticut or Illinois number. E.g., "13301-M"
    4) A page with a weird suffix. E.g., "559 N.W.2d 826|N.D."
    5) A page with a ¶ symbol, star, and/or colon. E.g., "¶ 119:12-14"

    :param page: The raw page token.
    :return: An int, the matched text, or None if no format applies.
    """
    candidate = strip_punct(page)

    # First, check whether the page is a simple digit. Most will be.
    if candidate.isdigit():
        return int(candidate)

    # Simple page range.
    range_hit = re.match(r"\d{1,6}-\d{1,6}", candidate)
    if range_hit:
        return range_hit.group(0)

    # Roman numeral. The result of isroman() is used like a regex match
    # (its .group(0) is returned), so it must be match-like when truthy.
    roman_hit = isroman(candidate)
    if roman_hit:
        return roman_hit.group(0)

    remaining_patterns = (
        r"\d{1,6}[-]?[a-zA-Z]{1,6}",  # CT/IL page
        r"\d{1,6}",  # Weird suffix
        r"[*\u00b6\ ]*[0-9:\-]+",  # ¶, star, colon
    )
    for pattern in remaining_patterns:
        hit = re.match(pattern, candidate)
        if hit:
            return hit.group(0)

    return None
def extract_base_citation(words, reporter_index):
    """Construct and return a citation object from a list of "words".

    Given a list of words and the index of a federal reporter, look at the
    word before the reporter for a volume and at the word after it for a
    page. If both are usable, construct and return a Citation object.

    :param words: Sequence of string tokens.
    :param reporter_index: Index in ``words`` of the reporter token.
    :return: A Citation built from the reporter, page and volume, or None
        when the reporter has no neighbor on either side, the volume is not
        all digits, or the page is not a recognized page format.
    """
    # A citation needs a word on each side of the reporter. Without this
    # guard, index 0 would silently read the volume from words[-1] (the last
    # word), and a reporter in last position would raise IndexError.
    if reporter_index == 0 or reporter_index + 1 >= len(words):
        return None

    volume = strip_punct(words[reporter_index - 1])
    if not volume.isdigit():
        # No volume, therefore not a valid citation.
        return None
    volume = int(volume)

    page = strip_punct(words[reporter_index + 1])
    if page.isdigit():
        # Most page numbers will be digits.
        page = int(page)
    elif isroman(page):
        # Some places like Nebraska have Roman numerals, e.g. in
        # '250 Neb. xxiv (1996)'. No processing needed.
        pass
    elif re.match(r"\d{1,6}[-]?[a-zA-Z]{1,6}", page):
        # Some places, like Connecticut, have pages like "13301-M". Up to
        # six leading digits are allowed so that six-digit "pages" such as
        # "110311-B" are accepted as well.
        pass
    else:
        # Not Roman, and not a weird Connecticut page number.
        return None

    reporter = words[reporter_index]
    return Citation(
        reporter,
        page,
        volume,
        reporter_found=reporter,
        reporter_index=reporter_index,
    )
def extract_base_citation(words, reporter_index):
    """Construct and return a citation object from a list of "words".

    Given a list of words and the index of a federal reporter, look before
    and after for volume and page. If found, construct and return a Citation
    object.

    If we are given neutral, tax court opinions we treat them differently.
    The formats often follow {REPORTER} {YEAR}-{ITERATIVE_NUMBER}
    ex. T.C. Memo. 2019-13
    For those, the word after the reporter supplies both values: the part
    before the dash is used as the volume and the part after it as the page.

    :param words: Sequence of string tokens.
    :param reporter_index: Index in ``words`` of the reporter token.
    :return: A Citation built from the reporter, page and volume, or None
        when the needed neighboring words are missing, the volume is not all
        digits, or the page is not a recognized page format.
    """
    # Both formats read the word after the reporter; bail out instead of
    # raising IndexError when the reporter is the last word.
    if reporter_index + 1 >= len(words):
        return None

    reporter = words[reporter_index]
    if is_neutral_tc_reporter(reporter):
        # Normalize an en dash (U+2013) to a hyphen before splitting. This
        # works on the text directly: calling replace() on UTF-8 encoded
        # bytes with str arguments raises TypeError on Python 3.
        parts = words[reporter_index + 1].replace("\u2013", "-").split("-")
        if len(parts) != 2:
            # Not of the form {YEAR}-{ITERATIVE_NUMBER}.
            return None
        volume, page = parts
    else:
        # "Normal" reporter: XX F.2d YY
        if reporter_index == 0:
            # No word precedes the reporter, so there is no volume.
            return None
        volume = strip_punct(words[reporter_index - 1])
        page = strip_punct(words[reporter_index + 1])

    # Normalize volume and page
    if not volume.isdigit():
        # No volume, therefore not a valid citation
        return None
    volume = int(volume)

    if page.isdigit():
        # Most page numbers will be digits.
        page = int(page)
    elif isroman(page):
        # Some places like Nebraska have Roman numerals, e.g. in
        # '250 Neb. xxiv (1996)'. No processing needed.
        pass
    elif re.match(r"\d{1,6}[-]?[a-zA-Z]{1,6}", page):
        # Some places, like Connecticut, have pages like "13301-M".
        # Other places, like Illinois have "pages" like "110311-B".
        pass
    else:
        # Not Roman, and not a weird connecticut page number. Thus a bad
        # value. Abort.
        return None

    return Citation(
        reporter,
        page,
        volume,
        reporter_found=reporter,
        reporter_index=reporter_index,
    )