예제 #1
0
def legal_generator(doc):
    """Yield ``(start, end, "LAW")`` triples for legal mentions found in *doc*.

    Two kinds of candidates are collected:

    * spans returned by ``utils.get_spans`` for the "proper2_detector" and
      "nnp_detector" sources, kept when their final token passes
      ``utils.is_likely_proper`` and its title-cased text (with trailing
      "s" characters stripped) is a member of ``LEGAL``;
    * numbered references such as "Article 5" or "Sections 3 to 7", i.e. a
      heading word followed by a token that starts with a digit or is in
      ``ROMAN_NUMERALS``, optionally extended by "-", "to" or "and" plus a
      second such number.

    All candidates are handed to ``utils.merge_contiguous_spans`` before
    being yielded.

    Args:
        doc: indexable token sequence supporting ``len()``; each token
            exposes ``.text`` (presumably a spaCy ``Doc`` -- confirm).

    Yields:
        Tuples ``(start, end, label)`` with token offsets and the label
        "LAW".
    """
    heading_words = {"Article", "Paragraph", "Section", "Chapter", "§"}
    range_connectors = {"-", "to", "and"}

    def _looks_numeric(token):
        # A leading digit ("5", "5a") or a known roman numeral counts.
        text = token.text
        return text[0].isdigit() or text in ROMAN_NUMERALS

    candidates = []

    # Detected proper-name spans whose last word is a legal term.
    for detected in utils.get_spans(doc, ["proper2_detector", "nnp_detector"]):
        final_token = doc[detected.end - 1]
        if not utils.is_likely_proper(final_token):
            continue
        normalised = final_token.text.title().rstrip("s")
        if normalised in LEGAL:
            candidates.append((detected.start, detected.end, "LAW"))

    # Handling legal references such as Article 5
    token_count = len(doc)
    for pos in range(token_count - 1):
        if doc[pos].text.rstrip("s") not in heading_words:
            continue
        if not _looks_numeric(doc[pos + 1]):
            continue

        stop = pos + 2
        # Extend over a range/enumeration: "<head> 5 to 7", "<head> 5 - 7".
        extends = (pos + 3 < token_count
                   and doc[pos + 2].text in range_connectors
                   and _looks_numeric(doc[pos + 3]))
        if extends:
            stop = pos + 4
        candidates.append((pos, stop, "LAW"))

    # Merge contiguous spans of legal references ("Article 5, Paragraph 3")
    merged = utils.merge_contiguous_spans(candidates, doc)
    for begin, finish, tag in merged:
        yield begin, finish, tag
예제 #2
0
def date_generator(doc):
    """Yield ``(start, end, "DATE")`` triples for date-like tokens in *doc*.

    The document is scanned left to right and a candidate span is recorded
    for:

    * a token whose lemma is in ``DAYS`` or ``DAYS_ABBRV``;
    * an all-digit token whose value lies strictly between 1920 and 2040;
    * a token whose lemma is in ``MONTHS`` or ``MONTHS_ABBRV`` (unless it is
      tagged "MD"), widened to include an adjacent day number when present
      ("<day> <month>", "<ordinal> of <month>", "<month> <day>").

    The candidates are passed through ``utils.merge_contiguous_spans``
    before being yielded.

    Args:
        doc: indexable token sequence supporting ``len()``; tokens expose
            ``text``, ``lemma_``, ``tag_``, ``lower_`` and ``is_digit``
            (presumably a spaCy ``Doc`` -- confirm).

    Yields:
        Tuples ``(start, end, label)`` with token offsets and the label
        "DATE".
    """
    weekday_lemmas = DAYS | DAYS_ABBRV
    month_lemmas = MONTHS | MONTHS_ABBRV
    token_count = len(doc)

    def _is_day_number(text):
        # Purely numeric and below 32, i.e. usable as a day of the month.
        return bool(re.match("\\d+$", text)) and int(text) < 32

    candidates = []
    pos = 0
    while pos < token_count:
        token = doc[pos]
        lemma = token.lemma_
        advance = 1

        if lemma in weekday_lemmas:
            candidates.append((pos, pos + 1, "DATE"))

        elif (token.is_digit and re.match("\\d+$", token.text)
              and 1920 < int(token.text) < 2040):
            # Bare number in the accepted year window.
            candidates.append((pos, pos + 1, "DATE"))

        elif lemma in month_lemmas and token.tag_ != "MD":
            # The "MD" tag check skips "May" used as auxiliary.
            if pos > 0 and _is_day_number(doc[pos - 1].text):
                # "<day> <month>"
                candidates.append((pos - 1, pos + 1, "DATE"))
            elif (pos > 1
                  and re.match("\\d+(?:st|nd|rd|th)$", doc[pos - 2].text)
                  and doc[pos - 1].lower_ == "of"):
                # "<ordinal> of <month>"
                candidates.append((pos - 2, pos + 1, "DATE"))
            elif pos + 1 < token_count and _is_day_number(doc[pos + 1].text):
                # "<month> <day>": the day token is consumed as well.
                candidates.append((pos, pos + 2, "DATE"))
                advance = 2
            else:
                candidates.append((pos, pos + 1, "DATE"))

        pos += advance

    for begin, finish, tag in utils.merge_contiguous_spans(candidates, doc):
        yield begin, finish, tag