Exemplo n.º 1
0
def test_find_legislation(text, expected):
    """Check that legislation found in ``text`` equals ``expected``.

    Every match is unpacked as ``(category, (title, begin, end))``; the
    ``begin``/``end`` offsets must slice the matched title back out of the
    original ``text``.
    """
    titles = ttls.find_titles(tokenization.tokenize(text))
    found = lgsln.find_legislation(titles, text)

    # The full result must equal the expected output.
    assert found == expected

    # Each reported span must point at its title in the source text.
    for _category, (title, begin, end) in found:
        assert text[begin:end] == title
Exemplo n.º 2
0
def get_meta(s):
    """Extract metadata from the markup of a single case report.

    ``s`` is parsed with BeautifulSoup using the ``lxml`` parser. The
    returned dict has these keys:

    * ``case`` -- text of the element with class ``case``.
    * ``parties`` -- ``{'left': [...], 'right': [...]}``: the children of
      the ``case`` element before and after the ``" v "`` separator tag.
      When no separator tag exists, every child is reported on the left
      and the right side is empty.
    * ``keywords`` -- maps the first em-dash separated field of each
      ``kw`` element to the list of the remaining fields.
    * ``cite_as`` -- ``{'neutral': [...], 'iclr': ...}`` taken from the
      single ``ncit`` element, split on ``;``.
    * ``hearing`` -- ``court_abbr``, ``judges`` and ``date`` read from the
      ``<p>`` that follows the citation element.
    * ``reporters`` -- list of ``{'name': ..., 'role': ...}`` dicts.
    * ``appearances`` -- text after ``Appearances:`` in the ``<p>`` that
      follows the last ``hnote`` element, or ``''`` when absent.
    * ``body`` -- text of all ``hnote`` elements joined by blank lines.
    * ``citations`` / ``candidate_citations`` / ``legislation`` /
      ``candidate_legislation`` -- results of the citation and
      legislation finders run over ``body``.

    Raises:
        Exception: if the case element is missing, if there is not
            exactly one ``ncit`` element, if the last citation is not an
            ICLR ``[YYYY] WLR (D)`` citation, or if the hearing details
            do not consist of exactly three lines.
    """
    soup = bs4.BeautifulSoup(s, 'lxml')

    meta = {}

    case = soup.find(class_=u'case')
    if case is None:
        raise Exception(u'Could not find the case name for this document')

    meta['case'] = case.text

    # Parties are the children of the case element; a tag whose text is
    # exactly " v " separates the two sides. As before, the last such tag
    # wins if there happen to be several.
    parties = list(case.children)
    separator = u" v "
    split = None
    for i, party in enumerate(parties):
        if isinstance(party, bs4.Tag) and party.text == separator:
            split = i

    # NOTE: `unicode` is the Python 2 builtin; this module targets Python 2.
    if split is None:
        # No separator: avoid slicing with a sentinel index, which would
        # drop the last party from the left and duplicate all on the right.
        left_parties = [unicode(x) for x in parties]
        right_parties = []
    else:
        left_parties = [unicode(x) for x in parties[:split]]
        right_parties = [unicode(x) for x in parties[split + 1:]]

    meta['parties'] = {
        'left': left_parties,
        'right': right_parties,
    }

    meta['keywords'] = {}

    for kw in soup.find_all(class_=u'kw'):
        words = kw.text.split(_EM_DASH)
        category = words[0].strip()
        meta['keywords'][category] = [word.strip() for word in words[1:]]

    citation_els = soup.find_all(class_=u'ncit')
    if len(citation_els) != 1:
        raise Exception(u'Could not parse any citations for this document')

    citation = citation_els[0]
    cite_parts = [c.strip() for c in citation.text.split(";")]

    pattern = r"^\[\d\d\d\d\]\sWLR\s\(D\)"

    # The last entry must be the ICLR citation. Either condition failing
    # makes the citation unusable, hence `or`.
    if not cite_parts or not re.match(pattern, cite_parts[-1], re.UNICODE):
        raise Exception(u'Unable to parse citation for this document')

    meta['cite_as'] = {
        'neutral': cite_parts[:-1],  # Can be more than one
        'iclr': cite_parts[-1],      # Only ever one for iclr publications
    }

    hearing_el = citation.find_next_sibling('p')
    if hearing_el is None:
        raise Exception('Could not parse hearing details')
    judges_and_date = hearing_el.text

    # Fields look reliably separated by newlines
    fields = [f.strip() for f in judges_and_date.split('\n')]

    # Trailing colons carry no meaning and are often doubled by mistake
    fields = [f[:-2] if f.endswith(u'::') else f for f in fields]
    fields = [f[:-1] if f.endswith(u':') else f for f in fields]

    if len(fields) != 3:
        raise Exception('Could not parse hearing details')

    court_abbr, judges, date = fields
    judges = [j.strip() for j in judges.split(',')]

    meta['hearing'] = {
        'court_abbr': court_abbr,
        'judges': judges,
        'date': _date_to_js(_parse_date(date)),
    }

    reporter_el = soup.find(class_=u'reporter')

    if reporter_el:
        reporter_str = reporter_el.text
        prefix = u'Reported by:'

        if reporter_str.startswith(prefix):
            reporter_str = reporter_str[len(prefix):]

        if reporter_str.endswith('.'):
            reporter_str = reporter_str[:-1]
    else:
        reporter_str = None

    def map_reporter(reporter):
        # Sometimes only the name is given
        reporter_fields = [f.strip() for f in reporter.split(',')]
        return {
            'name': reporter_fields[0],
            'role': reporter_fields[1] if len(reporter_fields) > 1 else None,
        }

    if reporter_str is not None:
        # Split on the standalone word "and" only, so names that merely
        # contain those letters (e.g. "Alexander") stay intact.
        reporters = [
            map_reporter(r.strip())
            for r in re.split(r'\s+and\s+', reporter_str)
        ]
    else:
        reporters = []

    meta['reporters'] = reporters

    # TODO: Appearances in court, these need proper parsing. They are tricky!

    hnotes = soup.find_all(class_=u'hnote')

    appearances = ''
    prefix = u'Appearances:'
    if hnotes:
        appearances_el = hnotes[-1].find_next_sibling('p')
        if appearances_el is not None:
            appearances_text = appearances_el.text
            if appearances_text.startswith(prefix):
                appearances = appearances_text[len(prefix):].strip()

    meta['appearances'] = appearances

    body = u'\n\n'.join(p.text for p in hnotes)
    meta['body'] = body

    # TODO: Cited cases/legislation
    # TODO: Check len(citations) == len(candidates)
    # A common inconsistency is a reference to an act in square
    # brackets, like this: "Under the [2004] Act". Normally square
    # brackets indicate a citation of case-law.

    tokenized = tokenization.tokenize(body)
    tokenized_titles = ttls.find_titles(tokenized)

    found_citations = ctns.find_citations(body)
    candidate_citations = ctns.weak_find_citations(body)

    legislation = lgs.find_legislation(tokenized_titles, body)
    candidate_legislation = lgs.find_candidate_legislation(body)

    # Duplicates are dropped via a set, so list order is not meaningful.
    meta['citations'] = list(set(found_citations))
    meta['candidate_citations'] = candidate_citations
    meta['legislation'] = list(set(legislation))
    meta['candidate_legislation'] = candidate_legislation

    # TODO: Which party won?
    return meta