Пример #1
0
def parse_html(session, report_date, soup, page_id, original_url):
    """Parse one report page into a ParsedPage of sections and speeches.

    The page must contain <div id="ReportView"> with exactly one <div>
    child, which in turn holds two <div> children: a contents list made
    only of <a> and <br> tags, followed by the report text.  In the text
    div each <h2> starts a new Section, each top-level <a> carrying an id
    updates the URL fragment used for what follows, and each <div> is
    walked for speaker names (<b>), speech paragraphs, division headings,
    member votes and time markers.

    Calls Speech.reset_speakers_so_far() before parsing the text.

    Arguments:
        session: passed unchanged to ParsedPage.
        report_date: passed to ParsedPage, Speech and Division, and to
            the is_division_way / is_member_vote helpers.
        soup: parsed document searched for <div id="ReportView">.
        page_id: numeric page identifier; passed to ParsedPage and
            formatted with %d into error messages.
        original_url: base URL of the page; "#" plus the id of the most
            recent top-level <a> is appended for later items.

    Returns:
        The ParsedPage built from the page.

    Raises:
        Exception: if the markup does not match the structure described
            above, if text or a list appears before any speaker, or if
            a member's vote appears before a division heading.
        ValueError: if the main div does not have exactly two <div>
            children (raised by the tuple unpacking).
    """
    divnumber = 0
    report_view = soup.find('div', attrs={'id': 'ReportView'})
    div_children_of_report_view = report_view.findChildren('div',
                                                           recursive=False)
    if len(div_children_of_report_view) != 1:
        raise Exception(
            'We only expect one <div> child of <div id="ReportView">; '
            'there were %d in page with ID %d' % (
                len(div_children_of_report_view), page_id))

    Speech.reset_speakers_so_far()

    main_div = div_children_of_report_view[0]
    top_level_divs = main_div.findChildren('div', recursive=False)

    # The first div should just contain links to sections further down
    # the page; the second one holds the text itself:
    contents_div, text_div = top_level_divs

    # Check the assumption that the first div only contains links.
    # contents_tuples collects (fragment id, link text) pairs; nothing
    # later in this function reads it.
    contents_tuples = []
    for link in contents_div.findAll(True):
        if link.name == 'br':
            continue
        if link.name != 'a':
            raise Exception(
                "There was something other than a <br> or an <a> in the "
                "supposed contents <div>, for page ID: %d" % (page_id, ))
        href = link['href']
        m = re.search(r'#(.*)', href)
        if not m:
            raise Exception(
                "Failed to find the ID from '%s' in page with ID: %d" % (
                    href, page_id))
        contents_tuples.append(
            (m.group(1), tidy_string(non_tag_data_in(link))))

    parsed_page = ParsedPage(session, report_date, page_id)

    # Now consider the div that actually has text in it.  Each speech
    # is in its own div, while the rest that we care about are
    # headings:

    current_votes = None
    current_division_way = None
    current_time = None
    current_url = original_url
    # No speech is open until a <b> speaker name (or a division heading
    # naming a candidate) has been seen.
    current_speech = None

    for top_level in text_div:
        # There are sometimes some empty NavigableString elements at
        # the top level, so just ignore those:
        if not unicode(top_level).strip():
            continue
        if top_level.name == 'h2':
            section_title = tidy_string(
                non_tag_data_in(top_level, tag_replacement=u' '))
            if not section_title:
                raise Exception(
                    "There was an empty section title in page ID: %d" % (
                        page_id, ))
            parsed_page.sections.append(Section(section_title, current_url))
        elif top_level.name == 'br':
            # Ignore line breaks - we use paragraphs instead
            continue
        elif top_level.name == 'a':
            # Anchors without an id leave the current URL untouched.
            try:
                anchor_id = top_level['id']
            except KeyError:
                pass
            else:
                current_url = original_url + "#" + anchor_id
        elif top_level.name == 'div':
            # This div contains a speech, essentially:

            # The new style pages wrap speeches in p.span tags that we
            # can ignore, so remove them from the tree.  Occasionally
            # there are multiple spans in a p, hence the inner loop.
            # This does mean we are losing some formatting information,
            # but because it's hardcoded style attributes in the spans
            # it's arguable that we'd want to remove them anyway.
            for p in top_level.findChildren('p'):
                if p.span:
                    for span in p.findChildren('span'):
                        span.unwrap()
                    p.unwrap()

            # A question number stripped from a speaker name; it is put
            # back in front of the next ordinary paragraph.
            removed_number = None
            for speech_part in top_level:
                if hasattr(speech_part, 'name') and \
                        speech_part.name is not None:
                    if speech_part.name == 'b':
                        speaker_name = non_tag_data_in(speech_part)
                        # If there's a leading question number remove
                        # that (and any whitespace)
                        match = re.match(r'^\d+\.\s*', speaker_name)
                        if match:
                            removed_number = match.group(0)
                            speaker_name = speaker_name[match.end():]
                        # If there's a trailing colon, remove that (and
                        # any whitespace)
                        speaker_name = re.sub(r'[\s:]*$', '', speaker_name)
                        current_speech = Speech(tidy_string(speaker_name),
                                                report_date, current_time,
                                                current_url)
                        parsed_page.sections[-1].speeches_and_votes.append(
                            current_speech)
                    elif speech_part.name == 'br':
                        # Ignore the line breaks...
                        pass
                    elif speech_part.name == 'ul':
                        if current_speech is None:
                            raise Exception(
                                "Got a list before any speaker in page "
                                "ID: %d" % (page_id, ))
                        # NOTE(review): attribute access on a
                        # BeautifulSoup Tag looks up a child tag of that
                        # name, so `.html` here probably yields None
                        # rather than the list markup - confirm whether
                        # unicode(speech_part) was intended.
                        current_speech.paragraphs.append(speech_part.html)
                    elif speech_part.name == 'a' and speech_part.text == '':
                        # skip empty a anchors
                        pass
                    else:
                        raise Exception(
                            "Unexpected tag '%s' in page ID: %d" % (
                                speech_part.name, page_id))
                elif isinstance(speech_part, NavigableString):
                    tidied_paragraph = tidy_string(speech_part)
                    if tidied_paragraph == "":
                        # just ignore blank lines
                        continue
                    division_way, division_candidate, division_candidate_id = \
                        is_division_way(tidied_paragraph, report_date)
                    member_vote = is_member_vote(
                        tidied_paragraph,
                        report_date,
                        expecting_a_vote=current_votes)
                    maybe_time = just_time(tidied_paragraph)
                    closed_time = meeting_closed(tidied_paragraph)
                    if closed_time:
                        current_time = closed_time
                    suspended = False
                    suspension_time = None
                    suspended_time_tuple = meeting_suspended(tidied_paragraph)
                    if suspended_time_tuple:
                        # The middle element (the kind of time) is not
                        # needed here.
                        suspended, _, suspension_time = suspended_time_tuple
                    if division_way:
                        # If this is a vote for a particular
                        # candidate, or the introduction to an
                        # oath-taking, add the text as a speech too:
                        if division_candidate:
                            current_speech = Speech(None, report_date,
                                                    current_time, current_url)
                            parsed_page.sections[-1].speeches_and_votes.append(
                                current_speech)
                            current_speech.paragraphs.append(tidied_paragraph)
                        if (not current_votes) or (current_votes.candidate !=
                                                   division_candidate):
                            current_votes = Division(report_date, current_url,
                                                     divnumber,
                                                     division_candidate,
                                                     division_candidate_id)
                            divnumber += 1
                            parsed_page.sections[-1].speeches_and_votes.append(
                                current_votes)
                        current_division_way = division_way
                    elif member_vote:
                        if current_votes is None:
                            raise Exception(
                                "Got a member's vote before an indication "
                                "of which way the vote is")
                        current_votes.add_vote(current_division_way,
                                               tidied_paragraph, member_vote)
                    elif maybe_time:
                        current_time = maybe_time
                    else:
                        # Ordinary text ends any division in progress.
                        if current_votes:
                            current_votes = None
                        if current_speech is None:
                            raise Exception(
                                "Got text before any speaker in page "
                                "ID: %d" % (page_id, ))
                        # If this speech doesn't have any paragraphs
                        # yet, make sure that it has the current time,
                        # so that (for example) if we get a "Meeting
                        # closed at 17:44." at the end, that speech
                        # ends up with that time.
                        if not current_speech.paragraphs:
                            current_speech.last_time = current_time
                        if removed_number and tidied_paragraph:
                            tidied_paragraph = removed_number + tidied_paragraph
                            removed_number = None
                        current_speech.paragraphs.append(tidied_paragraph)
                    # A suspension's time only applies to what follows
                    # this paragraph, so it is set last.
                    if suspended and suspension_time:
                        current_time = suspension_time
                else:
                    raise Exception(
                        "Totally unparsed element:\n%s\n... unhandled in "
                        "page ID: %d" % (speech_part, page_id))

        else:
            raise Exception(
                "There was an unhandled element '%s' in page with ID: %d" % (
                    top_level.name, page_id))

    return parsed_page
def parse_html(session, report_date, soup, page_id, original_url):
    """Parse one report page into a ParsedPage of sections and speeches.

    The page must contain <div id="ReportView"> with exactly one <div>
    child, which in turn holds two <div> children: a contents list made
    only of <a> and <br> tags, followed by the report text.  In the text
    div each <h2> starts a new Section, each top-level <a> carrying an id
    updates the URL fragment used for what follows, and each <div> is
    walked for speaker names (<b>), speech paragraphs, division headings,
    member votes and time markers.

    Calls Speech.reset_speakers_so_far() before parsing the text.

    Arguments:
        session: passed unchanged to ParsedPage.
        report_date: passed to ParsedPage, Speech and Division, and to
            the is_division_way / is_member_vote helpers.
        soup: parsed document searched for <div id="ReportView">.
        page_id: numeric page identifier; passed to ParsedPage and
            formatted with %d into error messages.
        original_url: base URL of the page; "#" plus the id of the most
            recent top-level <a> is appended for later items.

    Returns:
        The ParsedPage built from the page.

    Raises:
        Exception: if the markup does not match the structure described
            above, if text or a list appears before any speaker, or if
            a member's vote appears before a division heading.
        ValueError: if the main div does not have exactly two <div>
            children (raised by the tuple unpacking).
    """
    divnumber = 0
    report_view = soup.find('div', attrs={'id': 'ReportView'})
    div_children_of_report_view = report_view.findChildren('div', recursive=False)
    if len(div_children_of_report_view) != 1:
        raise Exception(
            'We only expect one <div> child of <div id="ReportView">; '
            'there were %d in page with ID %d' % (
                len(div_children_of_report_view), page_id))

    Speech.reset_speakers_so_far()

    main_div = div_children_of_report_view[0]
    top_level_divs = main_div.findChildren('div', recursive=False)

    # The first div should just contain links to sections further down
    # the page; the second one holds the text itself:
    contents_div, text_div = top_level_divs

    # Check the assumption that the first div only contains links.
    # contents_tuples collects (fragment id, link text) pairs; nothing
    # later in this function reads it.
    contents_tuples = []
    for link in contents_div.findAll(True):
        if link.name == 'br':
            continue
        if link.name != 'a':
            raise Exception(
                "There was something other than a <br> or an <a> in the "
                "supposed contents <div>, for page ID: %d" % (page_id,))
        href = link['href']
        m = re.search(r'#(.*)', href)
        if not m:
            raise Exception(
                "Failed to find the ID from '%s' in page with ID: %d" % (
                    href, page_id))
        contents_tuples.append((m.group(1), tidy_string(non_tag_data_in(link))))

    parsed_page = ParsedPage(session, report_date, page_id)

    # Now consider the div that actually has text in it.  Each speech
    # is in its own div, while the rest that we care about are
    # headings:

    current_votes = None
    current_division_way = None
    current_time = None
    current_url = original_url
    # No speech is open until a <b> speaker name (or a division heading
    # naming a candidate) has been seen.
    current_speech = None

    for top_level in text_div:
        # There are sometimes some empty NavigableString elements at
        # the top level, so just ignore those:
        if not unicode(top_level).strip():
            continue
        if top_level.name == 'h2':
            section_title = tidy_string(non_tag_data_in(top_level, tag_replacement=u' '))
            if not section_title:
                raise Exception(
                    "There was an empty section title in page ID: %d" % (
                        page_id,))
            parsed_page.sections.append(
                Section(section_title, current_url))
        elif top_level.name == 'br':
            # Ignore line breaks - we use paragraphs instead
            continue
        elif top_level.name == 'a':
            # Anchors without an id leave the current URL untouched.
            try:
                anchor_id = top_level['id']
            except KeyError:
                pass
            else:
                current_url = original_url + "#" + anchor_id
        elif top_level.name == 'div':
            # This div contains a speech, essentially:

            # The new style pages wrap speeches in p.span tags that we
            # can ignore, so remove them from the tree.  Occasionally
            # there are multiple spans in a p, hence the inner loop.
            # This does mean we are losing some formatting information,
            # but because it's hardcoded style attributes in the spans
            # it's arguable that we'd want to remove them anyway.
            for p in top_level.findChildren('p'):
                if p.span:
                    for span in p.findChildren('span'):
                        span.unwrap()
                    p.unwrap()

            # A question number stripped from a speaker name; it is put
            # back in front of the next ordinary paragraph.
            removed_number = None
            for speech_part in top_level:
                if hasattr(speech_part, 'name') and speech_part.name is not None:
                    if speech_part.name == 'b':
                        speaker_name = non_tag_data_in(speech_part)
                        # If there's a leading question number remove
                        # that (and any whitespace)
                        match = re.match(r'^\d+\.\s*', speaker_name)
                        if match:
                            removed_number = match.group(0)
                            speaker_name = speaker_name[match.end():]
                        # If there's a trailing colon, remove that (and
                        # any whitespace)
                        speaker_name = re.sub(r'[\s:]*$', '', speaker_name)
                        current_speech = Speech(tidy_string(speaker_name),
                                                report_date,
                                                current_time,
                                                current_url)
                        parsed_page.sections[-1].speeches_and_votes.append(current_speech)
                    elif speech_part.name == 'br':
                        # Ignore the line breaks...
                        pass
                    elif speech_part.name == 'ul':
                        if current_speech is None:
                            raise Exception(
                                "Got a list before any speaker in page "
                                "ID: %d" % (page_id,))
                        # NOTE(review): attribute access on a
                        # BeautifulSoup Tag looks up a child tag of that
                        # name, so `.html` here probably yields None
                        # rather than the list markup - confirm whether
                        # unicode(speech_part) was intended.
                        current_speech.paragraphs.append(speech_part.html)
                    elif speech_part.name == 'a' and speech_part.text == '':
                        # skip empty a anchors
                        pass
                    else:
                        raise Exception(
                            "Unexpected tag '%s' in page ID: %d" % (
                                speech_part.name, page_id))
                elif isinstance(speech_part, NavigableString):
                    tidied_paragraph = tidy_string(speech_part)
                    if tidied_paragraph == "":
                        # just ignore blank lines
                        continue
                    division_way, division_candidate, division_candidate_id = is_division_way(tidied_paragraph, report_date)
                    member_vote = is_member_vote(tidied_paragraph, report_date, expecting_a_vote=current_votes)
                    maybe_time = just_time(tidied_paragraph)
                    closed_time = meeting_closed(tidied_paragraph)
                    if closed_time:
                        current_time = closed_time
                    suspended = False
                    suspension_time = None
                    suspended_time_tuple = meeting_suspended(tidied_paragraph)
                    if suspended_time_tuple:
                        # The middle element (the kind of time) is not
                        # needed here.
                        suspended, _, suspension_time = suspended_time_tuple
                    if division_way:
                        # If this is a vote for a particular
                        # candidate, or the introduction to an
                        # oath-taking, add the text as a speech too:
                        if division_candidate:
                            current_speech = Speech(None,
                                                    report_date,
                                                    current_time,
                                                    current_url)
                            parsed_page.sections[-1].speeches_and_votes.append(current_speech)
                            current_speech.paragraphs.append(tidied_paragraph)
                        if (not current_votes) or (current_votes.candidate != division_candidate):
                            current_votes = Division(report_date, current_url, divnumber, division_candidate, division_candidate_id)
                            divnumber += 1
                            parsed_page.sections[-1].speeches_and_votes.append(current_votes)
                        current_division_way = division_way
                    elif member_vote:
                        if current_votes is None:
                            raise Exception(
                                "Got a member's vote before an indication "
                                "of which way the vote is")
                        current_votes.add_vote(current_division_way, tidied_paragraph, member_vote)
                    elif maybe_time:
                        current_time = maybe_time
                    else:
                        # Ordinary text ends any division in progress.
                        if current_votes:
                            current_votes = None
                        if current_speech is None:
                            raise Exception(
                                "Got text before any speaker in page "
                                "ID: %d" % (page_id,))
                        # If this speech doesn't have any paragraphs
                        # yet, make sure that it has the current time,
                        # so that (for example) if we get a "Meeting
                        # closed at 17:44." at the end, that speech
                        # ends up with that time.
                        if not current_speech.paragraphs:
                            current_speech.last_time = current_time
                        if removed_number and tidied_paragraph:
                            tidied_paragraph = removed_number + tidied_paragraph
                            removed_number = None
                        current_speech.paragraphs.append(tidied_paragraph)
                    # A suspension's time only applies to what follows
                    # this paragraph, so it is set last.
                    if suspended and suspension_time:
                        current_time = suspension_time
                else:
                    raise Exception(
                        "Totally unparsed element:\n%s\n... unhandled in "
                        "page ID: %d" % (speech_part, page_id))

        else:
            raise Exception(
                "There was an unhandled element '%s' in page with ID: %d" % (
                    top_level.name, page_id))

    return parsed_page