def parse_html(session, report_date, soup, page_id, original_url):
    """Parse one report page into a ParsedPage of sections, speeches and votes.

    The page is expected to contain ``<div id="ReportView">`` with exactly
    one ``<div>`` child, which in turn has exactly two ``<div>`` children:
    a contents div (only ``<a>`` and ``<br>`` tags) and a text div.  The
    text div is walked element by element: ``<h2>`` starts a new Section,
    ``<a id=...>`` updates the URL attached to following items, and each
    ``<div>`` holds speech content (speaker names in ``<b>``, paragraphs
    as bare text nodes, optionally wrapped in ``<p><span>``).

    Args:
        session: passed through unchanged to ParsedPage.
        report_date: passed to ParsedPage, Speech, Division and to the
            is_division_way / is_member_vote helpers.
        soup: parsed document for the page; must support ``find``,
            ``findChildren``, ``findAll`` and ``unwrap`` (BeautifulSoup-style).
        page_id: identifier of the page; formatted with ``%d`` in error
            messages and passed to ParsedPage.
        original_url: URL of the page; ``#<anchor id>`` is appended to it
            as anchors are encountered.

    Returns:
        The populated ParsedPage.

    Raises:
        Exception: if the page structure differs from what is described
            above (unexpected tags, empty section title, a vote or speech
            text appearing before the heading/speaker it belongs to, ...).
    """
    divnumber = 0

    report_view = soup.find('div', attrs={'id': 'ReportView'})
    div_children_of_report_view = report_view.findChildren(
        'div', recursive=False)
    if len(div_children_of_report_view) != 1:
        raise Exception(
            'We only expect one <div> child of <div id="ReportView">; '
            'there were %d in page with ID %d' % (
                len(div_children_of_report_view), page_id))

    Speech.reset_speakers_so_far()

    main_div = div_children_of_report_view[0]
    top_level_divs = main_div.findChildren('div', recursive=False)

    # The first div should just contain links to sections further down
    # the page; the second holds the actual text.  (Unpacking fails with
    # ValueError if there are not exactly two.)
    contents_div, text_div = top_level_divs

    # Just check that the assumption that the first div only contains
    # links is correct; nothing from it is kept.
    for link in contents_div.findAll(True):
        if link.name == 'br':
            continue
        if link.name != 'a':
            raise Exception(
                "There was something other than a <br> or an <a> in the "
                "supposed contents <div>, for page ID: %d" % (page_id,))
        href = link['href']
        if not re.search(r'#(.*)', href):
            raise Exception(
                "Failed to find the ID from '%s' in page with ID: %d" % (
                    href, page_id))

    parsed_page = ParsedPage(session, report_date, page_id)

    def current_section():
        # Speeches and votes are always attached to the most recent
        # section; fail with a clear message instead of a bare IndexError
        # when no <h2> has been seen yet.
        if not parsed_page.sections:
            raise Exception(
                "Found a speech or vote before any section heading "
                "in page ID: %d" % (page_id,))
        return parsed_page.sections[-1]

    # Now consider the div that actually has text in it.  Each speech
    # is in its own div, while the rest that we care about are
    # headings:

    current_votes = None
    current_division_way = None
    current_time = None
    current_url = original_url
    # Deliberately carried across speech divs: text in a later div is
    # appended to the most recently started speech.
    current_speech = None

    for top_level in text_div:
        # There are sometimes some empty NavigableString elements at
        # the top level, so just ignore those:
        if not unicode(top_level).strip():
            continue

        if top_level.name == 'h2':
            section_title = tidy_string(
                non_tag_data_in(top_level, tag_replacement=u' '))
            if not section_title:
                raise Exception(
                    "There was an empty section title in page ID: %d" % (
                        page_id,))
            parsed_page.sections.append(Section(section_title, current_url))

        elif top_level.name == 'br':
            # Ignore line breaks - we use paragraphs instead
            continue

        elif top_level.name == 'a':
            try:
                anchor_id = top_level['id']
            except KeyError:
                # Anchors without an id leave the current URL unchanged.
                pass
            else:
                current_url = original_url + "#" + anchor_id

        elif top_level.name == 'div':
            # This div contains a speech, essentially.
            #
            # The new style pages wrap speeches in p.span tags that we
            # can ignore, so remove them from the tree.  Occasionally
            # there are multiple spans in a p, hence the inner loop.
            # This does mean we are losing some formatting information,
            # but because it's hardcoded style attributes in the spans
            # it's arguable that we'd want to remove them anyway.
            for p in top_level.findChildren('p'):
                if p.span:
                    for span in p.findChildren('span'):
                        span.unwrap()
                    p.unwrap()

            # A leading question number stripped from a speaker name is
            # held here and prepended to that speaker's first paragraph.
            removed_number = None

            for speech_part in top_level:
                if getattr(speech_part, 'name', None) is not None:
                    if speech_part.name == 'b':
                        speaker_name = non_tag_data_in(speech_part)
                        # If there's a leading question number remove
                        # that (and any whitespace)
                        match = re.match(r'^\d+\.\s*', speaker_name)
                        if match:
                            removed_number = match.group(0)
                            speaker_name = speaker_name[match.end():]
                        # If there's a trailing colon, remove that (and
                        # any whitespace)
                        speaker_name = re.sub(r'[\s:]*$', '', speaker_name)
                        current_speech = Speech(tidy_string(speaker_name),
                                                report_date,
                                                current_time,
                                                current_url)
                        current_section().speeches_and_votes.append(
                            current_speech)
                    elif speech_part.name == 'br':
                        # Ignore the line breaks...
                        pass
                    elif speech_part.name == 'ul':
                        if current_speech is None:
                            raise Exception(
                                "Found speech content before any speaker "
                                "in page ID: %d" % (page_id,))
                        # NOTE(review): on a BeautifulSoup Tag, `.html`
                        # looks like a child-tag lookup, so this may
                        # append None rather than the list's markup -
                        # confirm what consumers of `paragraphs` expect.
                        current_speech.paragraphs.append(speech_part.html)
                    elif speech_part.name == 'a' and speech_part.text == '':
                        # skip empty a anchors
                        pass
                    else:
                        raise Exception(
                            "Unexpected tag '%s' in page ID: %d" % (
                                speech_part.name, page_id))

                elif isinstance(speech_part, NavigableString):
                    tidied_paragraph = tidy_string(speech_part)
                    if tidied_paragraph == "":
                        # just ignore blank lines
                        continue

                    (division_way,
                     division_candidate,
                     division_candidate_id) = is_division_way(
                        tidied_paragraph, report_date)
                    member_vote = is_member_vote(
                        tidied_paragraph,
                        report_date,
                        expecting_a_vote=current_votes)
                    maybe_time = just_time(tidied_paragraph)

                    closed_time = meeting_closed(tidied_paragraph)
                    if closed_time:
                        current_time = closed_time

                    suspended_time_tuple = meeting_suspended(tidied_paragraph)
                    if suspended_time_tuple:
                        suspended, _, suspension_time = suspended_time_tuple
                    else:
                        suspended = False
                        suspension_time = None

                    if division_way:
                        # If this is a vote for a particular
                        # candidate, or the introduction to an
                        # oath-taking, add the text as a speech too:
                        if division_candidate:
                            current_speech = Speech(None,
                                                    report_date,
                                                    current_time,
                                                    current_url)
                            current_section().speeches_and_votes.append(
                                current_speech)
                            current_speech.paragraphs.append(tidied_paragraph)
                        if (not current_votes) or (
                                current_votes.candidate != division_candidate):
                            current_votes = Division(report_date,
                                                     current_url,
                                                     divnumber,
                                                     division_candidate,
                                                     division_candidate_id)
                            divnumber += 1
                            current_section().speeches_and_votes.append(
                                current_votes)
                        current_division_way = division_way
                    elif member_vote:
                        if current_votes is None:
                            raise Exception(
                                "Got a member's vote before an indication "
                                "of which way the vote is")
                        current_votes.add_vote(current_division_way,
                                               tidied_paragraph,
                                               member_vote)
                    elif maybe_time:
                        current_time = maybe_time
                    else:
                        # Ordinary text ends any division in progress.
                        if current_votes:
                            current_votes = None
                        if current_speech is None:
                            raise Exception(
                                "Found speech content before any speaker "
                                "in page ID: %d" % (page_id,))
                        # If this speech doesn't have any paragraphs
                        # yet, make sure that it has the current time,
                        # so that (for example) if we get a "Meeting
                        # closed at 17:44." at the end, that speech
                        # ends up with that time.
                        if not current_speech.paragraphs:
                            current_speech.last_time = current_time
                        if removed_number and tidied_paragraph:
                            tidied_paragraph = removed_number + tidied_paragraph
                            removed_number = None
                        current_speech.paragraphs.append(tidied_paragraph)

                    # Text following a suspension is timed from the
                    # time given in the suspension notice.
                    if suspended and suspension_time:
                        current_time = suspension_time

                else:
                    raise Exception(
                        "Totally unparsed element:\n%s\n... unhandled in "
                        "page ID: %d" % (speech_part, page_id))

        else:
            raise Exception(
                "There was an unhandled element '%s' in page with ID: %d" % (
                    top_level.name, page_id))

    return parsed_page
def parse_html(session, report_date, soup, page_id, original_url):
    """Parse one report page into a ParsedPage of sections, speeches and votes.

    The page is expected to contain ``<div id="ReportView">`` with exactly
    one ``<div>`` child, which in turn has exactly two ``<div>`` children:
    a contents div (only ``<a>`` and ``<br>`` tags) and a text div.  The
    text div is walked element by element: ``<h2>`` starts a new Section,
    ``<a id=...>`` updates the URL attached to following items, and each
    ``<div>`` holds speech content (speaker names in ``<b>``, paragraphs
    as bare text nodes, optionally wrapped in ``<p><span>``).

    Args:
        session: passed through unchanged to ParsedPage.
        report_date: passed to ParsedPage, Speech, Division and to the
            is_division_way / is_member_vote helpers.
        soup: parsed document for the page; must support ``find``,
            ``findChildren``, ``findAll`` and ``unwrap`` (BeautifulSoup-style).
        page_id: identifier of the page; formatted with ``%d`` in error
            messages and passed to ParsedPage.
        original_url: URL of the page; ``#<anchor id>`` is appended to it
            as anchors are encountered.

    Returns:
        The populated ParsedPage.

    Raises:
        Exception: if the page structure differs from what is described
            above (unexpected tags, empty section title, a vote or speech
            text appearing before the heading/speaker it belongs to, ...).
    """
    divnumber = 0

    report_view = soup.find('div', attrs={'id': 'ReportView'})
    div_children_of_report_view = report_view.findChildren(
        'div', recursive=False)
    if len(div_children_of_report_view) != 1:
        raise Exception(
            'We only expect one <div> child of <div id="ReportView">; '
            'there were %d in page with ID %d' % (
                len(div_children_of_report_view), page_id))

    Speech.reset_speakers_so_far()

    main_div = div_children_of_report_view[0]
    top_level_divs = main_div.findChildren('div', recursive=False)

    # The first div should just contain links to sections further down
    # the page; the second holds the actual text.  (Unpacking fails with
    # ValueError if there are not exactly two.)
    contents_div, text_div = top_level_divs

    # Just check that the assumption that the first div only contains
    # links is correct; nothing from it is kept.
    for link in contents_div.findAll(True):
        if link.name == 'br':
            continue
        if link.name != 'a':
            raise Exception(
                "There was something other than a <br> or an <a> in the "
                "supposed contents <div>, for page ID: %d" % (page_id,))
        href = link['href']
        if not re.search(r'#(.*)', href):
            raise Exception(
                "Failed to find the ID from '%s' in page with ID: %d" % (
                    href, page_id))

    parsed_page = ParsedPage(session, report_date, page_id)

    def current_section():
        # Speeches and votes are always attached to the most recent
        # section; fail with a clear message instead of a bare IndexError
        # when no <h2> has been seen yet.
        if not parsed_page.sections:
            raise Exception(
                "Found a speech or vote before any section heading "
                "in page ID: %d" % (page_id,))
        return parsed_page.sections[-1]

    # Now consider the div that actually has text in it.  Each speech
    # is in its own div, while the rest that we care about are
    # headings:

    current_votes = None
    current_division_way = None
    current_time = None
    current_url = original_url
    # Deliberately carried across speech divs: text in a later div is
    # appended to the most recently started speech.
    current_speech = None

    for top_level in text_div:
        # There are sometimes some empty NavigableString elements at
        # the top level, so just ignore those:
        if not unicode(top_level).strip():
            continue

        if top_level.name == 'h2':
            section_title = tidy_string(
                non_tag_data_in(top_level, tag_replacement=u' '))
            if not section_title:
                raise Exception(
                    "There was an empty section title in page ID: %d" % (
                        page_id,))
            parsed_page.sections.append(Section(section_title, current_url))

        elif top_level.name == 'br':
            # Ignore line breaks - we use paragraphs instead
            continue

        elif top_level.name == 'a':
            try:
                anchor_id = top_level['id']
            except KeyError:
                # Anchors without an id leave the current URL unchanged.
                pass
            else:
                current_url = original_url + "#" + anchor_id

        elif top_level.name == 'div':
            # This div contains a speech, essentially.
            #
            # The new style pages wrap speeches in p.span tags that we
            # can ignore, so remove them from the tree.  Occasionally
            # there are multiple spans in a p, hence the inner loop.
            # This does mean we are losing some formatting information,
            # but because it's hardcoded style attributes in the spans
            # it's arguable that we'd want to remove them anyway.
            for p in top_level.findChildren('p'):
                if p.span:
                    for span in p.findChildren('span'):
                        span.unwrap()
                    p.unwrap()

            # A leading question number stripped from a speaker name is
            # held here and prepended to that speaker's first paragraph.
            removed_number = None

            for speech_part in top_level:
                if getattr(speech_part, 'name', None) is not None:
                    if speech_part.name == 'b':
                        speaker_name = non_tag_data_in(speech_part)
                        # If there's a leading question number remove
                        # that (and any whitespace)
                        match = re.match(r'^\d+\.\s*', speaker_name)
                        if match:
                            removed_number = match.group(0)
                            speaker_name = speaker_name[match.end():]
                        # If there's a trailing colon, remove that (and
                        # any whitespace)
                        speaker_name = re.sub(r'[\s:]*$', '', speaker_name)
                        current_speech = Speech(tidy_string(speaker_name),
                                                report_date,
                                                current_time,
                                                current_url)
                        current_section().speeches_and_votes.append(
                            current_speech)
                    elif speech_part.name == 'br':
                        # Ignore the line breaks...
                        pass
                    elif speech_part.name == 'ul':
                        if current_speech is None:
                            raise Exception(
                                "Found speech content before any speaker "
                                "in page ID: %d" % (page_id,))
                        # NOTE(review): on a BeautifulSoup Tag, `.html`
                        # looks like a child-tag lookup, so this may
                        # append None rather than the list's markup -
                        # confirm what consumers of `paragraphs` expect.
                        current_speech.paragraphs.append(speech_part.html)
                    elif speech_part.name == 'a' and speech_part.text == '':
                        # skip empty a anchors
                        pass
                    else:
                        raise Exception(
                            "Unexpected tag '%s' in page ID: %d" % (
                                speech_part.name, page_id))

                elif isinstance(speech_part, NavigableString):
                    tidied_paragraph = tidy_string(speech_part)
                    if tidied_paragraph == "":
                        # just ignore blank lines
                        continue

                    (division_way,
                     division_candidate,
                     division_candidate_id) = is_division_way(
                        tidied_paragraph, report_date)
                    member_vote = is_member_vote(
                        tidied_paragraph,
                        report_date,
                        expecting_a_vote=current_votes)
                    maybe_time = just_time(tidied_paragraph)

                    closed_time = meeting_closed(tidied_paragraph)
                    if closed_time:
                        current_time = closed_time

                    suspended_time_tuple = meeting_suspended(tidied_paragraph)
                    if suspended_time_tuple:
                        suspended, _, suspension_time = suspended_time_tuple
                    else:
                        suspended = False
                        suspension_time = None

                    if division_way:
                        # If this is a vote for a particular
                        # candidate, or the introduction to an
                        # oath-taking, add the text as a speech too:
                        if division_candidate:
                            current_speech = Speech(None,
                                                    report_date,
                                                    current_time,
                                                    current_url)
                            current_section().speeches_and_votes.append(
                                current_speech)
                            current_speech.paragraphs.append(tidied_paragraph)
                        if (not current_votes) or (
                                current_votes.candidate != division_candidate):
                            current_votes = Division(report_date,
                                                     current_url,
                                                     divnumber,
                                                     division_candidate,
                                                     division_candidate_id)
                            divnumber += 1
                            current_section().speeches_and_votes.append(
                                current_votes)
                        current_division_way = division_way
                    elif member_vote:
                        if current_votes is None:
                            raise Exception(
                                "Got a member's vote before an indication "
                                "of which way the vote is")
                        current_votes.add_vote(current_division_way,
                                               tidied_paragraph,
                                               member_vote)
                    elif maybe_time:
                        current_time = maybe_time
                    else:
                        # Ordinary text ends any division in progress.
                        if current_votes:
                            current_votes = None
                        if current_speech is None:
                            raise Exception(
                                "Found speech content before any speaker "
                                "in page ID: %d" % (page_id,))
                        # If this speech doesn't have any paragraphs
                        # yet, make sure that it has the current time,
                        # so that (for example) if we get a "Meeting
                        # closed at 17:44." at the end, that speech
                        # ends up with that time.
                        if not current_speech.paragraphs:
                            current_speech.last_time = current_time
                        if removed_number and tidied_paragraph:
                            tidied_paragraph = removed_number + tidied_paragraph
                            removed_number = None
                        current_speech.paragraphs.append(tidied_paragraph)

                    # Text following a suspension is timed from the
                    # time given in the suspension notice.
                    if suspended and suspension_time:
                        current_time = suspension_time

                else:
                    raise Exception(
                        "Totally unparsed element:\n%s\n... unhandled in "
                        "page ID: %d" % (speech_part, page_id))

        else:
            raise Exception(
                "There was an unhandled element '%s' in page with ID: %d" % (
                    top_level.name, page_id))

    return parsed_page