def get_page_title(soup):
    """Extract and clean up the title tag from a page

    Returns a string with the page title minus cruft"""
    # Keep only the text before the first colon, then normalize
    # whitespace and entities with tidy_string().
    raw_title = soup.title.text
    stripped = re.sub('(?ims):.*$', '', raw_title)
    return tidy_string(stripped)
def find_id_and_possible_holding_date(self,tag_or_string):
    # Try to extract a Scottish Parliament question ID (the regex below
    # matches text like "(S3W- 12345)", with an optional stray space
    # after the hyphen) from either a BeautifulSoup Tag or a plain
    # string.  If an ID is found it is recorded via set_id() and True
    # is returned; otherwise False.
    if tag_or_string.__class__ == Tag:
        if verbose: print "Parsing tag: "+str(tag_or_string)
        s = tidy_string(non_tag_data_in(tag_or_string))
    else:
        if verbose: print "Parsing string: "+tag_or_string
        s = tag_or_string
    # find_holding_answer_issued() returns the remainder of the string
    # when a "holding answer issued" preamble is present (and records
    # the date as a side effect); falsy otherwise.
    rest = self.find_holding_answer_issued(s)
    if rest:
        s = rest
    if len(s) > 0:
        m = re.search('\((S[0-9][A-Z0-9]+\-) ?([0-9]+)\)',s)
        if m:
            # Re-join the two halves of the ID without the stray space.
            sp_id = m.group(1) + m.group(2)
            self.set_id(sp_id)
            return True
    return False
def find_id_and_possible_holding_date(self, tag_or_string): if tag_or_string.__class__ == Tag: if verbose: print "Parsing tag: " + str(tag_or_string) s = tidy_string(non_tag_data_in(tag_or_string)) else: if verbose: print "Parsing string: " + tag_or_string s = tag_or_string rest = self.find_holding_answer_issued(s) if rest: s = rest if len(s) > 0: m = re.search('\((S[0-9][A-Z0-9]+\-) ?([0-9]+)\)', s) if m: sp_id = m.group(1) + m.group(2) self.set_id(sp_id) return True return False
def parse(self,filename): m = re.match('(?ims)^.*(day-(wa-\d\d)_([a-z0-9]+\.htm))',filename) if not m: raise Exception, "Couldn't parse filename: "+filename self.original_url = "http://www.scottish.parliament.uk/business/pqa/%s/%s" % (m.group(2),m.group(3)) filename_leaf = m.group(1) # We need to know what date this is, so deal with that first # of all in a brutish fashion, but cache the results: self.date = None if file_to_date.has_key(filename_leaf): if verbose: print "Found file to date mapping in cache." self.date = datetime.date(*strptime(file_to_date[filename_leaf],"%Y-%m-%d")[0:3]) else: self.make_soup(filename) page_as_text = tidy_string(non_tag_data_in(self.soup.find('body'))) m = re.search('(?ims) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) (\d+)\w* (\w+) (\d+)?',page_as_text) if m: day_of_week = m.group(1) day = m.group(2) month = month_name_to_int(m.group(3)) year = m.group(4) # Sometimes the date string doesn't have the year: if not year: m = re.search('day-wa-(\d\d)',filename) if m.group(1) == '99': year = '1999' else: year = '20' + m.group(1) self.date = datetime.date( int(year,10), month, int(day,10) ) if not options.quiet: "Adding file to date mapping to cache." 
add_file_to_date_mapping(filename_leaf,str(self.date)) else: raise Exception, "No date found in file: "+filename temp_output_filename = xml_output_directory + "tmp.xml" output_filename = xml_output_directory + "spwa" + str(self.date) + ".xml" if os.path.exists(output_filename): #error = "The output file "+output_filename+" already exists - skipping "+re.sub('^.*/','',filename) # raise Exception, error #if not options.quiet: print error return if not options.quiet: print "Parsing %s" % filename self.make_soup(filename) self.ofp = open(temp_output_filename,"w") self.ofp.write('''<?xml version="1.0" encoding="utf-8"?> <!DOCTYPE publicwhip [ <!ENTITY pound "£"> <!ENTITY euro "€"> <!ENTITY agrave "à"> <!ENTITY aacute "á"> <!ENTITY egrave "è"> <!ENTITY eacute "é"> <!ENTITY ecirc "ê"> <!ENTITY iacute "í"> <!ENTITY ograve "ò"> <!ENTITY oacute "ó"> <!ENTITY uacute "ú"> <!ENTITY Aacute "Á"> <!ENTITY Eacute "É"> <!ENTITY Iacute "Í"> <!ENTITY Oacute "Ó"> <!ENTITY Uacute "Ú"> <!ENTITY Uuml "Ü"> <!ENTITY auml "ä"> <!ENTITY euml "ë"> <!ENTITY iuml "ï"> <!ENTITY ouml "ö"> <!ENTITY uuml "ü"> <!ENTITY fnof "ƒ"> <!ENTITY aelig "æ"> <!ENTITY dagger "†"> <!ENTITY reg "®"> <!ENTITY nbsp " "> <!ENTITY shy "­"> <!ENTITY deg "°"> <!ENTITY middot "·"> <!ENTITY ordm "º"> <!ENTITY ndash "–"> <!ENTITY mdash "—"> <!ENTITY lsquo "‘"> <!ENTITY rsquo "’"> <!ENTITY ldquo "“"> <!ENTITY rdquo "”"> <!ENTITY hellip "…"> <!ENTITY bull "•"> <!ENTITY acirc "â"> <!ENTITY Agrave "À"> <!ENTITY Aring "Å"> <!ENTITY aring "å"> <!ENTITY atilde "ã"> <!ENTITY Ccedil "Ç"> <!ENTITY ccedil "ç"> <!ENTITY Egrave "È"> <!ENTITY Icirc "Î"> <!ENTITY icirc "î"> <!ENTITY Igrave "Ì"> <!ENTITY igrave "ì"> <!ENTITY ntilde "ñ"> <!ENTITY ocirc "ô"> <!ENTITY oelig "œ"> <!ENTITY Ograve "Ò"> <!ENTITY Oslash "Ø"> <!ENTITY oslash "ø"> <!ENTITY Scaron "Š"> <!ENTITY scaron "š"> <!ENTITY sup1 "¹"> <!ENTITY sup2 "²"> <!ENTITY sup3 "³"> <!ENTITY ugrave "ù"> <!ENTITY ucirc "û"> <!ENTITY Ugrave "Ù"> <!ENTITY yacute "ý"> <!ENTITY frac12 "½"> 
<!ENTITY micro "µ"> <!ENTITY sbquo "‚"> <!ENTITY trade "™"> <!ENTITY Dagger "‡"> <!ENTITY radic "√"> ]> <publicwhip> ''') self.ofp.write("<source url=\"%s\"/>" % self.original_url ) tag_with_most_paragraphs = None most_paragraphs_so_far = -1 for t in self.soup.findAll(True): ps = paragraphs_in_tag(t) if ps > most_paragraphs_so_far: tag_with_most_paragraphs = t most_paragraphs_so_far = ps if verbose: print "Using element name: "+tag_with_most_paragraphs.name+" with "+str(most_paragraphs_so_far)+" paragraphs from "+filename if verbose: print tag_with_most_paragraphs.prettify() # When we're parsing we might have multiple questions in a # row. We say that something's a question rather than an # answer if (a) it's followed by an ID or (b) it begins with # "To ask", otherwise it's an answer. If we hit a new # heading, that suggests that the previous thing was an answer # as well. # The business of "Holding answers" is a bit confusing. At # the bottom of each page there may be a list of question IDs # which were given holding answers, but the text of the # question is not in the page - you only find it when the # question is eventually answered. for t in tag_with_most_paragraphs: if t.__class__ == NavigableString: s = str(t) s = re.sub('(?ims)\s+',' ',s) if re.match('(?ims)^\s*$',s): continue else: self.add_to_paragraph(tidy_string(str(t))) if verbose: print "string: "+str(s) elif t.__class__ == Tag: # Look for any <a name=""> tags in here: a = t.find( lambda p: p.name == 'a' and p.has_key('name') ) if a: self.sp_name = a['name'] if t.has_key('align') and t['align'].lower() == 'right': # Right aligned tags just have the question ID. 
if self.find_id_and_possible_holding_date(t): self.complete_question() else: if verbose: print "Couldn't parse top-level right aligned tag: "+str(t) elif t.has_key('class') and t['class'] == 'largeHeading': self.add_large_heading(tidy_string(non_tag_data_in(t))) elif self.something_centered(t) or self.c1_heading(t): # Centred tags are headings for questions... s = tidy_string(non_tag_data_in(t)) if len(s) > 0: self.complete_answer() if verbose: print "center: "+s self.add_heading(s) elif t.name == 'table': # This is probably a table that's inserted just to # right align the question ID. The left cell may # contain something to indicate that it's a # holding answer. if self.find_id_and_possible_holding_date(t): # Then also look for the "Holding answer # issued" details... s = non_tag_data_in(t) self.find_holding_answer_issued(s) self.complete_question() else: # Then maybe it's a table as part of the # answer, so add it as a paragraph. self.add_paragraph(str(t)) elif t.name == 'p': if re.search("(The following questions were given holding answers|Questions given holding answers)",tidy_string(non_tag_data_in(t))): if verbose: print "Found the trailing holding question list!" # This indicates the end of the day's report # for us (just ignore the following list of # answers - it's not very interesting until we # parse some later day and we can tell what # the question was...) 
break if verbose: print "Didn't find the trailing holding question list in: "+non_tag_data_in(t) non_empty_contents = filter( lambda x: x.__class__ != NavigableString or not re.match('^\s*$',x), t.contents ) if len(non_empty_contents) == 0: continue initial_strong_text = '' while len(non_empty_contents) > 0 and non_empty_contents[0].__class__ == Tag and (non_empty_contents[0].name == 'strong' or non_empty_contents[0].name == 'b'): initial_strong_text += " " + non_tag_data_in(non_empty_contents[0]) non_empty_contents = non_empty_contents[1:] if len(initial_strong_text) > 0: speaker_name = tidy_string(initial_strong_text) # In some files this will be the ID (possibly # plus holding indication), not right aligned # as usual :( if self.find_id_and_possible_holding_date(speaker_name): self.complete_question() else: speaker_name = re.sub('(?ims)\s*:\s*$','',speaker_name) speaker_id = self.valid_speaker(speaker_name) if speaker_name and speaker_id: self.complete_answer() self.set_speaker(speaker_name,speaker_id) for e in non_empty_contents: s = tidy_string(str(e)) self.add_to_paragraph(s) else: self.add_paragraph_removing_enclosure(t) else: self.add_paragraph_removing_enclosure(t) elif t.name == 'div' or t.name == 'blockquote' or t.name == 'ol' or t.name == 'ul' or t.name == 'center': # Just add them in a paragraph anyway, even though # that wouldn't be valid HTML 4 strict in the case # of the last three (IIRC) self.add_paragraph(str(t)) else: # Well, if it's empty of text we don't care... s = non_tag_data_in(t) if not re.match('(?ims)^\s*$',s): raise Exception, "Unknown tag found of name '"+t.name+"' with text: "+t.prettify() self.complete_answer() # Now output all the XML, working out IDs for each element. # IDs are of the form: # # uk.org.publicwhip/spwa/YYYY-MM-DD.X.T # # .... 
where: # - YYYY-MM-DD is an ISO 8601 date # # - X is a integer starting at 0 on each day, which # should be incremented for each new heading and # be the same for a group of questions and their # answer. # # - T is "mh" or "h" for major and minor headings, # "q0", "q1", "q2", etc. for each group of # questions and "r0", "r1", etc. for the answers x = -1 last_heading = None current_sp_id = None index = 0 for i in range(0,len(self.all_stuff)): if i > 0: previous = self.all_stuff[i-1] else: previous = None if i < (len(self.all_stuff) - 1): next = self.all_stuff[i+1] else: next = None a = self.all_stuff[i] self.ofp.write('\n\n') if a.__class__ == Heading: last_was_answer = True if a.major: subtype = "mh" else: subtype = "h" if next and next.__class__ == QuestionOrReply and next.sp_id: # Then use the question's sp_id: self.ofp.write(a.to_xml(self.get_id(next.sp_id,subtype))) else: x += 1 self.ofp.write(a.to_xml(self.get_id(str(x),subtype))) last_heading = a elif a.__class__ == QuestionOrReply: # Occasionally we think questions are actually # answers, so check the beginning of the first # paragraph: if not a.is_question and len(a.paragraphs) > 0 and re.search('^(?ims)\s*To\s+ask',a.paragraphs[0]): a.is_question = True # If we're suddenly in an answer, reset index. if (not a.is_question) and previous and not (previous.__class__ == QuestionOrReply and not previous.is_question): index = 0 # If we're suddenly in a question, reset index and increment x unless the previous is a heading elif a.is_question: if previous: if previous.__class__ == QuestionOrReply: if previous.is_question: # If the one before is a question, that's fine. current_sp_id = a.sp_id else: current_sp_id = a.sp_id # If the previous one was an answer # then we need to replay the last # heading: if not last_heading: raise Exception, "Somehow there's been no heading so far." 
last_heading.sp_name = a.sp_name if current_sp_id: self.ofp.write(last_heading.to_xml(self.get_id(current_sp_id,"h"))) else: x += 1 self.ofp.write(last_heading.to_xml(self.get_id(str(x),"h"))) self.ofp.write("\n\n") index = 0 else: # i.e. this is the normal case, a question after a heading: current_sp_id = a.sp_id index = 0 else: raise Exception, "Nothing before the first question (no heading)" if a.is_question: subtype = "q" + str(index) else: subtype = "r" + str(index) if current_sp_id: self.ofp.write(a.to_xml(self.get_id(current_sp_id,subtype))) else: self.ofp.write(a.to_xml(self.get_id(str(x),subtype))) index += 1 self.ofp.write("</publicwhip>") self.ofp.close() retcode = call( [ "mv", temp_output_filename, output_filename ] ) if retcode != 0: raise Exception, "Moving "+temp_output_filename+" to "+output_filename+" failed." xmlvalidate.parse(output_filename) #retcode = call( [ "xmlstarlet", "val", output_filename ] ) #if retcode != 0: # raise Exception, "Validating "+output_filename+" for well-formedness failed." fil = open('%schangedates.txt' % xml_output_directory, 'a+') fil.write('%d,spwa%s.xml\n' % (time.time(), self.date)) fil.close()
def add_paragraph_removing_enclosure(self, t):
    """Add t's HTML as a paragraph, stripping one enclosing <p>...</p>."""
    # Keep only the inner markup of a surrounding <p ...> ... </p>
    # pair (if present), tidy the whitespace, and add the result.
    html = str(t)
    unwrapped = re.sub('(?ims)^\s*<p[^>]*>\s*(.*)</p[^>]*>\s*$', r'\1', html)
    self.add_paragraph(tidy_string(unwrapped))
def parse_html(session, report_date, soup, page_id, original_url):
    # Parse one official-report page into a ParsedPage: walks the
    # contents/text <div>s under <div id="ReportView">, building
    # Section, Speech and Division objects as it goes.  Raises
    # Exception on any page structure it does not recognise.
    divnumber = 0
    report_view = soup.find('div', attrs={'id': 'ReportView'})
    div_children_of_report_view = report_view.findChildren('div', recursive=False)
    if len(div_children_of_report_view) != 1:
        raise Exception, 'We only expect one <div> child of <div id="ReportView">; there were %d in page with ID %d' % (
            len(div_children_of_report_view), page_id)
    Speech.reset_speakers_so_far()
    main_div = div_children_of_report_view[0]
    top_level_divs = main_div.findChildren('div', recursive=False)
    # The first div should just contain links to sections further down
    # the page:
    contents_div, text_div = top_level_divs
    # Just check that my assumption that the first div only contains
    # links is correct:
    contents_tuples = []
    contents_links = contents_div.findAll(True)
    for link in contents_links:
        if link.name == 'br':
            continue
        if link.name != 'a':
            raise Exception, "There was something other than a <br> or an <a> in the supposed contents <div>, for page ID: %d" % (
                page_id, )
        href = link['href']
        # The fragment after '#' is the in-page anchor ID:
        m = re.search(r'#(.*)', href)
        if not m:
            raise Exception, "Failed to find the ID from '%s' in page with ID: %d" % (
                href, page_id)
        contents_tuples.append(
            (m.group(1), tidy_string(non_tag_data_in(link))))
    parsed_page = ParsedPage(session, report_date, page_id)
    # Now consider the div that actually has text in it.  Each speech
    # is in its own div, while the rest that we care about are
    # headings:
    current_votes = None
    current_division_way = None
    current_time = None
    current_url = original_url
    for top_level in text_div:
        # There are sometimes some empty NavigableString elements at
        # the top level, so just ignore those:
        if not len(unicode(top_level).strip()):
            continue
        if top_level.name == 'h2':
            section_title = tidy_string(
                non_tag_data_in(top_level, tag_replacement=u' '))
            if not section_title:
                raise Exception, "There was an empty section title in page ID: %d" % (
                    page_id)
            parsed_page.sections.append(
                Section(section_title, current_url))
        elif top_level.name in ('br', ):
            # Ignore line breaks - we use paragraphs instead
            continue
        elif top_level.name == 'a':
            # Anchors update the URL fragment used for subsequent
            # sections/speeches; anchors without an id are skipped.
            try:
                current_url = original_url + "#" + top_level['id']
            except KeyError:
                pass
        elif top_level.name == 'div':
            # This div contains a speech, essentially:
            # the new style pages wraps speeches in p.span tags that we can ignore so
            # remove them from the tree. Occasionally there are multiple spans in a p
            # hence the for loop
            # This does mean we are losing some formatting information but because it's
            # hardcoded style attributes in the spans it's arguable that we'd want to
            # remove them anyway.
            for p in top_level.findChildren('p'):
                if p.span:
                    for span in p.findChildren('span'):
                        span.unwrap()
                    p.unwrap()
            removed_number = None
            for speech_part in top_level:
                if hasattr(speech_part, 'name') and speech_part.name != None:
                    if speech_part.name == 'b':
                        # A <b> tag starts a new speech; its text is
                        # the speaker's name.
                        speaker_name = non_tag_data_in(speech_part)
                        # If there's a leading question number remove that (and any whitespace),
                        # remembering it so it can be re-attached to the first paragraph below.
                        match = re.match(r'^\d+\.\s*', speaker_name)
                        if match:
                            speaker_name = re.sub(r'^\d+\.\s*', '', speaker_name)
                            removed_number = match.group(0)
                        # If there's a trailing colon, remove that (and any whitespace)
                        speaker_name = re.sub(r'[\s:]*$', '', speaker_name)
                        current_speech = Speech(tidy_string(speaker_name),
                                                report_date, current_time,
                                                current_url)
                        parsed_page.sections[-1].speeches_and_votes.append(
                            current_speech)
                    elif speech_part.name == 'br':
                        # Ignore the line breaks...
                        pass
                    elif speech_part.name == 'ul':
                        current_speech.paragraphs.append(speech_part.html)
                    elif speech_part.name == 'a' and speech_part.text == '':
                        # skip empty a anchors
                        pass
                    else:
                        raise Exception, "Unexpected tag '%s' in page ID: %d" % (
                            speech_part.name, page_id)
                elif isinstance(speech_part, NavigableString):
                    tidied_paragraph = tidy_string(speech_part)
                    if tidied_paragraph == "":
                        # just ignore blank lines
                        continue
                    # print "tidied_paragraph is", tidied_paragraph.encode('utf-8'), "of type", type(tidied_paragraph)
                    # Classify the text: division heading, member's
                    # vote, timestamp, meeting closed/suspended, or an
                    # ordinary speech paragraph.
                    division_way, division_candidate, division_candidate_id = is_division_way(
                        tidied_paragraph, report_date)
                    member_vote = is_member_vote(
                        tidied_paragraph, report_date,
                        expecting_a_vote=current_votes)
                    maybe_time = just_time(tidied_paragraph)
                    closed_time = meeting_closed(tidied_paragraph)
                    if closed_time:
                        current_time = closed_time
                    suspended_time_tuple = meeting_suspended(tidied_paragraph)
                    if suspended_time_tuple:
                        suspended, suspension_time_type, suspension_time = suspended_time_tuple
                    else:
                        suspended = False
                        suspension_time_type = suspension_time = None
                    if division_way:
                        # If this is a vote for a particular
                        # candidate, or the introduction to an
                        # oath-taking, add the text as a speech too:
                        if division_candidate:
                            current_speech = Speech(None, report_date,
                                                    current_time, current_url)
                            parsed_page.sections[-1].speeches_and_votes.append(
                                current_speech)
                            current_speech.paragraphs.append(tidied_paragraph)
                        # Start a new Division unless we are already
                        # collecting votes for this same candidate:
                        if (not current_votes) or (current_votes.candidate != division_candidate):
                            current_votes = Division(report_date, current_url,
                                                     divnumber, division_candidate,
                                                     division_candidate_id)
                            divnumber += 1
                            parsed_page.sections[-1].speeches_and_votes.append(
                                current_votes)
                        current_division_way = division_way
                    elif member_vote:
                        if current_votes is None:
                            raise Exception, "Got a member's vote before an indication of which way the vote is"
                        current_votes.add_vote(current_division_way,
                                               tidied_paragraph, member_vote)
                    elif maybe_time:
                        current_time = maybe_time
                    else:
                        # Ordinary paragraph text ends any division in
                        # progress.
                        if current_votes:
                            current_votes = None
                        # If this speech doesn't have any paragraphs
                        # yet, make sure that it has the current time,
                        # so that (for example) if we get a "Meeting
                        # closed at 17:44." at the end, that speech
                        # ends up with that time.
                        if len(current_speech.paragraphs) == 0:
                            current_speech.last_time = current_time
                        if removed_number and tidied_paragraph:
                            tidied_paragraph = removed_number + tidied_paragraph
                            removed_number = None
                        current_speech.paragraphs.append(tidied_paragraph)
                    if suspended and suspension_time:
                        current_time = suspension_time
                else:
                    raise Exception, "Totally unparsed element:\n%s\n... unhandled in page ID: %d" % (
                        speech_part, page_id)
        else:
            raise Exception, "There was an unhandled element '%s' in page with ID: %d" % (
                top_level.name, page_id)
    return parsed_page
def is_division_way(element, report_date=None):
    """If it's a division heading, return a normalized version, otherwise None

    Returns a (way, candidate, candidate_id) triple; way is the
    normalized heading (or None when the text is not a division
    heading at all).

    >>> is_division_way(' For ')
    ('FOR', None, None)
    >>> is_division_way('nonsense')
    (None, None, None)
    >>> is_division_way('abstentions ')
    ('ABSTENTIONS', None, None)
    >>> is_division_way(":\xA0FOR")
    ('FOR', None, None)
    >>> is_division_way('Abstention')
    ('ABSTENTIONS', None, None)
    >>> is_division_way('Absentions')
    ('ABSTENTIONS', None, None)
    >>> example_date = datetime.date(1999, 5, 13)
    >>> is_division_way('VOTES FOR DONALD DEWAR', example_date)
    ('FOR', 'Donald Dewar', u'uk.org.publicwhip/member/80147')
    >>> is_division_way('now cast your votes for someone', example_date)
    (None, None, None)
    >>> example_date = datetime.date(2000, 3, 14)
    >>> is_division_way('For Mr Kenneth Macintosh', example_date)
    ('FOR', 'Mr Kenneth Macintosh', u'uk.org.publicwhip/member/80191')
    >>> is_division_way('For option 1', example_date)
    ('FOR', 'Option 1', None)
    >>> is_division_way('The following member took the oath:')
    ('FOR', 'oath', None)
    >>> is_division_way('The following member made a solemn affirmation:')
    ('FOR', 'affirmation', None)
    >>> is_division_way('The following member made a solemn affirmation and repeated it in French:')
    ('FOR', 'affirmation', None)
    """
    text = tidy_string(non_tag_data_in(element)).upper()
    # Strip any non-word letters at the start and end:
    text = re.sub(r'^\W*(.*?)\W*$', '\\1', text)
    if text in DIVISION_HEADINGS:
        return (text, None, None)
    if text in ('ABSTENTION', 'ABSENTIONS'):
        return ('ABSTENTIONS', None, None)
    if re.search(
            '^THE FOLLOWING MEMBERS? TOOK THE OATH( AND REPEATED IT IN .*)?:?$',
            text):
        return ('FOR', 'oath', None)
    if re.search(
            '^THE FOLLOWING MEMBERS? MADE A SOLEMN AFFIRMATION( AND REPEATED IT IN .*)?:?$',
            text):
        return ('FOR', 'affirmation', None)
    if len(text.split()) < 128:
        # The second regular expression could be *very* slow on
        # strings that begin 'FOR', so only try it on short strings
        # that might be introducing a division, and assume that there
        # are 2 to 4 words in the name:
        name_match = (re.search(r'^(?i)VOTES? FOR ([A-Z ]+)$', text)
                      or re.search(r'^FOR ((?:[A-Z]+\s*){2,4})$', text))
        if name_match:
            person_name = name_match.group(1).title()
            person_id = None
            if report_date:
                person_id = get_unique_person_id(person_name, report_date)
            return ('FOR', person_name, person_id)
        option_match = re.search(r'FOR OPTION (\d+)$', text)
        if option_match:
            return ('FOR', 'Option ' + option_match.group(1), None)
    return (None, None, None)
def is_member_vote(element, vote_date, expecting_a_vote=True):
    """Returns a speaker ID if this looks like a member's vote in a division

    Otherwise returns None. If it looks like a vote, but the speaker
    can't be identified, this throws an exception. As an example:

    >>> is_member_vote('Something random...', '2012-11-12')
    >>> is_member_vote('Baillie, Jackie (Dumbarton) (Lab)', '2012-11-12')
    u'uk.org.publicwhip/member/80476'
    >>> is_member_vote('Alexander, Ms Wendy (Paisley North) (Lab)', '2010-05-12')
    u'uk.org.publicwhip/member/80281'
    >>> is_member_vote('Purvis, Jeremy (Tweeddale, Ettrick and Lauderdale)', '2005-05-18')
    u'uk.org.publicwhip/member/80101'

    Now some examples that should be ignored:

    >>> is_member_vote(': SP 440 (EC Ref No 11766/99, COM(99) 473 final)', '1999-11-23')
    >>> is_member_vote('SP 666 (EC Ref No 566 99/0225, COM(99) (CNS))', '2000-02-08')
    >>> is_member_vote('to promote a private bill, the company relied on its general power under section 10(1)(xxxii)', '2006-05-22')

    And one that should throw an exception:

    >>> is_member_vote('Lebowski, Jeffrey (Los Angeles) (The Dude)', '2012-11-12')
    Traceback (most recent call last):
        ...
    Exception: A voting member 'Jeffrey Lebowski (Los Angeles)' couldn't be resolved

    If expecting_a_vote is False, then don't throw an exception if the
    name can't be resolved:

    >>> is_member_vote('Lebowski, Jeffrey (Los Angeles) (The Dude)', '2012-11-12', expecting_a_vote=False)

    Also try resolving names that aren't comma-reversed:

    >>> is_member_vote('Brian Adam (North-East Scotland) (SNP)', '1999-11-09')
    u'uk.org.publicwhip/member/80129'
    """
    tidied = tidy_string(non_tag_data_in(element))
    # Each vote regex pairs with a processor that reassembles a
    # canonical "First Last (Constituency)" name from its named groups
    # (or None if the regex did not match):
    from_first_and_last = lambda m: m and "%s %s (%s)" % (m.group(
        'first_names'), m.group('last_name'), m.group('constituency'))
    from_full = lambda m: m and m.group('full_name')
    vote_matches = ((member_vote_re, from_first_and_last),
                    (member_vote_just_constituency_re, from_first_and_last),
                    (member_vote_fullname_re, from_full))
    # first() picks the first truthy processed result, i.e. the first
    # pattern that matched.
    reformed_name = first(
        processor(regexp.search(tidied))
        for regexp, processor in vote_matches)
    if not reformed_name:
        return None
    person_id = get_unique_person_id(reformed_name, str(vote_date))
    if person_id is None and expecting_a_vote:
        print "reformed_name is:", reformed_name
        print "vote_date is:", vote_date
        raise Exception, "A voting member '%s' couldn't be resolved" % (
            reformed_name, )
    else:
        return person_id
def parse(self, filename): m = re.match('(?ims)^.*(day-(wa-\d\d)_([a-z0-9]+\.htm))', filename) if not m: raise Exception, "Couldn't parse filename: " + filename self.original_url = "http://www.scottish.parliament.uk/business/pqa/%s/%s" % ( m.group(2), m.group(3)) filename_leaf = m.group(1) # We need to know what date this is, so deal with that first # of all in a brutish fashion, but cache the results: self.date = None if file_to_date.has_key(filename_leaf): if verbose: print "Found file to date mapping in cache." self.date = datetime.date( *strptime(file_to_date[filename_leaf], "%Y-%m-%d")[0:3]) else: self.make_soup(filename) page_as_text = tidy_string(non_tag_data_in(self.soup.find('body'))) m = re.search( '(?ims) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) (\d+)\w* (\w+) (\d+)?', page_as_text) if m: day_of_week = m.group(1) day = m.group(2) month = month_name_to_int(m.group(3)) year = m.group(4) # Sometimes the date string doesn't have the year: if not year: m = re.search('day-wa-(\d\d)', filename) if m.group(1) == '99': year = '1999' else: year = '20' + m.group(1) self.date = datetime.date(int(year, 10), month, int(day, 10)) if not options.quiet: "Adding file to date mapping to cache." 
add_file_to_date_mapping(filename_leaf, str(self.date)) else: raise Exception, "No date found in file: " + filename temp_output_filename = xml_output_directory + "tmp.xml" output_filename = xml_output_directory + "spwa" + str( self.date) + ".xml" if os.path.exists(output_filename): #error = "The output file "+output_filename+" already exists - skipping "+re.sub('^.*/','',filename) # raise Exception, error #if not options.quiet: print error return if not options.quiet: print "Parsing %s" % filename self.make_soup(filename) self.ofp = open(temp_output_filename, "w") self.ofp.write('''<?xml version="1.0" encoding="utf-8"?> <!DOCTYPE publicwhip [ <!ENTITY pound "£"> <!ENTITY euro "€"> <!ENTITY agrave "à"> <!ENTITY aacute "á"> <!ENTITY egrave "è"> <!ENTITY eacute "é"> <!ENTITY ecirc "ê"> <!ENTITY iacute "í"> <!ENTITY ograve "ò"> <!ENTITY oacute "ó"> <!ENTITY uacute "ú"> <!ENTITY Aacute "Á"> <!ENTITY Eacute "É"> <!ENTITY Iacute "Í"> <!ENTITY Oacute "Ó"> <!ENTITY Uacute "Ú"> <!ENTITY Uuml "Ü"> <!ENTITY auml "ä"> <!ENTITY euml "ë"> <!ENTITY iuml "ï"> <!ENTITY ouml "ö"> <!ENTITY uuml "ü"> <!ENTITY fnof "ƒ"> <!ENTITY aelig "æ"> <!ENTITY dagger "†"> <!ENTITY reg "®"> <!ENTITY nbsp " "> <!ENTITY shy "­"> <!ENTITY deg "°"> <!ENTITY middot "·"> <!ENTITY ordm "º"> <!ENTITY ndash "–"> <!ENTITY mdash "—"> <!ENTITY lsquo "‘"> <!ENTITY rsquo "’"> <!ENTITY ldquo "“"> <!ENTITY rdquo "”"> <!ENTITY hellip "…"> <!ENTITY bull "•"> <!ENTITY acirc "â"> <!ENTITY Agrave "À"> <!ENTITY Aring "Å"> <!ENTITY aring "å"> <!ENTITY atilde "ã"> <!ENTITY Ccedil "Ç"> <!ENTITY ccedil "ç"> <!ENTITY Egrave "È"> <!ENTITY Icirc "Î"> <!ENTITY icirc "î"> <!ENTITY Igrave "Ì"> <!ENTITY igrave "ì"> <!ENTITY ntilde "ñ"> <!ENTITY ocirc "ô"> <!ENTITY oelig "œ"> <!ENTITY Ograve "Ò"> <!ENTITY Oslash "Ø"> <!ENTITY oslash "ø"> <!ENTITY Scaron "Š"> <!ENTITY scaron "š"> <!ENTITY sup1 "¹"> <!ENTITY sup2 "²"> <!ENTITY sup3 "³"> <!ENTITY ugrave "ù"> <!ENTITY ucirc "û"> <!ENTITY Ugrave "Ù"> <!ENTITY yacute "ý"> <!ENTITY frac12 
"½"> <!ENTITY micro "µ"> <!ENTITY sbquo "‚"> <!ENTITY trade "™"> <!ENTITY Dagger "‡"> <!ENTITY radic "√"> ]> <publicwhip> ''') self.ofp.write("<source url=\"%s\"/>" % self.original_url) tag_with_most_paragraphs = None most_paragraphs_so_far = -1 for t in self.soup.findAll(True): ps = paragraphs_in_tag(t) if ps > most_paragraphs_so_far: tag_with_most_paragraphs = t most_paragraphs_so_far = ps if verbose: print "Using element name: " + tag_with_most_paragraphs.name + " with " + str( most_paragraphs_so_far) + " paragraphs from " + filename if verbose: print tag_with_most_paragraphs.prettify() # When we're parsing we might have multiple questions in a # row. We say that something's a question rather than an # answer if (a) it's followed by an ID or (b) it begins with # "To ask", otherwise it's an answer. If we hit a new # heading, that suggests that the previous thing was an answer # as well. # The business of "Holding answers" is a bit confusing. At # the bottom of each page there may be a list of question IDs # which were given holding answers, but the text of the # question is not in the page - you only find it when the # question is eventually answered. for t in tag_with_most_paragraphs: if t.__class__ == NavigableString: s = str(t) s = re.sub('(?ims)\s+', ' ', s) if re.match('(?ims)^\s*$', s): continue else: self.add_to_paragraph(tidy_string(str(t))) if verbose: print "string: " + str(s) elif t.__class__ == Tag: # Look for any <a name=""> tags in here: a = t.find(lambda p: p.name == 'a' and p.has_key('name')) if a: self.sp_name = a['name'] if t.has_key('align') and t['align'].lower() == 'right': # Right aligned tags just have the question ID. 
if self.find_id_and_possible_holding_date(t): self.complete_question() else: if verbose: print "Couldn't parse top-level right aligned tag: " + str( t) elif t.has_key('class') and t['class'] == 'largeHeading': self.add_large_heading(tidy_string(non_tag_data_in(t))) elif self.something_centered(t) or self.c1_heading(t): # Centred tags are headings for questions... s = tidy_string(non_tag_data_in(t)) if len(s) > 0: self.complete_answer() if verbose: print "center: " + s self.add_heading(s) elif t.name == 'table': # This is probably a table that's inserted just to # right align the question ID. The left cell may # contain something to indicate that it's a # holding answer. if self.find_id_and_possible_holding_date(t): # Then also look for the "Holding answer # issued" details... s = non_tag_data_in(t) self.find_holding_answer_issued(s) self.complete_question() else: # Then maybe it's a table as part of the # answer, so add it as a paragraph. self.add_paragraph(str(t)) elif t.name == 'p': if re.search( "(The following questions were given holding answers|Questions given holding answers)", tidy_string(non_tag_data_in(t))): if verbose: print "Found the trailing holding question list!" # This indicates the end of the day's report # for us (just ignore the following list of # answers - it's not very interesting until we # parse some later day and we can tell what # the question was...) break if verbose: print "Didn't find the trailing holding question list in: " + non_tag_data_in( t) non_empty_contents = filter( lambda x: x.__class__ != NavigableString or not re. 
match('^\s*$', x), t.contents) if len(non_empty_contents) == 0: continue initial_strong_text = '' while len(non_empty_contents) > 0 and non_empty_contents[ 0].__class__ == Tag and ( non_empty_contents[0].name == 'strong' or non_empty_contents[0].name == 'b'): initial_strong_text += " " + non_tag_data_in( non_empty_contents[0]) non_empty_contents = non_empty_contents[1:] if len(initial_strong_text) > 0: speaker_name = tidy_string(initial_strong_text) # In some files this will be the ID (possibly # plus holding indication), not right aligned # as usual :( if self.find_id_and_possible_holding_date( speaker_name): self.complete_question() else: speaker_name = re.sub('(?ims)\s*:\s*$', '', speaker_name) person_id = self.valid_speaker(speaker_name) if speaker_name and person_id: self.complete_answer() self.set_speaker(speaker_name, person_id) for e in non_empty_contents: s = tidy_string(str(e)) self.add_to_paragraph(s) else: self.add_paragraph_removing_enclosure(t) else: self.add_paragraph_removing_enclosure(t) elif t.name == 'div' or t.name == 'blockquote' or t.name == 'ol' or t.name == 'ul' or t.name == 'center': # Just add them in a paragraph anyway, even though # that wouldn't be valid HTML 4 strict in the case # of the last three (IIRC) self.add_paragraph(str(t)) else: # Well, if it's empty of text we don't care... s = non_tag_data_in(t) if not re.match('(?ims)^\s*$', s): raise Exception, "Unknown tag found of name '" + t.name + "' with text: " + t.prettify( ) self.complete_answer() # Now output all the XML, working out IDs for each element. # IDs are of the form: # # uk.org.publicwhip/spwa/YYYY-MM-DD.X.T # # .... where: # - YYYY-MM-DD is an ISO 8601 date # # - X is a integer starting at 0 on each day, which # should be incremented for each new heading and # be the same for a group of questions and their # answer. # # - T is "mh" or "h" for major and minor headings, # "q0", "q1", "q2", etc. for each group of # questions and "r0", "r1", etc. 
for the answers x = -1 last_heading = None current_sp_id = None index = 0 for i in range(0, len(self.all_stuff)): if i > 0: previous = self.all_stuff[i - 1] else: previous = None if i < (len(self.all_stuff) - 1): next = self.all_stuff[i + 1] else: next = None a = self.all_stuff[i] self.ofp.write('\n\n') if a.__class__ == Heading: last_was_answer = True if a.major: subtype = "mh" else: subtype = "h" if next and next.__class__ == QuestionOrReply and next.sp_id: # Then use the question's sp_id: self.ofp.write(a.to_xml(self.get_id(next.sp_id, subtype))) else: x += 1 self.ofp.write(a.to_xml(self.get_id(str(x), subtype))) last_heading = a elif a.__class__ == QuestionOrReply: # Occasionally we think questions are actually # answers, so check the beginning of the first # paragraph: if not a.is_question and len(a.paragraphs) > 0 and re.search( '^(?ims)\s*To\s+ask', a.paragraphs[0]): a.is_question = True # If we're suddenly in an answer, reset index. if (not a.is_question) and previous and not ( previous.__class__ == QuestionOrReply and not previous.is_question): index = 0 # If we're suddenly in a question, reset index and increment x unless the previous is a heading elif a.is_question: if previous: if previous.__class__ == QuestionOrReply: if previous.is_question: # If the one before is a question, that's fine. current_sp_id = a.sp_id else: current_sp_id = a.sp_id # If the previous one was an answer # then we need to replay the last # heading: if not last_heading: raise Exception, "Somehow there's been no heading so far." last_heading.sp_name = a.sp_name if current_sp_id: self.ofp.write( last_heading.to_xml( self.get_id(current_sp_id, "h"))) else: x += 1 self.ofp.write( last_heading.to_xml( self.get_id(str(x), "h"))) self.ofp.write("\n\n") index = 0 else: # i.e. 
this is the normal case, a question after a heading: current_sp_id = a.sp_id index = 0 else: raise Exception, "Nothing before the first question (no heading)" if a.is_question: subtype = "q" + str(index) else: subtype = "r" + str(index) if current_sp_id: self.ofp.write( a.to_xml(self.get_id(current_sp_id, subtype))) else: self.ofp.write(a.to_xml(self.get_id(str(x), subtype))) index += 1 self.ofp.write("</publicwhip>") self.ofp.close() retcode = call(["mv", temp_output_filename, output_filename]) if retcode != 0: raise Exception, "Moving " + temp_output_filename + " to " + output_filename + " failed." xmlvalidate.parse(output_filename) #retcode = call( [ "xmlstarlet", "val", output_filename ] ) #if retcode != 0: # raise Exception, "Validating "+output_filename+" for well-formedness failed." fil = open('%schangedates.txt' % xml_output_directory, 'a+') fil.write('%d,spwa%s.xml\n' % (time.time(), self.date)) fil.close()
def add_paragraph_removing_enclosure(self, t):
    """Add the text of t as a paragraph, stripping one enclosing <p> tag.

    t may be a Tag or a string; it is serialized with str() first, then a
    single wrapping <p ...> ... </p> pair (if present) is removed before
    tidying and storing the paragraph."""
    markup = str(t)
    # Keep only the contents of a wrapping <p>...</p>, dropping the tag pair
    # and any surrounding whitespace.
    markup = re.sub('(?ims)^\s*<p[^>]*>\s*(.*)</p[^>]*>\s*$', r'\1', markup)
    self.add_paragraph(tidy_string(markup))
def parse_html(session, report_date, soup, page_id, original_url):
    """Parse one official-report page (BeautifulSoup tree) into a ParsedPage.

    session      -- session identifier, passed straight to ParsedPage
    report_date  -- date of the report, used for speeches/divisions and
                    member-name resolution
    soup         -- BeautifulSoup of the whole page; must contain
                    <div id="ReportView"> with exactly one child <div>
    page_id      -- numeric page ID, used in error messages
    original_url -- URL of the page; per-speech anchors are appended to it

    Returns a ParsedPage whose sections contain Speech and Division objects.
    Raises Exception on any page structure this parser does not recognise.
    """
    divnumber = 0
    report_view = soup.find('div', attrs={'id': 'ReportView'})
    div_children_of_report_view = report_view.findChildren('div', recursive=False)
    if len(div_children_of_report_view) != 1:
        raise Exception, 'We only expect one <div> child of <div id="ReportView">; there were %d in page with ID %d' % (len(div_children_of_report_view), page_id)
    # Reset the per-page speaker cache before parsing a new page.
    Speech.reset_speakers_so_far()
    main_div = div_children_of_report_view[0]
    top_level_divs = main_div.findChildren('div', recursive=False)
    # The first div should just contain links to sections further down
    # the page:
    contents_div, text_div = top_level_divs
    # Just check that my assumption that the first div only contains
    # links is correct:
    # NOTE(review): contents_tuples is built here but not used later in this
    # function -- presumably kept for the validation side-effect; confirm.
    contents_tuples = []
    contents_links = contents_div.findAll(True)
    for link in contents_links:
        if link.name == 'br':
            continue
        if link.name != 'a':
            raise Exception, "There was something other than a <br> or an <a> in the supposed contents <div>, for page ID: %d" % (page_id,)
        href = link['href']
        m = re.search(r'#(.*)', href)
        if not m:
            raise Exception, "Failed to find the ID from '%s' in page with ID: %d" % (href, page_id)
        contents_tuples.append((m.group(1), tidy_string(non_tag_data_in(link))))
    parsed_page = ParsedPage(session, report_date, page_id)
    # Now consider the div that actually has text in it.  Each speech
    # is in its own div, while the rest that we care about are
    # headings:
    current_votes = None          # Division currently being collected, if any
    current_division_way = None   # 'FOR' / 'AGAINST' / ... for member votes
    current_time = None           # most recent time seen on the page
    current_url = original_url    # original_url plus the latest <a id=...> anchor
    for top_level in text_div:
        # There are sometimes some empty NavigableString elements at
        # the top level, so just ignore those:
        if not len(unicode(top_level).strip()):
            continue
        if top_level.name == 'h2':
            # An <h2> starts a new section.
            section_title = tidy_string(non_tag_data_in(top_level, tag_replacement=u' '))
            if not section_title:
                raise Exception, "There was an empty section title in page ID: %d" % (page_id)
            parsed_page.sections.append(
                Section(section_title, current_url))
        elif top_level.name in ('br',):
            # Ignore line breaks - we use paragraphs instead
            continue
        elif top_level.name == 'a':
            # Anchors update the URL fragment used for subsequent speeches.
            try:
                current_url = original_url + "#" + top_level['id']
            except KeyError:
                pass
        elif top_level.name == 'div':
            # This div contains a speech, essentially:
            # the new style pages wraps speeches in p.span tags that we can ignore so
            # remove them from the tree. Occasionally there are multiple spans in a p
            # hence the for loop
            # This does mean we are losing some formatting information but because it's
            # hardcoded style attributes in the spans it's arguable that we'd want to
            # remove them anyway.
            for p in top_level.findChildren('p'):
                if p.span:
                    for span in p.findChildren('span'):
                        span.unwrap()
                    p.unwrap()
            # Question number stripped from the speaker's <b> tag, to be
            # re-attached to the first paragraph of the speech.
            removed_number = None
            for speech_part in top_level:
                if hasattr(speech_part, 'name') and speech_part.name != None:
                    if speech_part.name == 'b':
                        # A <b> tag introduces a new speaker.
                        speaker_name = non_tag_data_in(speech_part)
                        # If there's a leading question number remove that (and any whitespace)
                        match = re.match(r'^\d+\.\s*', speaker_name)
                        if match:
                            speaker_name = re.sub(r'^\d+\.\s*', '', speaker_name)
                            removed_number = match.group(0)
                        # If there's a training colon, remove that (and any whitespace)
                        speaker_name = re.sub(r'[\s:]*$', '', speaker_name)
                        current_speech = Speech(tidy_string(speaker_name), report_date, current_time, current_url)
                        parsed_page.sections[-1].speeches_and_votes.append(current_speech)
                    elif speech_part.name == 'br':
                        # Ignore the line breaks...
                        pass
                    elif speech_part.name == 'ul':
                        # Keep list markup verbatim as a paragraph.
                        current_speech.paragraphs.append(speech_part.html)
                    elif speech_part.name == 'a' and speech_part.text == '':
                        # skip empty a anchors
                        pass
                    else:
                        raise Exception, "Unexpected tag '%s' in page ID: %d" % (speech_part.name, page_id)
                elif isinstance(speech_part, NavigableString):
                    tidied_paragraph = tidy_string(speech_part)
                    if tidied_paragraph == "":
                        # just ignore blank lines
                        continue
                    # print "tidied_paragraph is", tidied_paragraph.encode('utf-8'), "of type", type(tidied_paragraph)
                    # Classify the text: division heading, member's vote,
                    # bare timestamp, or ordinary speech paragraph.
                    division_way, division_candidate, division_candidate_id = is_division_way(tidied_paragraph, report_date)
                    member_vote = is_member_vote(tidied_paragraph, report_date, expecting_a_vote=current_votes)
                    maybe_time = just_time(tidied_paragraph)
                    closed_time = meeting_closed(tidied_paragraph)
                    if closed_time:
                        current_time = closed_time
                    suspended_time_tuple = meeting_suspended(tidied_paragraph)
                    if suspended_time_tuple:
                        suspended, suspension_time_type, suspension_time = suspended_time_tuple
                    else:
                        suspended = False
                        suspension_time_type = suspension_time = None
                    if division_way:
                        # If this is a vote for a particular
                        # candidate, or the introduction to an
                        # oath-taking, add the text as a speech too:
                        if division_candidate:
                            current_speech = Speech(None, report_date, current_time, current_url)
                            parsed_page.sections[-1].speeches_and_votes.append(current_speech)
                            current_speech.paragraphs.append(tidied_paragraph)
                        # Start a new Division unless we are already collecting
                        # votes for this same candidate:
                        if (not current_votes) or (current_votes.candidate != division_candidate):
                            current_votes = Division(report_date, current_url, divnumber, division_candidate, division_candidate_id)
                            divnumber += 1
                            parsed_page.sections[-1].speeches_and_votes.append(current_votes)
                        current_division_way = division_way
                    elif member_vote:
                        if current_votes is None:
                            raise Exception, "Got a member's vote before an indication of which way the vote is"
                        current_votes.add_vote(current_division_way, tidied_paragraph, member_vote)
                    elif maybe_time:
                        current_time = maybe_time
                    else:
                        # Ordinary paragraph text ends any division in progress.
                        if current_votes:
                            current_votes = None
                        # If this speech doesn't have any paragraphs
                        # yet, make sure that it has the current time,
                        # so that (for example) if we get a "Meeting
                        # closed at 17:44." at the end, that speech
                        # ends up with that time.
                        if len(current_speech.paragraphs) == 0:
                            current_speech.last_time = current_time
                        # Re-attach the question number removed from the
                        # speaker name to the first paragraph of the speech.
                        if removed_number and tidied_paragraph:
                            tidied_paragraph = removed_number + tidied_paragraph
                            removed_number = None
                        current_speech.paragraphs.append(tidied_paragraph)
                    # Only advance the clock after the suspension text has
                    # been recorded with the pre-suspension time.
                    if suspended and suspension_time:
                        current_time = suspension_time
                else:
                    raise Exception, "Totally unparsed element:\n%s\n... unhandled in page ID: %d" % (speech_part, page_id)
        else:
            raise Exception, "There was an unhandled element '%s' in page with ID: %d" % (top_level.name, page_id)
    return parsed_page
def is_division_way(element, report_date=None):
    """If it's a division heading, return a normalized version, otherwise None

    Returns a (way, candidate, candidate_id) tuple; all three elements are
    None when the text is not a division heading.

    >>> is_division_way(' For ')
    ('FOR', None, None)
    >>> is_division_way('nonsense')
    (None, None, None)
    >>> is_division_way('abstentions ')
    ('ABSTENTIONS', None, None)
    >>> is_division_way(":\xA0FOR")
    ('FOR', None, None)
    >>> is_division_way('Abstention')
    ('ABSTENTIONS', None, None)
    >>> is_division_way('Absentions')
    ('ABSTENTIONS', None, None)
    >>> example_date = datetime.date(1999, 5, 13)
    >>> is_division_way('VOTES FOR DONALD DEWAR', example_date)
    ('FOR', 'Donald Dewar', u'uk.org.publicwhip/member/80147')
    >>> is_division_way('now cast your votes for someone', example_date)
    (None, None, None)
    >>> example_date = datetime.date(2000, 3, 14)
    >>> is_division_way('For Mr Kenneth Macintosh', example_date)
    ('FOR', 'Mr Kenneth Macintosh', u'uk.org.publicwhip/member/80191')
    >>> is_division_way('For option 1', example_date)
    ('FOR', 'Option 1', None)
    >>> is_division_way('The following member took the oath:')
    ('FOR', 'oath', None)
    >>> is_division_way('The following member made a solemn affirmation:')
    ('FOR', 'affirmation', None)
    >>> is_division_way('The following member made a solemn affirmation and repeated it in French:')
    ('FOR', 'affirmation', None)
    """
    heading = tidy_string(non_tag_data_in(element)).upper()
    # Trim any non-word characters from both ends before matching:
    heading = re.sub(r'^\W*(.*?)\W*$', '\\1', heading)

    # Exact matches for the standard headings, plus common misspellings
    # of "abstentions":
    if heading in DIVISION_HEADINGS:
        return (heading, None, None)
    if heading in ('ABSTENTION', 'ABSENTIONS'):
        return ('ABSTENTIONS', None, None)

    # Oath-taking and affirmations are treated as a 'FOR' heading:
    if re.search('^THE FOLLOWING MEMBERS? TOOK THE OATH( AND REPEATED IT IN .*)?:?$', heading):
        return ('FOR', 'oath', None)
    if re.search('^THE FOLLOWING MEMBERS? MADE A SOLEMN AFFIRMATION( AND REPEATED IT IN .*)?:?$', heading):
        return ('FOR', 'affirmation', None)

    if len(heading.split()) < 128:
        # The second regular expression could be *very* slow on
        # strings that begin 'FOR', so only try it on short strings
        # that might be introducing a division, and assume that there
        # are 2 to 4 words in the name:
        name_match = (re.search(r'^(?i)VOTES? FOR ([A-Z ]+)$', heading) or
                      re.search(r'^FOR ((?:[A-Z]+\s*){2,4})$', heading))
        if name_match:
            person_name = name_match.group(1).title()
            person_id = None
            if report_date:
                person_id = get_unique_person_id(person_name, report_date)
            return ('FOR', person_name, person_id)
        option_match = re.search(r'FOR OPTION (\d+)$', heading)
        if option_match:
            return ('FOR', 'Option ' + option_match.group(1), None)

    return (None, None, None)
def is_member_vote(element, vote_date, expecting_a_vote=True): """Returns a speaker ID if this looks like a member's vote in a division Otherwise returns None. If it looks like a vote, but the speaker can't be identified, this throws an exception. As an example: >>> is_member_vote('Something random...', '2012-11-12') >>> is_member_vote('Baillie, Jackie (Dumbarton) (Lab)', '2012-11-12') u'uk.org.publicwhip/member/80476' >>> is_member_vote('Alexander, Ms Wendy (Paisley North) (Lab)', '2010-05-12') u'uk.org.publicwhip/member/80281' >>> is_member_vote('Purvis, Jeremy (Tweeddale, Ettrick and Lauderdale)', '2005-05-18') u'uk.org.publicwhip/member/80101' Now some examples that should be ignored: >>> is_member_vote(': SP 440 (EC Ref No 11766/99, COM(99) 473 final)', '1999-11-23') >>> is_member_vote('SP 666 (EC Ref No 566 99/0225, COM(99) (CNS))', '2000-02-08') >>> is_member_vote('to promote a private bill, the company relied on its general power under section 10(1)(xxxii)', '2006-05-22') And one that should throw an exception: >>> is_member_vote('Lebowski, Jeffrey (Los Angeles) (The Dude)', '2012-11-12') Traceback (most recent call last): ... 
Exception: A voting member 'Jeffrey Lebowski (Los Angeles)' couldn't be resolved If expecting_a_vote is False, then don't throw an exception if the name can't be resolved: >>> is_member_vote('Lebowski, Jeffrey (Los Angeles) (The Dude)', '2012-11-12', expecting_a_vote=False) Also try resolving names that aren't comma-reversed: >>> is_member_vote('Brian Adam (North-East Scotland) (SNP)', '1999-11-09') u'uk.org.publicwhip/member/80129' """ tidied = tidy_string(non_tag_data_in(element)) from_first_and_last = lambda m: m and "%s %s (%s)" % (m.group('first_names'), m.group('last_name'), m.group('constituency')) from_full = lambda m: m and m.group('full_name') vote_matches = ( (member_vote_re, from_first_and_last), (member_vote_just_constituency_re, from_first_and_last), (member_vote_fullname_re, from_full)) reformed_name = first(processor(regexp.search(tidied)) for regexp, processor in vote_matches) if not reformed_name: return None person_id = get_unique_person_id(reformed_name, str(vote_date)) if person_id is None and expecting_a_vote: print "reformed_name is:", reformed_name print "vote_date is:", vote_date raise Exception, "A voting member '%s' couldn't be resolved" % (reformed_name,) else: return person_id