def StripWransHeadings(headspeak, sdate):
    """Step over the boilerplate headings that open a written-answers file.

    Each ``headspeak`` entry is read by index: ``[0]`` is the heading text,
    ``[1]`` is further text scanned for stamps, and ``[2]`` is the sequence
    of speeches under the heading.  A heading is only stepped over when its
    ``[2]`` is empty.

    Args:
        headspeak: indexable sequence of such entries; the first one must be
            the 'Initial' entry.
        sdate: date of the file; compared with the date parsed from the date
            heading and handed to StampUrl.

    Returns:
        ``(i, stampurl)``: ``i`` is the index of the first entry that was not
        stepped over, ``stampurl`` a StampUrl updated from the entries before
        it.

    Raises:
        ContextException: if the first entry is not an empty 'Initial', if
            the second heading is neither a date/"answers received" line nor
            a key of ``parlPhrases.wransmajorheadings``, or if no stamp,
            page url or aname could be collected.
    """
    # check and strip the first two headings in as much as they are there
    i = 0
    if headspeak[i][0] != 'Initial' or headspeak[i][2]:
        print(headspeak[0])
        raise ContextException('non-conforming Initial heading ')
    i += 1

    # "Written Answers [to Questions]" title: optional, skipped only when it
    # carries no speeches.  The inline flag sits at the front of the pattern
    # because Python 3.11+ rejects it anywhere else.
    if re.match('(?i)(?:<stamp aname="[^"]*"/>)*written answers?(?: to questions?)?',
                headspeak[i][0]) and not headspeak[i][2]:
        i += 1

    # Date line.  Non-breaking spaces (entity or raw character) become plain
    # spaces so the date parser sees ordinary text.
    # NOTE(review): the source replaced " " with " ", a no-op; the entity
    # form looks to have been lost -- confirm against upstream.
    heading = headspeak[i][0]
    givendate = re.sub(r'&nbsp;|\xa0', ' ', heading)
    givendate = re.sub("</?i>", "", givendate)
    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate)
    if gd:
        givendate = gd.group(1)

    received = re.match(
        r'(?i)(?:<stamp[^>]*>)*(?:<i>)?\s*(?:The following answers were|Answers) received.*',
        heading)
    answered_on = re.match(
        '(?:<stamp[^>]*>)?The following question was answered on.*', heading)

    # The date is only parsed when neither fixed wording matched, exactly as
    # before, so an unparsable heading of those kinds never reaches mx.
    if (not received and not answered_on
            and sdate != mx.DateTime.DateTimeFrom(givendate).date) or headspeak[i][2]:
        # Not a date line: tolerate it (without stepping over it) only when
        # it is a known major heading with no speeches attached.
        if heading not in parlPhrases.wransmajorheadings or headspeak[i][2]:
            print(headspeak[i])
            raise ContextException('non-conforming second heading',
                                   stamp=None, fragment=heading)
    else:
        i += 1

    # find the url and colnum stamps that occur before anything else
    stampurl = StampUrl(sdate)
    for j in range(0, i):
        stampurl.UpdateStampUrl(headspeak[j][0])
        stampurl.UpdateStampUrl(headspeak[j][1])

    # Later editions seem to miss first column number, sigh
    if not stampurl.stamp:
        for speeches in headspeak:
            text = ''.join([speech[1] for speech in speeches[2]])
            m = re.search(r'colnum="(\d+)W"', text)
            if m:
                # synthesise a stamp for the column before the first one seen
                stampurl.UpdateStampUrl('<stamp coldate="%s" colnum="%dW"/>'
                                        % (sdate, int(m.group(1)) - 1))
                break

    if not stampurl.stamp or not stampurl.pageurl or not stampurl.aname:
        raise ContextException('missing stamp url at beginning of file')
    return (i, stampurl)
def StripWestminhallHeadings(headspeak, sdate):
    """Step over the headings that open a Westminster Hall file.

    ``headspeak`` entries are read by index: ``[0]`` heading text, ``[1]``
    text scanned for stamps, ``[2]`` the speeches under the heading.

    Args:
        headspeak: indexable sequence of such entries.
        sdate: date of the file; must equal the date parsed from the date
            heading.

    Returns:
        ``(ih, stampurl)``: index of the first entry not stepped over and a
        StampUrl (timestamp preset to "unknown") updated from the ``[1]``
        text of the entries before it.

    Raises:
        Exception: if the date heading has speeches or a different date, or
            if no stamp or page url was found.  StripDebateHeading may raise
            as well (its body is not visible here).
    """
    # check and strip the first two headings in as much as they are there
    ih = 0
    ih = StripDebateHeading('Initial', ih, headspeak)

    # Westminster Hall
    # (pattern left exactly as it was: StripDebateHeading builds the final
    # regex itself, so the flag cannot be repositioned from here)
    ih = StripDebateHeading('westminster hall(?i)', ih, headspeak)

    # date line
    givendate = re.sub('</?i>', ' ', headspeak[ih][0])
    # inline flag moved to the front: Python 3.11+ rejects it elsewhere
    gd = re.match('(?i)(?:<stamp aname="[^"]*"/>)*(.*)$', givendate)
    if gd:
        givendate = gd.group(1)
    if sdate != mx.DateTime.DateTimeFrom(givendate).date or headspeak[ih][2]:
        raise Exception('date heading %s mismatches with date %s'
                        % (repr(headspeak[ih]), sdate))
    ih = ih + 1

    # next line is:
    # <H3><center>[Mr. John McWilliam in the Chair]</center></H3>
    # but we leave it as a title.

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)
    stampurl.timestamp = '<stamp time="%s"/>' % "unknown"
    for j in range(0, ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
def FilterWMSSpeakers(fout, text, sdate):
    """Write ``text`` to ``fout`` with recognised speakers turned into tags.

    The text is split with ``recomb``; every piece updates a StampUrl used
    for error context.  Pieces matched by ``respeakervals`` are replaced by
    a ``<speaker ...>`` element built from ``memberList.matchwmsname``;
    all other pieces are written through unchanged.

    Raises:
        ContextException: when name matching fails, or when an unmatched
            piece still looks like a speaker to ``recomb`` / ``remarginal``.
    """
    stampurl = StampUrl(sdate)

    for piece in recomb.split(text):
        stampurl.UpdateStampUrl(piece)

        found = respeakervals.match(piece)
        if not found:
            # nothing detected: check if we've missed anything obvious
            if recomb.match(piece):
                raise ContextException('regexpvals not general enough',
                                       fragment=piece, stamp=stampurl)
            marginal = remarginal.search(piece)
            if marginal:
                raise ContextException(
                    ' marginal speaker detection case: %s' % marginal.group(0),
                    fragment=piece, stamp=stampurl)
            fout.write(piece)
            continue

        # speaker detected
        anchor = found.group(1) or found.group(2) or ""
        speaker = found.group(3).strip()
        bracketed = found.group(4)
        if not speaker:
            # empty speaker name: the piece is dropped from the output
            continue

        try:
            matched = memberList.matchwmsname(speaker, bracketed, sdate)
        except Exception as e:
            # re-raise with the stamp and fragment attached
            raise ContextException(str(e), stamp=stampurl, fragment=piece)

        # put record in this place
        fout.write('%s<speaker %s>%s</speaker>\n'
                   % (anchor, matched.encode("latin-1"), speaker))
def StripLordsDebateHeadings(headspeak, sdate):
    """Step over the headings that open a House of Lords debate file.

    ``headspeak`` entries are read by index: ``[0]`` heading text, ``[1]``
    text scanned for stamps, ``[2]`` the speeches under the heading.

    Args:
        headspeak: indexable sequence of such entries.
        sdate: date of the file; compared with the date heading.

    Returns:
        ``(ih, stampurl)``: index of the first entry not stepped over and a
        StampUrl updated from the ``[1]`` text of the entries before it.

    Raises:
        Exception: if no stamp or page url was found.  StripDebateHeading
            may raise as well (its body is not visible here).
    """
    # check and strip the first two headings in as much as they are there
    ih = 0
    ih = StripDebateHeading('Initial', ih, headspeak)

    # House of Lords
    # (patterns handed to StripDebateHeading are left exactly as they were:
    # that helper builds the final regex itself)
    ih = StripDebateHeading('house of lords(?i)', ih, headspeak, True)

    # Thursday, 18th December 2003.
    mdateheading = re.match(r'(?:<stamp aname="[^"]*"/>)*([\w\s\d,]*)\.?',
                            headspeak[ih][0])
    if (not mdateheading
            or sdate != mx.DateTime.DateTimeFrom(mdateheading.group(1)).date
            or headspeak[ih][2]):
        # non-conforming date heading: reported but treated as recoverable,
        # the heading is simply not stepped over
        print(headspeak[ih])
    else:
        ih = ih + 1

    # inline flag moved to the front of the pattern (it was mid-pattern,
    # which Python 3.11+ rejects); it still applies to the whole pattern
    if re.match('(?i)(?:<stamp aname="[^"]*"/>)*(?:THE )?QUEEN(?:\'|&....;)S SPEECH',
                headspeak[ih][0]):
        print(headspeak[ih][0])
        print("QUEENS SPEECH")
        # don't advance, because this is the heading (works for 2005-05-17)
    elif re.match("Parliament", headspeak[ih][0]):
        print("parliamentparliament")
        # don't advance; this is a title (works for 2005-05-11)
    else:
        # <H4><center>Reassembling after the Christmas Recess, the House met at half-past two of the clock: The LORD CHANCELLOR on the Woolsack.</center></H4>
        # The House met at eleven of the clock (Prayers having been read earlier at the Judicial Sitting by the Lord Bishop of St Albans): The CHAIRMAN OF COMMITTEES on the Woolsack.
        ih = StripDebateHeading(
            '(?:reassembling.*?recess, )?the house (?:met|resumed)(?: for Judicial Business)? at ([^(]*)(?i)',
            ih, headspeak, True)
        # (we should use the "Half past two" business in house met to set the
        # start time, unfortunately the filtercoltime has already happened)

        # Prayers--Read by the Lord Bishop of Southwell.
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)
    for j in range(0, ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
# Wordings that follow "The House met at", paired with the clock time each
# stands for.  Entries are tried in order, case-insensitively, anchored at
# the start of the wording; the first hit wins.
_HOUSE_MET_TIMES = (
    ("half-past Nine", '09:30:00'),
    ("a quarter to Ten o", '09:45:00'),
    ("Ten o'clock", '10:00:00'),
    ("half-past Ten", '10:30:00'),
    ("Eleven o'clock", '11:00:00'),
    (r"twenty-five minutes past\s*Eleven", '11:25:00'),
    (r"twenty-six minutes past\s*Eleven", '11:26:00'),
    (r"twenty-nine minutes past\s*Eleven", '11:29:00'),
    ("half-past Eleven", '11:30:00'),
    ("Twelve noon", '12:00:00'),
    ("half-past One", '13:30:00'),
    ("half-past Two", '14:30:00'),
    ("twenty minutes to Three", '14:40:00'),
    ("10 minutes past Three", '15:10:00'),
    ("Six o'clock", '18:00:00'),
)


def _HouseMetClock(wording):
    """Return the HH:MM:SS time for a 'the House met at ...' wording.

    Raises ContextException when the wording is not in _HOUSE_MET_TIMES.
    """
    for pattern, clock in _HOUSE_MET_TIMES:
        if re.match(pattern, wording, re.I):
            return clock
    raise ContextException("Start time not known: " + wording)


def StripDebateHeadings(headspeak, sdate):
    """Step over the headings that open a House of Commons debate file.

    ``headspeak`` entries are read by index: ``[0]`` heading text, ``[1]``
    text scanned for stamps, ``[2]`` the speeches under the heading.

    Handled in order: the 'Initial' entry, the optional bound-volume title
    block, "House of Commons", the date line (unless the next heading is
    already "the house met at"), the "The House met at ..." line (not
    expected on 2001-06-13), then optional prayers / standing order /
    "in the Chair" headings (skipped on the listed new-parliament dates).

    Args:
        headspeak: indexable sequence of such entries.
        sdate: date of the file as a 'YYYY-MM-DD' string (it is compared
            with such literals below).

    Returns:
        ``(ih, stampurl)``: index of the first entry not stepped over and a
        StampUrl updated from the ``[1]`` text of the entries before it; its
        ``timestamp`` is set when a "House met at" time was recognised.

    Raises:
        Exception: on a date mismatch or when no stamp/page url was found.
        ContextException: on a non-conforming "the house met at" heading or
            an unknown start-time wording.  StripDebateHeading may raise as
            well (its body is not visible here).
    """
    # check and strip the first two headings in as much as they are there
    ih = 0
    # the 'Initial' is inserted by the splitheadingsspeakers function
    ih = StripDebateHeading('Initial', ih, headspeak)

    # volume type heading
    # (patterns handed to StripDebateHeading keep any trailing "(?i)" as it
    # was: that helper builds the final regex itself)
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        # NOTE(review): the separator was a single blank in the source, most
        # likely a lost "&nbsp;"; all three spellings are accepted -- confirm
        ih = StripDebateHeading(r'PARLIAMENTARY(?: |&nbsp;|\xa0)+DEBATES', ih, headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        ih = StripDebateHeading('IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE', ih, headspeak, True)
        ih = StripDebateHeading('UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND', ih, headspeak, True)
        ih = StripDebateHeading(r'\[WHICH OPENED .*?\]', ih, headspeak, True)
        ih = StripDebateHeading('.*? YEAR OF THE REIGN OF.*?', ih, headspeak, True)
        ih = StripDebateHeading('HER MAJESTY QUEEN ELIZABETH II', ih, headspeak, True)
        ih = StripDebateHeading(r'SI.*? SERIES.*?VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('SI.*? SERIES', ih, headspeak, True)
        ih = StripDebateHeading(r'VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    # House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    # (inline flags in the direct re.match calls below now lead the pattern;
    # Python 3.11+ rejects them anywhere else)
    if not re.match('(?i)the house met at .*', headspeak[ih][0]):
        # NOTE(review): the source substituted ' ' for ' ', a no-op; the
        # non-breaking-space forms are normalised instead -- confirm
        givendate = re.sub(r'&nbsp;|\xa0', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        gd = re.match('(?i)(?:<stamp aname="[^"]*"/>)*(.*)$', givendate)
        if gd:
            givendate = gd.group(1)
        if sdate != mx.DateTime.DateTimeFrom(givendate).date or headspeak[ih][2]:
            raise Exception('date heading %s mismatches with date %s'
                            % (repr(headspeak[ih]), sdate))
        ih = ih + 1

    gstarttime = None
    if sdate != "2001-06-13":
        # The House met at half-past Ten o'clock
        gstarttime = re.match(
            r'(?i)(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house (?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)(?:, and the Speaker-Elect having taken the Chair;)?(?:</i>)?$',
            headspeak[ih][0])
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException(
                'non-conforming "the house met at" heading %s' % repr(headspeak[ih]), "")
        ih = ih + 1

    # Start of a new parliament is special
    if sdate not in ("2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"):
        # PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)
        ih = StripDebateHeading(r'pursuant to the Standing Order\.', ih, headspeak, True)

        # in the chair
        ih = StripDebateHeading(r'\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih, headspeak, True)

    # find the url, colnum and time stamps that occur before anything else in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        wording = re.sub('</?i>', ' ', gstarttime.group(1))
        wording = re.sub(r'\s+', ' ', wording)
        stampurl.timestamp = '<stamp time="%s"/>' % _HouseMetClock(wording)

    for j in range(0, ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
def LordsFilterSpeakers(fout, text, sdate):
    """Write Lords ``text`` to ``fout`` with speakers turned into tags.

    The text is split with ``respeaker``.  Pieces that are not bold
    (``respeakerb``) or that turn out to be titles, quotes or division
    markers are written through unchanged; empty bold phrases are dropped.
    Remaining bold phrases are parsed with ``respeakervals`` into name,
    office and colon, resolved through ``lordsList.GetLordIDfname`` and
    written as ``<speaker ...>`` elements.  An office -> name map is kept
    for the file so later references by office alone can be resolved.

    Args:
        fout: object with a ``write`` method.
        text: the text to filter.
        sdate: date of the file ('YYYY-MM-DD' string; compared with such
            literals below).

    Raises:
        ContextException: if a bold phrase cannot be parsed ("bad format")
            or an office is seen with two different names.
    """
    stampurl = StampUrl(sdate)
    officematches = {}

    # setup for scanning through the file.
    for fss in respeaker.split(text):

        # get rid of non-bold stuff
        bffs = respeakerb.match(fss)
        if not bffs:
            fout.write(fss)
            stampurl.UpdateStampUrl(fss)
            continue
        stampurl.UpdateStampUrl(fss)

        # grab a trailing colon if there is one
        fssb = bffs.group(1)
        if bffs.group(2):
            fssb = fssb + ":"

        # Remove the cruft
        fssb = re.sub('<stamp aname="[^"]*"/>', '', fssb)
        fssb = re.sub('</b><b>', '', fssb)

        # empty bold phrase
        if not re.search(r'\S', fssb):
            continue

        # division/contents/amendment which means this is not a speaker
        if renonspek.search(fssb):
            fout.write(fss)
            continue

        # part of quotes as an inserted title in an amendment
        # NOTE(review): the source listed the plain double quote twice, which
        # looks like a "&quot;" alternative lost to entity decoding; the
        # entity is accepted again here -- confirm against upstream
        if re.match(r'(?:&quot;|"|\[)', fssb):
            fout.write(fss)
            continue

        # another title type (all caps), or a clause number
        if not re.search('[a-z]', fssb):
            fout.write(fss)
            continue

        # start piecing apart the name by office and leadout type
        namec = respeakervals.match(fssb)
        if not namec:
            print('* %s *' % fssb)
            raise ContextException("bad format", stamp=stampurl, fragment=fssb)

        if namec.group('bracket'):
            # "Office (Name)": the bracket holds the person, the rest the office
            name = re.sub(r'\s+', ' ', namec.group('bracket'))
            loffice = re.sub(r'\s+', ' ', namec.group('name'))
        else:
            name = re.sub(r'\s+', ' ', namec.group('name'))
            loffice = None

        colon = namec.group('colon') or ""

        # get rid of some standard ones
        # (inline flag now leads the pattern; Python 3.11+ rejects it elsewhere)
        if re.match('(?i)the lord chancellor|noble lords|a noble lord|a noble baroness|the speaker', name):
            fout.write('<speaker person_id="%s" speakername="%s">%s</speaker>'
                       % ('unknown', name, name))
            continue

        # map through any office information
        if loffice:
            if (not re.match("The (Deputy |Minister of State)", loffice)) and (loffice in officematches):
                # the two listed dates are exempt from the consistency check
                if sdate != '2014-09-26' and sdate != '2012-09-24' and officematches[loffice] != name:
                    raise ContextException(
                        "office inconsistency, loffice: %s name: %s officematches: %s"
                        % (loffice, name, officematches[loffice]),
                        stamp=stampurl, fragment=fssb)
            else:
                officematches[loffice] = name
        elif name in officematches:
            # referred to by office only: swap in the recorded holder
            loffice = name
            name = officematches[loffice]

        if regenericspeak.match(name):
            fout.write('<speaker person_id="%s" speakername="%s">%s</speaker>'
                       % ('unknown', name, name))
            continue

        lsid = lordsList.GetLordIDfname(name, loffice=loffice, sdate=sdate, stampurl=stampurl)

        if not lsid:
            fout.write('<speaker person_id="unknown" error="No match" speakername="%s" colon="%s">%s</speaker>'
                       % (name, colon, name))
        else:
            fout.write('<speaker person_id="%s" speakername="%s" colon="%s">%s</speaker>'
                       % (lsid, name, colon, name))

        if namec.group('maiden'):
            fout.write('<i>%s</i>' % namec.group('maiden'))
def FilterDebateSpeakers(fout, text, sdate, typ):
    """Write debate ``text`` to ``fout`` with speakers turned into tags.

    Steps, in order:
      * for ``typ == "westminhall"``, find the "[... in the Chair]" phrase;
        its name replaces a bare "Deputy Speaker" later on;
      * for ``typ == "debate"``, apply ``ApplyFixSubstitutions``;
      * wrap un-bolded "Name: " paragraph openings in ``<b>`` when
        ``memberList.fullnametoids`` knows the name;
      * move "(Urgent Question)" from inside a bold speaker name to after it;
      * split with ``recomb`` and write each piece, replacing pieces matched
        by ``respeakervals`` with a ``<speaker ...>`` element built from
        ``memberList.matchdebatename``.

    Raises:
        ContextException: if the "in the Chair" phrase is missing for
            Westminster Hall, if name matching fails, or if an unmatched
            piece still looks like a speaker to ``recomb`` / ``remarginal``.
    """
    if typ == "westminhall":
        depspeakerrg = re.search(r"\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not depspeakerrg:
            raise ContextException("Can't find the [... in the Chair] phrase")
        depspeaker = depspeakerrg.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names
    missingbolds = re.findall(
        r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)',
        text)
    for p1, p2, p3, p4, p5 in missingbolds:
        missingbold = "%s%s%s%s%s" % (p1, p2, p3, p4, p5)
        bold = "%s<b>%s%s%s</b>" % (p1, p3, p4, p5)
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # Move Urgent Question out of speaker name
    # (inline flag now leads the pattern; Python 3.11+ rejects it elsewhere)
    urgentqns = re.findall(
        r'(?i)(<p>(?:<stamp aname="[^"]+"/>\s*)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>\s*)(:</b>)',
        text)
    for p1, p2, p3, p4 in urgentqns:
        urgentqn = "%s%s%s%s" % (p1, p2, p3, p4)
        correction = "%s%s%s%s" % (p1, p2, p4, p3)
        text = text.replace(urgentqn, correction)

    # setup for scanning through the file.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker detection regexp)
        if redivno.match(fss) or retabletext.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue

        # bold clause/chapter/part numbers and all-caps titles are not speakers
        # NOTE(review): the optional opening quote is a literal curly quote
        # in the source; it may have been an entity upstream -- confirm
        if re.match('<b>(“)?([0-9]+[A-Z]* .*|(CHAPTER|PART) [0-9]+[A-Z]*|[A-Z, ]+)</b>$', fss):
            fout.write(fss)
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            # we can use oqnum to detect oral questions
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            if oqnum:
                oqnum = ' oral-qnum="%s"' % oqnum
            else:
                oqnum = ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if it is a minister)
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = re.sub("\n", ' ', spstrbrack)

            # do quick substitution for dep speakers in westminster hall
            if typ == "westminhall" and re.search(r"(?i)deputy[ \-]speaker", spstr) and not spstrbrack:
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (
                anamestamp, result.encode("latin-1"), oqnum, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(' marginal speaker detection case: %s' % marginal.group(0),
                                   fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        fout.write(fss)
def FilterWransSpeakers(fout, text, sdate):
    """Write written-answers ``text`` to ``fout`` with speakers as tags.

    After ``ApplyFixSubstitutions`` and the missing-bold repair, the text is
    split on tables and bold phrases.  Tables and pieces without a bold
    phrase are left as they are; each bold phrase is treated as a speaker
    name, resolved with ``memberList.matchfullnamecons`` and replaced by a
    ``<speaker person_id=...>`` element.  The pieces are written out with
    ``fout.writelines`` at the end.

    Args:
        fout: object with a ``writelines`` method.
        text: the text to filter.
        sdate: date of the file ('YYYY-MM-DD' string; compared with such a
            literal below).

    Raises:
        ContextException: if a name cannot be matched at all.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in.  We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall(
        r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)', text)
    for p1, p2, p3, p4 in missingbolds:
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if memberList.fullnametoids(p3, sdate):
            if missingbold not in text:
                print("ERROR: missing bold text found, but then vanished when replacing")
            text = text.replace(missingbold, bold)

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = r'<b>.*?</b>(?:\s*:)?'
    # tables have bolds in them, so must be separated out
    ltableregexp = r'<table[^>]*>[\s\S]*?</table>'
    # inline flags lead the patterns: Python 3.11+ rejects them elsewhere
    tableregexp = '(?i)' + ltableregexp
    lregexp = '(?i)(%s|%s)' % (ltableregexp, lspeakerregexp)

    # setup for scanning through the file.
    fs = re.split(lregexp, text)

    def push_to_next(index, fragment):
        # Prepend fragment to the piece after index.  A bold phrase found in
        # the very last piece has no successor, so one is added rather than
        # failing with IndexError.
        if index + 1 >= len(fs):
            fs.append('')
        fs[index + 1] = fragment + fs[index + 1]

    # for error messages
    stampurl = StampUrl(sdate)

    for i in range(len(fs)):
        # the value returned by UpdateStampUrl is what gets examined below
        fss = stampurl.UpdateStampUrl(fs[i])

        if re.match(tableregexp, fss):
            continue

        speakerg = re.findall(r'(?i)<b>\s*([^:]*)[:\s]*?([^<:]*)</b>', fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = speakerg[0][0].strip()

        # trailing text after the colon in the bold speech bit
        if re.search(r'\S', speakerg[0][1]):
            push_to_next(i, speakerg[0][1])

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.findall(r'^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb[0][0].strip()
            push_to_next(i, sqb[0][1])

        # get rid of blank bold strings
        if not re.search(r'\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.findall(
                r'^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$', fs[i - 1])
            if oqnsep:
                fs[i - 1] = oqnsep[0][0] + oqnsep[0][2]
                boldnamestring = oqnsep[0][1] + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        deci = None
        if robj:
            (deci, boldnamestring) = robj.groups()
            # TODO: do something with deci here (it is the "failed
            # oral questions" signifier)

        # see if it is an explicitly bad/ambiguous name which will never match
        if boldnamestring.find('<broken-name>') >= 0:
            person_id = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (
                boldnamestring)
        else:
            # split bracketed cons out if present
            brakmatch = re.match(r"(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (person_id, remadename, remadecons) = memberList.matchfullnamecons(
                name, cons, sdate, alwaysmatchcons=False)
            if person_id and remadename:
                remadename = ' speakername="%s"' % (remadename)

            if not person_id:
                if remadename == "MultipleMatch":
                    if (boldnamestring == 'Mr. Michael Foster'
                            and remadecons[0] == 'uk.org.publicwhip/person/10209'):
                        # hard-wired resolution of this one ambiguous name
                        person_id = remadecons[0]
                        remadename = ' speakername="Michael Foster"'
                    else:
                        # every other multiple match is recorded as unknown,
                        # so person_id is never None when it is encoded below
                        person_id = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    person_id = 'uk.org.publicwhip/person/10170'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print(" No name,const match (%s,%s)" % (name, cons))
                    raise ContextException("No name match", stamp=stampurl,
                                           fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker person_id="%s"%s>%s</speaker>\n' % (
            person_id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)