def FilterDebateSpeakers(fout, text, sdate, typ):
    """Mark up the speakers in a debate transcript and write the result to fout.

    The text is first patched up (bold tags are added around unbolded names
    that memberList recognises, and "(Urgent Question)" is moved out of the
    bold speaker name).  It is then split with the module-level ``recomb``
    pattern.  Fragments matched by ``respeakervals`` are replaced by a
    ``<speaker ...>`` tag built from ``memberList.matchdebatename``; every
    other fragment is written through.

    fout  -- output object; only its write() method is used
    text  -- the transcript text to filter
    sdate -- sitting date, passed on to the fix/matching helpers and StampUrl
    typ   -- transcript type; "westminhall" and "debate" get special handling

    Raises ContextException when the "[... in the Chair]" phrase is missing
    from a westminhall transcript, when memberList.matchdebatename raises,
    or when a fragment looks like a speaker heading but was not recognised.
    """
    # In Westminster Hall the chair is named in a "[... in the Chair]" phrase;
    # remember it so that "Deputy Speaker" headings can be substituted below.
    depspeaker = None
    if typ == "westminhall":
        chairmatch = re.search(r"\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not chairmatch:
            raise ContextException("Can't find the [... in the Chair] phrase")
        depspeaker = chairmatch.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names
    missingbolds = re.findall(r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)', text)
    for p1, p2, p3, p4, p5 in missingbolds:
        # only touch the text when the unbolded words are a known name
        if not memberList.fullnametoids(p3, sdate):
            continue
        missingbold = "%s%s%s%s%s" % (p1, p2, p3, p4, p5)
        # p2 (an empty <b></b> and/or whitespace) is deliberately dropped
        bold = "%s<b>%s%s%s</b>" % (p1, p3, p4, p5)
        if missingbold not in text:
            print("ERROR: missing bold text found, but then vanished when replacing")
        text = text.replace(missingbold, bold)

    # Move Urgent Question out of speaker name.
    # The case-insensitive flag is passed as an argument: a global "(?i)" at
    # the end of a pattern is rejected by newer versions of the re module.
    urgentqns = re.findall(r'(<p>(?:<stamp aname="[^"]+"/>)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>)(:</b>)', text, re.I)
    for p1, p2, p3, p4 in urgentqns:
        urgentqn = "%s%s%s%s" % (p1, p2, p3, p4)
        correction = "%s%s%s%s" % (p1, p2, p4, p3)
        text = text.replace(urgentqn, correction)

    # compiled once, used for every speaker fragment
    redepspeaker = re.compile(r"deputy[ \-]speaker", re.I)

    # setup for scanning through the file.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker detection regexp)
        if redivno.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are
        # surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""

            # we can use oqnum to detect oral questions; it can come from
            # group 1 or group 5, but never both
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            if oqnum:
                oqnum = ' oral-qnum="%s"' % oqnum
            else:
                oqnum = ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if it is a minister)
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = re.sub(r"\n", ' ', spstrbrack)

            # do quick substitution for dep speakers in westminster hall
            if typ == "westminhall" and redepspeaker.search(spstr) and not spstrbrack:
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (anamestamp, result.encode("latin-1"), oqnum, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(' marginal speaker detection case: %s' % marginal.group(0), fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        fout.write(fss)
def FilterWransSpeakers(fout, text, sdate):
    """Replace bold speaker names in written-answers text with <speaker> tags.

    The text is patched with ApplyFixSubstitutions, bold tags are added
    around unbolded names that memberList recognises, and the text is then
    split on tables and bold spans.  Each bold span that holds a name is
    replaced by a ``<speaker speakerid="..." ...>`` element; all pieces are
    finally written with fout.writelines().

    fout  -- output object; only its writelines() method is used
    text  -- the written-answers text to filter
    sdate -- sitting date, passed on to the fix/matching helpers and StampUrl

    Raises ContextException("No name match") when a bold name cannot be
    matched to a member.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in.  We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall(r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)', text)
    for p1, p2, p3, p4 in missingbolds:
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if not memberList.fullnametoids(p3, sdate):
            continue
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        if missingbold not in text:
            print("ERROR: missing bold text found, but then vanished when replacing")
        text = text.replace(missingbold, bold)

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = r'<b>.*?</b>(?:\s*:)?'
    ltableregexp = r'<table[^>]*>[\s\S]*?</table>'  # these have bolds, so must be separated out

    # The patterns are case-insensitive.  The flag is given to re.compile
    # rather than as a trailing "(?i)", which newer re versions reject.
    retable = re.compile(ltableregexp, re.I)
    resplit = re.compile('(%s|%s)' % (ltableregexp, lspeakerregexp), re.I)
    reboldname = re.compile(r'<b>\s*([^:]*)[:\s]*?([^<:]*)</b>', re.I)

    # setup for scanning through the file.
    fs = resplit.split(text)

    # for error messages
    stampurl = StampUrl(sdate)

    # Index loop on purpose: the body rewrites fs[i-1], fs[i] and fs[i+1].
    for i in range(len(fs)):
        fss = stampurl.UpdateStampUrl(fs[i])  # Speakers have new stamps in them

        if retable.match(fss):
            continue
        speakerg = reboldname.search(fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = speakerg.group(1).strip()

        # trailing text after the colon in the bold speech bit
        trailing = speakerg.group(2)
        if re.search(r'\S', trailing):
            fs[i + 1] = trailing + fs[i + 1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.search(r'^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb.group(1).strip()
            fs[i + 1] = sqb.group(2) + fs[i + 1]

        # get rid of blank bold strings
        if not re.search(r'\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.search(r'^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$', fs[i - 1])
            if oqnsep:
                fs[i - 1] = oqnsep.group(1) + oqnsep.group(3)
                boldnamestring = oqnsep.group(2) + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        if robj:
            # TODO: do something with group(1) here (it is the "failed
            # oral questions" signifier); for now it is dropped
            boldnamestring = robj.group(2)

        # see if it is an explicitly bad/ambiguous name which will never match
        if boldnamestring.find('<broken-name>') >= 0:
            person_id = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (boldnamestring)

        else:
            # split bracketed cons out if present
            brakmatch = re.match(r"(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (person_id, remadename, remadecons) = memberList.matchfullnamecons(name, cons, sdate, alwaysmatchcons=False)
            if person_id and remadename:
                remadename = ' speakername="%s"' % (remadename)

            if not person_id:
                if remadename == "MultipleMatch":
                    # hard-coded disambiguation for one known ambiguous name
                    if boldnamestring == 'Mr. Michael Foster':
                        if remadecons[1] == 'uk.org.publicwhip/member/1939':
                            person_id = remadecons[1]
                        elif remadecons[0] == 'uk.org.publicwhip/member/896':
                            person_id = remadecons[0]
                    if person_id:
                        remadename = ' speakername="Michael Foster"'
                    else:
                        # Also reached when the special case above did not
                        # resolve, so person_id is never None at .encode() below.
                        person_id = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    person_id = 'uk.org.publicwhip/member/40316'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print(" No name,const match (%s,%s)" % (name, cons))
                    raise ContextException("No name match", stamp=stampurl, fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker speakerid="%s"%s>%s</speaker>\n' % \
            (person_id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)
def FilterDebateSpeakers(fout, text, sdate, typ):
    """Mark up the speakers in a debate transcript and write the result to fout.

    The text is first patched up (bold tags are added around unbolded names
    that memberList recognises, and "(Urgent Question)" is moved out of the
    bold speaker name).  It is then split with the module-level ``recomb``
    pattern.  Fragments matched by ``respeakervals`` are replaced by a
    ``<speaker ...>`` tag built from ``memberList.matchdebatename``; division
    numbers, table text, CORRECTION titles and bold headings are written
    through, as is every other fragment.

    fout  -- output object; only its write() method is used
    text  -- the transcript text to filter
    sdate -- sitting date, passed on to the fix/matching helpers and StampUrl
    typ   -- transcript type; "westminhall" and "debate" get special handling

    Raises ContextException when the "[... in the Chair]" phrase is missing
    from a westminhall transcript, when memberList.matchdebatename raises,
    or when a fragment looks like a speaker heading but was not recognised.
    """
    # In Westminster Hall the chair is named in a "[... in the Chair]" phrase;
    # remember it so that "Deputy Speaker" headings can be substituted below.
    depspeaker = None
    if typ == "westminhall":
        chairmatch = re.search(r"\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text)
        if not chairmatch:
            raise ContextException("Can't find the [... in the Chair] phrase")
        depspeaker = chairmatch.group(1)

    # old style fixing (before patches existed)
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # for error messages
    stampurl = StampUrl(sdate)

    # Fix missing bold tags around names
    missingbolds = re.findall(r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)', text)
    for p1, p2, p3, p4, p5 in missingbolds:
        # only touch the text when the unbolded words are a known name
        if not memberList.fullnametoids(p3, sdate):
            continue
        missingbold = "%s%s%s%s%s" % (p1, p2, p3, p4, p5)
        # p2 (an empty <b></b> and/or whitespace) is deliberately dropped
        bold = "%s<b>%s%s%s</b>" % (p1, p3, p4, p5)
        if missingbold not in text:
            print("ERROR: missing bold text found, but then vanished when replacing")
        text = text.replace(missingbold, bold)

    # Move Urgent Question out of speaker name.
    # The case-insensitive flag is passed as an argument: a global "(?i)" at
    # the end of a pattern is rejected by newer versions of the re module.
    urgentqns = re.findall(r'(<p>(?:<stamp aname="[^"]+"/>\s*)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>\s*)(:</b>)', text, re.I)
    for p1, p2, p3, p4 in urgentqns:
        urgentqn = "%s%s%s%s" % (p1, p2, p3, p4)
        correction = "%s%s%s%s" % (p1, p2, p4, p3)
        text = text.replace(urgentqn, correction)

    # compiled once, used for every speaker fragment
    redepspeaker = re.compile(r"deputy[ \-]speaker", re.I)

    # setup for scanning through the file.
    for fss in recomb.split(text):
        stampurl.UpdateStampUrl(fss)

        # division number detection (these get through the speaker detection
        # regexp); table text is passed through the same way
        if redivno.match(fss) or retabletext.match(fss):
            fout.write(fss.encode("latin-1"))
            continue

        # CORRECTION title (these also get through) -- both these are
        # surrounded by <center> tags usually.
        if fss == "<b>CORRECTION</b>":
            fout.write(fss.encode("latin-1"))
            continue

        # Bold fragments that start with a number, are a CHAPTER/PART heading,
        # or consist only of capitals, commas and spaces are not speakers.
        if re.match('<b>(“)?([0-9]+[A-Z]* .*|(CHAPTER|PART) [0-9]+[A-Z]*|[A-Z, ]+)</b>$', fss):
            fout.write(fss)
            continue

        # speaker detection
        speakerg = respeakervals.match(fss)
        if speakerg:
            # optional parts of the group
            anamestamp = speakerg.group(4) or speakerg.group(3) or ""

            # we can use oqnum to detect oral questions; it can come from
            # group 1 or group 5, but never both
            oqnum = speakerg.group(1)
            if speakerg.group(5):
                assert not oqnum
                oqnum = speakerg.group(5)
            if oqnum:
                oqnum = ' oral-qnum="%s"' % oqnum
            else:
                oqnum = ""

            # the preceding square bracket qnums
            sqbnum = speakerg.group(2) or ""

            spstr = speakerg.group(6).strip()
            # the bracketted phrase (sometimes the constituency or name if it is a minister)
            spstrbrack = speakerg.group(7) or speakerg.group(9)
            if spstrbrack:
                spstrbrack = re.sub(r"\n", ' ', spstrbrack)

            # do quick substitution for dep speakers in westminster hall
            if typ == "westminhall" and redepspeaker.search(spstr) and not spstrbrack:
                spstr = depspeaker

            # match the member to a unique identifier and displayname
            try:
                result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ)
            except Exception as e:
                # add extra stamp info to the exception
                raise ContextException(str(e), stamp=stampurl, fragment=fss)

            # put record in this place
            spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (anamestamp, result.encode("latin-1"), oqnum, spstr, sqbnum)
            fout.write(spxm)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl)
        marginal = remarginal.search(fss)
        if marginal:
            raise ContextException(' marginal speaker detection case: %s' % marginal.group(0), fragment=fss, stamp=stampurl)

        # this is where we phase in the ascii encoding
        fout.write(fss)
def FilterWransColnum(fout, text, sdate):
    """Turn column headings and anchors in written-answers text into <stamp> tags.

    The text is split with the module-level ``recomb`` pattern and each
    fragment is classified:

    * a column heading (``recolumnumvals``) is checked against sdate and the
      previous column number, then replaced by a coldate/colnum stamp;
    * a column continuation (``recolnumcontvals``) is validated and replaced
      by a single space, or by a new stamp when it announces the next column;
    * an anchor (``reanamevals``) is replaced by an aname stamp;
    * anything else is written through unchanged.

    fout  -- output object; only its write() method is used
    text  -- the written-answers text to filter
    sdate -- sitting date; column dates found in the text must equal it

    Raises ContextException when a date or column number is inconsistent, or
    when an unclassified fragment still matches ``recomb``.
    """
    # Legacy individual substitution rules, then remove junk
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    text = text.replace("{**con**}{**/con**}", "")

    stamp = StampUrl(sdate)  # for error messages
    colnum = -1  # -1 means no column number has been recorded yet

    for fss in recomb.split(text):
        # ---- full column heading ----
        columng = recolumnumvals.match(fss)
        if columng:
            headdate = mx.DateTime.DateTimeFrom(columng.group(1)).date
            if sdate != headdate:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp)

            lcolnum = string.atoi(columng.group(2))
            # Column numbers do get skipped during division listings, so
            # only a number lower than the previous one is an error.
            if colnum != -1 and lcolnum < colnum:
                raise ContextException("Colnum not incrementing %d -- %s" % (lcolnum, fss), fragment=fss, stamp=stamp)

            colnum = lcolnum
            stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, lcolnum)
            fout.write(' ')
            fout.write(stamp.stamp)
            continue

        # ---- column continuation ----
        columncontg = recolnumcontvals.match(fss)
        if columncontg:
            contdate = columncontg.group(1) or columncontg.group(3) or None
            if contdate:
                contdate = mx.DateTime.DateTimeFrom(contdate).date
                if sdate != contdate:
                    raise ContextException("Cont column date disagrees %s -- %s" % (sdate, fss),
                                           fragment=fss, stamp=stamp)
                lcolnum = string.atoi(columncontg.group(2) or columncontg.group(4) or None)
                # the number is only enforced for sittings before 2006-05-08
                if colnum != lcolnum and sdate < '2006-05-08':
                    raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                           fragment=fss, stamp=stamp)
                # no need to output anything
                fout.write(' ')
                continue

            if columncontg.group(5):
                lcolnum = string.atoi(columncontg.group(5))
                # accept the current column or the one before it
                if not (colnum == lcolnum or colnum == lcolnum + 1):
                    raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                           fragment=fss, stamp=stamp)
                fout.write(' ')
                continue

            if columncontg.group(6):
                lcolnum = string.atoi(columncontg.group(6))
                # this form must name the column right after the current one
                if lcolnum != colnum + 1:
                    raise ContextException("Cont column number disagrees %d -- %s" % (colnum, fss),
                                           fragment=fss, stamp=stamp)
                colnum = lcolnum
                stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, lcolnum)
                fout.write(' ')
                fout.write(stamp.stamp)
                continue

        # ---- anchor names from HTML <a name="xxx"> ----
        anameg = reanamevals.match(fss)
        if anameg:
            stamp.aname = '<stamp aname="%s"/>' % anameg.group(1)
            fout.write(stamp.aname)
            continue

        # ---- nothing detected: check if we've missed anything obvious ----
        if recomb.match(fss):
            raise ContextException('regexpvals not general enough', fragment=fss, stamp=stamp)

        # (the marginal colnum detection check was removed 2007-05-25)
        fout.write(fss)
def FilterWransColnum(fout, text, sdate):
    """Rewrite column markers and anchors of written answers as <stamp> tags.

    Every fragment produced by ``recomb.split(text)`` is handled in turn:
    column headings and continuations are validated against sdate and the
    running column number, anchors become aname stamps, and all remaining
    text is written to fout unchanged.

    fout  -- output object; only its write() method is used
    text  -- the written-answers text to filter
    sdate -- sitting date; column dates found in the text must equal it

    Raises ContextException on a date mismatch, on an inconsistent column
    number, or when an unhandled fragment still matches ``recomb``.
    """
    # Legacy individual substitution rules
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Remove junk
    text = text.replace("{**con**}{**/con**}", "")

    stamp = StampUrl(sdate)  # for error messages

    def _write_column_stamp(number):
        # Record the new column stamp on the StampUrl object and emit it,
        # preceded by a single space.
        stamp.stamp = '<stamp coldate="%s" colnum="%sW"/>' % (sdate, number)
        fout.write(" ")
        fout.write(stamp.stamp)

    colnum = -1  # -1 means no column number has been recorded yet
    for fss in recomb.split(text):
        heading = recolumnumvals.match(fss)
        if heading:
            ldate = mx.DateTime.DateTimeFrom(heading.group(1)).date
            if sdate != ldate:
                raise ContextException("Column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp)

            lcolnum = string.atoi(heading.group(2))
            firstorsuccessor = (colnum == -1) or (lcolnum == colnum + 1)
            # column numbers do get skipped during division listings, so a
            # jump forward is tolerated; only going backwards is an error
            if not firstorsuccessor and lcolnum < colnum:
                raise ContextException("Colnum not incrementing %d -- %s" % (lcolnum, fss), fragment=fss, stamp=stamp)
            colnum = lcolnum
            _write_column_stamp(lcolnum)
            continue

        cont = recolnumcontvals.match(fss)
        if cont:
            ldate = cont.group(1) or cont.group(3) or None
            lcolnum = cont.group(2) or cont.group(4) or None

            if ldate:
                # continuation carrying a date and a column number
                ldate = mx.DateTime.DateTimeFrom(ldate).date
                if sdate != ldate:
                    raise ContextException(
                        "Cont column date disagrees %s -- %s" % (sdate, fss), fragment=fss, stamp=stamp
                    )
                lcolnum = string.atoi(lcolnum)
                if sdate < "2006-05-08" and colnum != lcolnum:
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" % (colnum, fss), fragment=fss, stamp=stamp
                    )
                # no need to output anything
                fout.write(" ")
                continue

            if cont.group(5):
                # may refer to the current column or the one before it
                lcolnum = string.atoi(cont.group(5))
                if colnum not in (lcolnum, lcolnum + 1):
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" % (colnum, fss), fragment=fss, stamp=stamp
                    )
                fout.write(" ")
                continue

            if cont.group(6):
                # announces the next column, so it advances the counter
                lcolnum = string.atoi(cont.group(6))
                if colnum + 1 != lcolnum:
                    raise ContextException(
                        "Cont column number disagrees %d -- %s" % (colnum, fss), fragment=fss, stamp=stamp
                    )
                colnum = lcolnum
                _write_column_stamp(lcolnum)
                continue

        # anchor names from HTML <a name="xxx">
        anchor = reanamevals.match(fss)
        if anchor:
            aname = anchor.group(1)
            stamp.aname = '<stamp aname="%s"/>' % aname
            fout.write(stamp.aname)
            continue

        # nothing detected
        # check if we've missed anything obvious
        if recomb.match(fss):
            raise ContextException("regexpvals not general enough", fragment=fss, stamp=stamp)

        # (the marginal colnum detection check was removed 2007-05-25)
        fout.write(fss)
def FilterWransSpeakers(fout, text, sdate):
    """Replace bold speaker names in written-answers text with <speaker> tags.

    The text is patched with ApplyFixSubstitutions, bold tags are added
    around unbolded names that memberList recognises, and the text is then
    split on tables and bold spans.  Each bold span that holds a name is
    replaced by a ``<speaker person_id="..." ...>`` element; all pieces are
    finally written with fout.writelines().

    fout  -- output object; only its writelines() method is used
    text  -- the written-answers text to filter
    sdate -- sitting date, passed on to the fix/matching helpers and StampUrl

    Raises ContextException("No name match") when a bold name cannot be
    matched to a person.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    # Fix things like this, to put bold in.  We use bold below to detect names, but
    # occasionally the reporters miss it out, and we catch such cases here:
    # <p><a name="qnpa_0">Caroline Flint: This information is not held centrally. </p>
    # <p><a name="qnpa_15">Ms Harman: The information can be found in the following table. </p>
    missingbolds = re.findall(r'(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)(:\s)', text)
    for p1, p2, p3, p4 in missingbolds:
        # Only fix if we found a matching name in the middle (and do it even if ambiguous)
        if not memberList.fullnametoids(p3, sdate):
            continue
        missingbold = "%s%s%s%s" % (p1, p2, p3, p4)
        bold = "%s<b>%s%s</b>" % (p1, p3, p4)
        if missingbold not in text:
            print("ERROR: missing bold text found, but then vanished when replacing")
        text = text.replace(missingbold, bold)

    # <B> Mrs. Iris Robinson: </B>
    lspeakerregexp = r'<b>.*?</b>(?:\s*:)?'
    ltableregexp = r'<table[^>]*>[\s\S]*?</table>'  # these have bolds, so must be separated out

    # The patterns are case-insensitive.  The flag is given to re.compile
    # rather than as a trailing "(?i)", which newer re versions reject.
    retable = re.compile(ltableregexp, re.I)
    resplit = re.compile('(%s|%s)' % (ltableregexp, lspeakerregexp), re.I)
    reboldname = re.compile(r'<b>\s*([^:]*)[:\s]*?([^<:]*)</b>', re.I)

    # setup for scanning through the file.
    fs = resplit.split(text)

    # for error messages
    stampurl = StampUrl(sdate)

    # Index loop on purpose: the body rewrites fs[i-1], fs[i] and fs[i+1].
    for i in range(len(fs)):
        fss = stampurl.UpdateStampUrl(fs[i])  # Speakers have new stamps in them

        if retable.match(fss):
            continue
        speakerg = reboldname.search(fss)
        if not speakerg:
            continue

        # we have a string in bold
        boldnamestring = speakerg.group(1).strip()

        # trailing text after the colon in the bold speech bit
        trailing = speakerg.group(2)
        if re.search(r'\S', trailing):
            fs[i + 1] = trailing + fs[i + 1]

        # push the square brackets outside of the boldstring if there is one
        # <B> Mr. Miliband [ </B> <i>holding answer 24 March</i>]:
        sqb = re.search(r'^([^\[]*)(\[.*)$', boldnamestring)
        if sqb:
            boldnamestring = sqb.group(1).strip()
            fs[i + 1] = sqb.group(2) + fs[i + 1]

        # get rid of blank bold strings
        if not re.search(r'\S', boldnamestring):
            fs[i] = ''
            continue

        # try to pull in the question number if preceding
        # These signify aborted oral questions, and are normally
        # useless and at the start of the page.
        # 27. <B> Mr. Steen: </B>
        if i > 0:
            oqnsep = re.search(r'^([\s\S]*?)Q?(\d+\.?)(\s*?(?:<stamp aname=".*?"/>)?)$', fs[i - 1])
            if oqnsep:
                fs[i - 1] = oqnsep.group(1) + oqnsep.group(3)
                boldnamestring = oqnsep.group(2) + ' ' + boldnamestring

        # take out the initial digits and a dot which we may have just put in
        # (although sometimes it would have already been there)
        robj = re.match(r"(\d*\.? )(.*)$", boldnamestring)
        if robj:
            # TODO: do something with group(1) here (it is the "failed
            # oral questions" signifier); for now it is dropped
            boldnamestring = robj.group(2)

        # see if it is an explicitly bad/ambiguous name which will never match
        if boldnamestring.find('<broken-name>') >= 0:
            person_id = 'unknown'
            boldnamestring = boldnamestring.replace('<broken-name>', '')
            remadename = ' speakername="%s" error="Name ambiguous in Hansard"' % (boldnamestring)

        else:
            # split bracketed cons out if present
            brakmatch = re.match(r"(.*)\s+\((.*)\)", boldnamestring)
            if brakmatch:
                (name, cons) = brakmatch.groups()
            else:
                (name, cons) = (boldnamestring, None)

            # match the member to a unique identifier
            (person_id, remadename, remadecons) = memberList.matchfullnamecons(name, cons, sdate, alwaysmatchcons=False)
            if person_id and remadename:
                remadename = ' speakername="%s"' % (remadename)

            if not person_id:
                if remadename == "MultipleMatch":
                    # hard-coded disambiguation for one known ambiguous name
                    if (boldnamestring == 'Mr. Michael Foster'
                            and remadecons[0] == 'uk.org.publicwhip/person/10209'):
                        person_id = remadecons[0]
                        remadename = ' speakername="Michael Foster"'
                    else:
                        # Also reached when the special case above did not
                        # resolve, so person_id is never None at .encode() below.
                        person_id = 'unknown'
                        remadename = ' speakername="%s" error="MultipleMatch"' % boldnamestring
                elif boldnamestring == 'Jim Dobbin' and sdate == '2014-09-08':
                    person_id = 'uk.org.publicwhip/person/10170'
                    remadename = ' speakername="Jim Dobbin"'
                else:
                    print(" No name,const match (%s,%s)" % (name, cons))
                    raise ContextException("No name match", stamp=stampurl, fragment=boldnamestring)

        # put record in this place
        fs[i] = '<speaker person_id="%s"%s>%s</speaker>\n' % \
            (person_id.encode("latin-1"), remadename.encode("latin-1"), boldnamestring)

    # scan through everything and output it into the file
    fout.writelines(fs)