# Ad-hoc lookups against memberList / lordsList; each statement prints the
# value returned by one name-matching call so it can be checked by eye.

# Full name containing a non-ASCII character (unicode literal).
print memberList.matchfullnamecons(u"Si\xf4n Simon", "Birmingham Erdington", "2006-01-22")

# NOTE(review): indentation was lost in this view.  If this exit is
# unconditional, none of the lookups below are ever executed -- confirm
# whether it is a leftover from debugging.
sys.exit(0)

# Lords lookups by full name, with None passed as the second argument.
print lordsList.GetLordIDfname('Baroness Thatcher', None, '2006-05-01')
print lordsList.GetLordIDfname('The Archbishop of York', None, '2006-05-01')
print lordsList.GetLordIDfname('The Bishop of Southwell and Nottingham', None, '2006-05-01')

# Two different names tried against the same constituency on two dates.
print memberList.matchfullnamecons("Anne Moffat", "East Lothian", "2006-01-22")
print memberList.matchfullnamecons("Anne Picking", "East Lothian", "2006-01-22")
print memberList.matchfullnamecons("Anne Moffat", "East Lothian", "2004-01-22")
print memberList.matchfullnamecons("Anne Picking", "East Lothian", "2004-01-22")

# Same constituency name canonicalised for two different dates.
print memberList.canonicalcons("Aberdeen North", "2001-01-01")
print memberList.canonicalcons("Aberdeen North", "2005-05-06")

# Speakers identified by office title rather than by personal name.
print memberList.matchdebatename("Solicitor-General", None, "2003-11-21")
print memberList.matchdebatename("The Advocate-General for Scotland", None, "2004-07-30")

# Lookups keyed on a member id string.
print memberList.getmembersoneelection("uk.org.publicwhip/member/1238")
print memberList.getmembersoneelection("uk.org.publicwhip/member/1353")
print memberList.getmembersoneelection("uk.org.publicwhip/member/1357")

# Debate-name matching with and without a constituency hint.
print memberList.matchdebatename("Mr. Mackay", None, "2003-11-21")
print memberList.matchdebatename("James Marshall", None, "2003-11-21")
print memberList.matchdebatename("Gareth Thomas", "Clwyd, West", "2003-11-21")
print memberList.matchdebatename("Gareth Thomas", None, "2005-05-07")

# Two name/constituency pairs, each tried on 2005-04-01 and on 2005-05-07.
print memberList.matchfullnamecons("Mr. MacDonald", "Western Isles", "2005-04-01")
print memberList.matchfullnamecons("Mr. MacNeil", "Na h-Eileanan an Iar", "2005-04-01")
print memberList.matchfullnamecons("Mr. MacDonald", "Western Isles", "2005-05-07")
print memberList.matchfullnamecons("Mr. MacNeil", "Na h-Eileanan an Iar", "2005-05-07")
def FilterDebateSpeakers(fout, text, sdate, typ): if typ == "westminhall": depspeakerrg = re.search("\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text) if not depspeakerrg: raise ContextException("Can't find the [... in the Chair] phrase") depspeaker = depspeakerrg.group(1) # old style fixing (before patches existed) if typ == "debate": text = ApplyFixSubstitutions(text, sdate, fixsubs) # for error messages stampurl = StampUrl(sdate) # Fix missing bold tags around names missingbolds = re.findall('(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)', text) for p1,p2,p3,p4,p5 in missingbolds: missingbold = "%s%s%s%s%s" % (p1,p2,p3,p4,p5) bold = "%s<b>%s%s%s</b>" % (p1,p3,p4,p5) namematches = memberList.fullnametoids(p3, sdate) if namematches: if not missingbold in text: print "ERROR: missing bold text found, but then vanished when replacing" text = text.replace(missingbold, bold) # Move Urgent Question out of speaker name urgentqns = re.findall('(<p>(?:<stamp aname="[^"]+"/>)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>)(:</b>)(?i)', text) for p1,p2,p3,p4 in urgentqns: urgentqn = "%s%s%s%s" % (p1,p2,p3,p4) correction = "%s%s%s%s" % (p1,p2,p4,p3) text = text.replace(urgentqn, correction) # setup for scanning through the file. for fss in recomb.split(text): stampurl.UpdateStampUrl(fss) #print fss #print "--------------------" # division number detection (these get through the speaker detection regexp) if redivno.match(fss): fout.write(fss.encode("latin-1")) continue # CORRECTION title (these also get through) -- both these are surrounded by <center> tags usually. 
if fss == "<b>CORRECTION</b>": fout.write(fss.encode("latin-1")) continue # speaker detection speakerg = respeakervals.match(fss) if speakerg: # optional parts of the group # we can use oqnum to detect oral questions anamestamp = speakerg.group(4) or speakerg.group(3) or "" oqnum = speakerg.group(1) if speakerg.group(5): assert not oqnum oqnum = speakerg.group(5) if oqnum: oqnum = ' oral-qnum="%s"' % oqnum else: oqnum = "" # the preceding square bracket qnums sqbnum = speakerg.group(2) or "" party = speakerg.group(8) or speakerg.group(10) spstr = string.strip(speakerg.group(6)) spstrbrack = speakerg.group(7) or speakerg.group(9) # the bracketted phrase (sometimes the constituency or name if it is a minister) if spstrbrack: spstrbrack = re.sub("\n", ' ', spstrbrack) # do quick substitution for dep speakers in westminster hall if typ == "westminhall" and re.search("deputy[ \-]speaker(?i)", spstr) and not spstrbrack: #spstrbrack = depspeaker spstr = depspeaker # match the member to a unique identifier and displayname try: #print "spstr", spstr, ",", spstrbrack #print speakerg.groups() result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ) except Exception, e: # add extra stamp info to the exception raise ContextException(str(e), stamp=stampurl, fragment=fss) # put record in this place #print "ree", result.encode("latin-1") spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (anamestamp, result.encode("latin-1"), oqnum, spstr, sqbnum) fout.write(spxm) continue # nothing detected # check if we've missed anything obvious if recomb.match(fss): raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl) if remarginal.search(fss): raise ContextException(' marginal speaker detection case: %s' % remarginal.search(fss).group(0), fragment=fss, stamp=stampurl) # this is where we phase in the ascii encoding fout.write(fss)
def FilterDebateSpeakers(fout, text, sdate, typ): if typ == "westminhall": depspeakerrg = re.search("\[(.*?)(?:<i>)? ?in the Chair(?:</i>)?\]", text) if not depspeakerrg: raise ContextException("Can't find the [... in the Chair] phrase") depspeaker = depspeakerrg.group(1) # old style fixing (before patches existed) if typ == "debate": text = ApplyFixSubstitutions(text, sdate, fixsubs) # for error messages stampurl = StampUrl(sdate) # Fix missing bold tags around names missingbolds = re.findall('(\n?<p>(?:<stamp aname="[^"]+"/>)+)((?:<b></b>)?\s*)([A-Za-z.\-\s]+)((?:\([^)]*\)\s*)*)(:\s)', text) for p1,p2,p3,p4,p5 in missingbolds: missingbold = "%s%s%s%s%s" % (p1,p2,p3,p4,p5) bold = "%s<b>%s%s%s</b>" % (p1,p3,p4,p5) namematches = memberList.fullnametoids(p3, sdate) if namematches: if not missingbold in text: print "ERROR: missing bold text found, but then vanished when replacing" text = text.replace(missingbold, bold) # Move Urgent Question out of speaker name urgentqns = re.findall('(<p>(?:<stamp aname="[^"]+"/>\s*)+)(<b>[^<]*?)(\s*<i>\s*\(Urgent Question\)</i>\s*)(:</b>)(?i)', text) for p1,p2,p3,p4 in urgentqns: urgentqn = "%s%s%s%s" % (p1,p2,p3,p4) correction = "%s%s%s%s" % (p1,p2,p4,p3) text = text.replace(urgentqn, correction) # setup for scanning through the file. for fss in recomb.split(text): stampurl.UpdateStampUrl(fss) #print fss #print "--------------------" # division number detection (these get through the speaker detection regexp) if redivno.match(fss) or retabletext.match(fss): fout.write(fss.encode("latin-1")) continue # CORRECTION title (these also get through) -- both these are surrounded by <center> tags usually. 
if fss == "<b>CORRECTION</b>": fout.write(fss.encode("latin-1")) continue if re.match('<b>(“)?([0-9]+[A-Z]* .*|(CHAPTER|PART) [0-9]+[A-Z]*|[A-Z, ]+)</b>$', fss): fout.write(fss) continue # speaker detection speakerg = respeakervals.match(fss) if speakerg: # optional parts of the group # we can use oqnum to detect oral questions anamestamp = speakerg.group(4) or speakerg.group(3) or "" oqnum = speakerg.group(1) if speakerg.group(5): assert not oqnum oqnum = speakerg.group(5) if oqnum: oqnum = ' oral-qnum="%s"' % oqnum else: oqnum = "" # the preceding square bracket qnums sqbnum = speakerg.group(2) or "" party = speakerg.group(8) or speakerg.group(10) spstr = string.strip(speakerg.group(6)) spstrbrack = speakerg.group(7) or speakerg.group(9) # the bracketted phrase (sometimes the constituency or name if it is a minister) if spstrbrack: spstrbrack = re.sub("\n", ' ', spstrbrack) # do quick substitution for dep speakers in westminster hall if typ == "westminhall" and re.search("deputy[ \-]speaker(?i)", spstr) and not spstrbrack: #spstrbrack = depspeaker spstr = depspeaker # match the member to a unique identifier and displayname try: #print "spstr", spstr, ",", spstrbrack #print speakerg.groups() result = memberList.matchdebatename(spstr, spstrbrack, sdate, typ) except Exception, e: # add extra stamp info to the exception raise ContextException(str(e), stamp=stampurl, fragment=fss) # put record in this place #print "ree", result.encode("latin-1") spxm = '%s<speaker %s%s>%s</speaker>\n%s' % (anamestamp, result.encode("latin-1"), oqnum, spstr, sqbnum) fout.write(spxm) continue # nothing detected # check if we've missed anything obvious if recomb.match(fss): raise ContextException('regexpvals not general enough', fragment=fss, stamp=stampurl) if remarginal.search(fss): raise ContextException(' marginal speaker detection case: %s' % remarginal.search(fss).group(0), fragment=fss, stamp=stampurl) # this is where we phase in the ascii encoding fout.write(fss)