matcher = '''<td><a href='(EDMByMember.aspx\?MID=\d+)\s+&SESSION=875'>(.*)[,.](.*)</a></td>\s+<td>(.*)</td>\s+<td>(.*)</td>\s+<td>(\d+)</td>''' matches = re.findall(matcher, content) for (url, last, first, cons, party, count) in matches: first = re.sub(" \(.*\)", "", first) fullname = first.strip() + " " + last.strip() id, name, cons = memberList.matchfullnamecons(fullname, cons, date_today) url = urlparse.urljoin(edm_index_url, url) if id: if id in aismembers: print >>sys.stderr, "Ignored repeated entry for " , id else: print '<memberinfo id="%s" edm_ais_url="%s" />' % (id, url) aismembers.add(id) else: print >>sys.stderr, "Failed to find '%s'" % (fullname) sys.stdout.flush() print '</publicwhip>' # Check we have everybody allmembers = sets.Set(memberList.currentmpslist()) symdiff = allmembers.symmetric_difference(aismembers) if len(symdiff) > 0: print >>sys.stderr, "Failed to get all MPs, these ones in symmetric difference" print >>sys.stderr, symdiff
# avoid ambiguity in debates parsing) if cons == 'Great Yarmouth' and name == 'Tony Wright': name = 'Anthony D Wright' id, canonname, canoncons = memberList.matchfullnamecons(name, cons, date_today) if not id: print >>sys.stderr, "Failed to match %s %s %s" % (name, cons, date_today) continue url = urlparse.urljoin(bbc_index_url, url) pid = memberList.membertoperson(id) if pid in bbcmembers: print >>sys.stderr, "Ignored repeated entry for " , pid else: print '<personinfo id="%s" bbc_profile_url="%s" />' % (pid, url) bbcmembers.add(pid) sys.stdout.flush() print '</publicwhip>' # Check we have everybody allmembers = sets.Set([ memberList.membertoperson(id) for id in memberList.currentmpslist() ]) symdiff = allmembers.symmetric_difference(bbcmembers) if len(symdiff) > 0: print >>sys.stderr, "Failed to get all MPs, these ones in symmetric difference" print >>sys.stderr, symdiff
if cons == 'Great Yarmouth' and name == 'Tony Wright': name = 'Anthony D Wright' id, canonname, canoncons = memberList.matchfullnamecons( name, cons, date_today) if not id: print >> sys.stderr, "Failed to match %s %s %s" % (name, cons, date_today) continue url = urlparse.urljoin(bbc_index_url, url) pid = memberList.membertoperson(id) if pid in bbcmembers: print >> sys.stderr, "Ignored repeated entry for ", pid else: print '<personinfo id="%s" bbc_profile_url="%s" />' % (pid, url) bbcmembers.add(pid) sys.stdout.flush() print '</publicwhip>' # Check we have everybody allmembers = sets.Set( [memberList.membertoperson(id) for id in memberList.currentmpslist()]) symdiff = allmembers.symmetric_difference(bbcmembers) if len(symdiff) > 0: print >> sys.stderr, "Failed to get all MPs, these ones in symmetric difference" print >> sys.stderr, symdiff