def extract_table(table, url, date=None):
    """Parse one roll-call vote table into a dict listing the voting MEPs.

    The first row of the table is treated as the header: its first cell is
    converted to the integer total and its second cell must be one of the
    vote types "+", "-" or "0". In every following row the ``<p>`` elements
    of the second cell are read as comma-separated MEP names, each of which
    is looked up with ``db.getMep``.

    Args:
        table: element whose ``xpath()`` yields the ``tr``/``td``/``p`` nodes.
        url: source of the table; only used in log and error messages.
        date: forwarded to ``db.getMep`` as the ``date`` keyword.

    Returns:
        ``{'total': int, 'type': str, 'meps': list}`` where every entry of
        ``meps`` is the id returned by ``db.getMep`` or, when the lookup
        returned nothing, the raw name string.

    Raises:
        ValueError: the table has no rows, a row has fewer than two cells,
            the header type is not "+", "-" or "0", or the total is not a
            valid integer.
    """
    rows = table.xpath('.//tr')
    if not rows:
        # an empty table used to escape as IndexError; report it like every
        # other malformed-table case
        log(1, "vote table has no rows: %s" % url)
        raise ValueError("vote table has no rows: %s" % url)

    header = rows[0]
    header_cells = header.xpath('.//td')
    if len(header_cells) < 2:
        log(1, "vote table has less than two columns in the header: %s %s" % (url, tostring(header)))
        raise ValueError("vote table has less than two columns in the header: %s" % url)

    vote_type = junws(header_cells[1])
    if vote_type not in {"+", "-", "0"}:
        log(1, "vote header type is unexpected value %s in %s" % (repr(vote_type), url))
        raise ValueError("vote header type is unexpected value %s in %s" % (repr(vote_type), url))

    res = {'total': int(junws(header_cells[0])),
           'type': vote_type,
           'meps': []}

    for row in rows[1:]:
        cells = row.xpath('.//td')
        if len(cells) < 2:
            log(1, "vote table has less than two columns in the body: %s %s" % (url, tostring(row)))
            raise ValueError("vote table has less than two columns in the body: %s" % url)
        # the first cell is not used; only the second cell is parsed
        for paragraph in cells[1].xpath(".//p"):
            names = junws(paragraph)
            if not names:
                continue
            for name in names.split(','):
                name = unws(name)
                if not name:
                    continue
                mepid = db.getMep(name, date=date)
                if not mepid:
                    log(2, "could not resolve MEP name: %s" % name)
                # keep the raw name when no id could be resolved
                res['meps'].append(mepid or name)
    return res
def scrape(url, **kwargs):
    """Fetch a plenary roll-call vote XML document and store every vote in it.

    For each ``RollCallVote.Result`` element a vote dict is built (timestamp,
    url, vote id, title, whatever ``votemeta`` adds, and the per-type
    ``votes`` breakdown), handed to ``process(..., db.vote, 'ep_votes', ...)``
    and collected.

    Args:
        url: location passed to ``getXML``; also recorded in each vote.
        **kwargs: accepted but not used by this function.

    Returns:
        The list of vote dicts, or ``None`` when ``getXML`` returned ``None``.

    Raises:
        ValueError: a vote has neither an ``Identifier`` nor a ``Number``
            attribute, or its ``Date`` does not match the expected formats.
    """
    log(3, "scraping %s" % (url))
    root = getXML(url)
    if root is None:
        # the url used to be passed as a stray third argument instead of
        # being formatted into the message
        log(1, "could not get votes for %s" % url)
        return
    log(3, "processing plenary votes xml from %s" % url)
    # root is:
    # PV.RollCallVoteResults EP.Number="PE 533.923" EP.Reference="P7_PV(2014)04-17"
    #   Sitting.Date="2014-04-17" Sitting.Identifier="1598443"
    votes = []
    for vote in root.xpath('//RollCallVote.Result'):
        # the Date attribute comes either as "date time" or as a bare date
        stamp = vote.get('Date')
        if len(stamp.split()) == 2:
            ts = datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
        else:
            ts = datetime.strptime(stamp, "%Y-%m-%d")

        identifier = vote.get('Identifier')
        if identifier:
            voteid = int(identifier)
        else:
            number = vote.get('Number')
            if not number:
                log(1, "blimey, could not deduce an id for the vote in %s" % url)
                raise ValueError("no id for vote in %s" % url)
            # fallback id is a string built from timestamp and number
            voteid = "%s-%s" % (ts, number)

        titles = vote.xpath("RollCallVote.Description.Text")
        if len(titles) != 1:
            # voteid may be the string fallback above, so it must be
            # formatted with %s (%d raised TypeError for such votes)
            log(2, "holy ambiguity Batman! This vote doesn't have one title, but %d: %s %s" % (len(titles), voteid, url))
            title = "!unknown!"
        else:
            title = junws(titles[0])

        v = {u"ts": ts,
             u"url": url,
             u"voteid": voteid,
             u"title": title,
             'votes': {}}
        v.update(votemeta(v['title'], v['ts']))
        if 'epref' not in v:
            ref = vote.xpath("RollCallVote.Description.Text/a/text()")
            if ref:
                v['epref'] = unws(ref[0])

        for path, stype in [('Result.For', '+'), ('Result.Against', '-'), ('Result.Abstention', '0')]:
            results = vote.xpath(path)
            if not results:
                continue
            if len(results) > 1:
                log(2, "[pff] more than one %s entry in vote (id:%s) in %s" % (stype, v['voteid'], url))
            # only the first entry of each kind is used
            result = results[0]
            groups = {}
            v['votes'][stype] = {'total': int(result.get('Number')),
                                 'groups': groups}
            for group in result.xpath('Result.PoliticalGroup.List'):
                g = str(group.get('Identifier'))
                members = groups.setdefault(g, [])
                for tag in ['Member.Name', 'PoliticalGroup.Member.Name']:
                    for mep in group.xpath(tag):
                        m = {}
                        name = junws(mep)
                        # resolution order: PersId attribute, db lookup by
                        # name/date/group, then the lost_meps table
                        mepid = mep.get("PersId")
                        if mepid:
                            mepid = int(mepid)
                        else:
                            mepid = db.getMep(name, v['ts'], abbr=g)
                        if not mepid:
                            mepid = lost_meps.get(mep.get('MepId'))
                        if mepid:
                            m['mepid'] = mepid
                        else:
                            m['name'] = name
                            # it's a totally useless and confusing id that is nowhere else used
                            m['obscure_id'] = int(mep.get('MepId'))
                        members.append(m)
        # save
        process(v, v['voteid'], db.vote, 'ep_votes', v['title'])
        votes.append(v)
    return votes
def parse_block(block, url, reference, date, committee, rapporteur, PE):
    """Turn the text lines of one amendment into an amendment dict.

    ``block`` is consumed in place: recognised parts are deleted from the
    list as they are parsed. The parts are handled in this order: sequence
    number (first line), trailing justification, original-language marker,
    old/new text table, authors, compromise info, location.

    Args:
        block: list of text lines of a single amendment; mutated.
        url: stored as ``src`` and used in log messages.
        reference: stored as ``reference``.
        date: stored as ``date`` and passed to ``db.getMep``.
        committee: stored as ``committee``.
        rapporteur: fallback author(s), a list of names or a string that is
            split with ``splitNames``; used when no author lines are found.
        PE: document id; stored as ``peid`` and, cut at the first ``'v'``,
            used as prefix of the amendment ``id``.

    Returns:
        The amendment dict. Besides the keys above it may hold ``seq``,
        ``id``, ``justification``, ``orig_lang``, ``old``, ``new``,
        ``authors``, ``meps``, ``compromise``, ``location`` and ``rest``.
        ``None`` is returned when no table heading is found and no line
        equals 'Does not affect English version.'.

    NOTE(review): ``strip``, ``unws``, ``istype``, ``splitNames``, ``types``
    and ``locstarts`` live outside this function; ``strip(block)`` presumably
    trims blank lines from the list in place - confirm.
    """
    am = {u'src': url,
          u'peid': PE,
          u'reference': reference,
          u'date': date,
          u'committee': committee}

    def add_meps(names):
        # resolve each author name to an MEP id; unresolved names are logged
        for text in names:
            mepid = db.getMep(text, date)
            if mepid:
                am.setdefault('meps', []).append(mepid)
            else:
                log(3, "fix %s" % text)

    # get title: the second word of the first line is the sequence number
    try:
        am[u'seq'] = int(unws(block[0]).split()[1])
    except ValueError:
        am[u'seq'] = unws(block[0]).split()[1]
    except IndexError:
        log(2, "wrong seq %s" % (block[0]))
        am[u'seq'] = unws(block[0])
    del block[0]

    pefix = PE.split('v')[0]  # we strip off the v0[0-9]-[0-9]{1,2} part of the PEID
    am['id'] = "%s-%s" % (pefix, am['seq'])

    strip(block)

    # find and strip justification (searched from the end, never in lines 0-2)
    i = len(block) - 1
    while i > 2 and not (unws(block[i]) == "Justification" and block[i].startswith(' ' * 6)):
        i -= 1
    if i > 2:
        if i < len(block) - 1 and (not unws(block[i + 1]) or not block[i + 1].startswith(' ')):
            am['justification'] = '\n'.join(block[i + 2:])
            del block[i:]
            strip(block)
        else:
            log(2, 'wrong justification in %s: "%s"' % (am['seq'], '\\n'.join(block[i:])))

    # get original language; the emptiness guard avoids an IndexError when
    # nothing but the title line was present
    if block and 4 < len(unws(block[-1])) <= 6 and unws(block[-1]).startswith('Or.'):
        am['orig_lang'] = unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    column_headings = (" Amendment",
                       " PARTICULARS",
                       " Remedy",
                       " Amended text",
                       " Amendement",
                       " Amendments by Parliament",
                       " Proposal for rejection",
                       " Proposal for a rejection",
                       " Does not affect English version",
                       " (Does not affect English version)",
                       " Amendment by Parliament")
    i = len(block) - 1
    while (i > 2
           and not (block[i].endswith(column_headings) and len(block[i]) > 33)
           and not (unws(block[i]) == 'Text proposed by the Commission'
                    or unws(block[i]) in types)):
        i -= 1
    if i > 2:
        seq = False
        key = None  # only meaningful in sequential mode
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceding original text
            j = i
            while (j > 2 and not (unws(block[j]) in types
                                  or unws(block[j]) == 'Text proposed by the Commission')):
                j -= 1
            if j > 2:
                i = j
            seq = True
            key = 'old'
        elif unws(block[i]) == 'Text proposed by the Commission' or block[i].strip() in types:
            seq = True
            key = 'old'
        # throw headers
        del block[i]
        while i < len(block) and not unws(block[i]):
            del block[i]  # skip blank lines
        # reference column for splitting two-column lines: half of the
        # longest remaining line
        mid = max([len(x) for x in block]) // 2
        while i < len(block):
            line = block[i]
            if seq:
                # old text first, then new text after an "Amendment" heading
                if unws(line) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key = 'new'
                else:
                    am.setdefault(key, []).append(line)
                del block[i]
                continue
            # only new, old is empty
            if line.startswith(' '):
                am.setdefault('new', []).append(unws(line))
                del block[i]
                continue
            newstart = line.rstrip().rfind(' ')
            # only old, new is empty
            if newstart < 6:
                am.setdefault('old', []).append(unws(line))
                del block[i]
                continue
            lsep = line.rfind(' ', 0, mid)
            rsep = line.find(' ', mid)
            # find/rfind return -1 when nothing matches; such a result must
            # not compete as a split position (it used to be able to win and
            # cut off only the last character of the line)
            found = [pos for pos in (rsep, lsep) if pos != -1]
            # nearest separator to mid; rsep wins ties because it is listed first
            sep = min(found, key=lambda pos: abs(pos - mid)) if found else None
            if sep is not None and abs(sep - mid) >= 15:
                sep = None
            if sep:
                am.setdefault('old', []).append(unws(line[:sep]))
                am.setdefault('new', []).append(unws(line[sep:]))
            else:
                # no sane split found, fallback to naive splitting
                am.setdefault('old', []).append(unws(line[:newstart]))
                am.setdefault('new', []).append(unws(line[newstart:]))
            del block[i]
        strip(block)
    else:
        if 'Does not affect English version.' not in block[i:]:
            log(2, "no table\n%s" % ('\n'.join(block[i:])))
            return None

    # find end of authors
    i = 0
    while (i < len(block)
           and unws(block[i])
           and not unws(block[i]).lower().startswith('compromise')
           and not istype(block[i])
           and not unws(block[i]).split()[0] in locstarts):
        i += 1
    if i < len(block):
        if i > 0:
            names = ' '.join(block[:i])
            am['authors'] = names
            # convert to pt mep _ids
            add_meps(filter(None, splitNames(names)))
            del block[:i]
            strip(block)
        elif rapporteur:
            # no author lines: fall back to the rapporteur(s)
            am['authors'] = rapporteur
            if isinstance(rapporteur, list):
                add_meps(rapporteur)
            else:
                add_meps(filter(None, splitNames(rapporteur)))
        else:
            log(3, "no authors in Amendment %s %s" % (am['seq'], url))
    else:
        log(2, "no boundaries in Amendment %s %s\n%s" % (am['seq'], url, '\n'.join(block)))
        am['rest'] = block
        return am

    # handle compromise info
    i = 0
    while (i < len(block)
           and unws(block[i])
           and not istype(block[i])
           and not unws(block[i]).split()[0] in locstarts):
        i += 1
    if i < len(block) and i > 0:
        if [unws(x) for x in block[:i]] != ["Draft proposal for a recommendation"]:
            am['compromise'] = block[:i]
        # the lines are dropped whether or not they were recorded
        del block[:i]
        strip(block)

    # collect locations: each line starting with a word from locstarts ends
    # one (preceding text, location line) pair
    i = 0
    while i < len(block) and unws(block[i]):
        if unws(block[i]).split()[0] in locstarts:
            am.setdefault('location', []).append((' '.join(block[:i]), unws(block[i])))
            del block[:i + 1]
            i = 0
        else:
            i += 1
    # a single leftover line (other than '1') is appended to the last location
    if len(block) > 0 and ((len(block) == 1 or not unws(block[1]))
                           and unws(block[0]) != '1'
                           and 'location' in am):
        am['location'][-1] = (am['location'][-1][0],
                              "%s %s" % (am['location'][-1][1], block[0]))
        del block[0]
        strip(block)

    # ignore obvious footnotes, report anything else that is left over
    if block and not ((len(block) == 3
                       and unws(block[0]) == '1'
                       and not unws(block[1])
                       and block[2].startswith(" "))
                      or (len(block) == 2
                          and unws(block[0]) == '1'
                          and block[1].startswith(" "))):
        log(3, "rest in Amendment %s\n%s" % (am['seq'], '\n'.join(block)))
    return am