def scrape_basic(tree):
    res=form2obj((tree.xpath('//table[@id="technicalInformations"]') or [None])[0],detailsheaders) or {}
    if 'dossier_of_the_committee' in res:
        res['dossier_of_the_committee']=';'.join(sorted((unws(x) for x in res['dossier_of_the_committee'].split(';'))))
    table=(tree.xpath('//table[@id="basic_information"]') or [None])[0]
    if table is None: return res
    res.update({'stage_reached': (table.xpath('.//p[@class="pf_stage"]/text()') or [''])[0].strip(),
                'reference': (table.xpath('.//span[@class="basic_reference"]/text()') or [''])[0].strip(),
                'type': (table.xpath('.//p[@class="basic_procedurefile"]/text()') or [''])[0].strip(),
                'title': (table.xpath('.//p[@class="basic_title"]/text()') or [''])[0].strip(),
                })
    if '' in res: del res['']
    if 'legal_basis' in res:
        res[u'legal_basis']=sorted((unws(x) for x in res['legal_basis'].split(';')))
    fields=table.xpath('.//p[@class="basic_content"]/*')
    firstline=u' '.join((table.xpath('.//p[@class="basic_content"]/text()') or [''])[0].split())
    attrib=u'summary'
    if len(firstline):
        if not attrib in res: res[attrib]=[]
        res[attrib]=[firstline]
    for elem in fields:
        if elem.tag=='br' and elem.tail and elem.tail.strip():
            if not attrib in res: res[attrib]=[]
            res[attrib].append(u' '.join(elem.tail.split()))
        elif elem.tag=='strong':
            if attrib in res and res[attrib]:
                res[attrib].sort()
            attrib=u' '.join(elem.xpath('text()')[0].split())
            attrib=detailsheaders.get(attrib,attrib).lower().replace(u" ",u"_")
            if attrib:
                res[attrib]=[]
    return res

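# Rough usage sketch for scrape_basic() (assumptions: fetch() returns an lxml tree as in the
# other scrapers in this module, and the OEIL procedure URL below is purely illustrative).
def _demo_scrape_basic():
    tree = fetch("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?reference=2012/0011(COD)&l=en")
    # returns a dict with keys such as 'reference', 'title', 'stage_reached', 'summary'
    return scrape_basic(tree)
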
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL,celexid))
    path.reverse()
    (code,lang)=celexid.split(":")[1:3]
    st=6
    if len(code)>6:
        if code[6].isalpha(): st=7
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:st],
                       u'refno': code[st:],
                       u'lang': lang,
                       u'chapter': path,
                       }}
    else:
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:6],
                       u'lang': lang,
                       u'chapter': path,
                       }}
    try:
        eurlex['id'][u'typeDesc']= CELEXCODES[code[0]]['Document Types'][code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except:
        eurlex['id'][u'typeDesc']= u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta']={u'src': "%s%s:NOT" % (EURLEXURL,celexid)}
    root = fetch("%s%s:NOT" % (EURLEXURL,celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]'))>0:
        logger.warn('[!] nothing to scrape here: %s', "%s%s:NOT" % (EURLEXURL,celexid))
        return
    eurlex[u'title'] = root.xpath('//h2[text()="Title and reference"]/following-sibling::p/text()')[0]

    # dates
    dates=root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest=unws(y).split(": ",1)
        item={u'type': title}
        date=rest[:10]
        tail=rest[10:]
        if tail.startswith('; '):
            tail=tail[2:]
        if date=='99/99/9999': item[u'date']= datetime(9999,12,31)
        elif date=='00/00/0000': item[u'date']= datetime(0001,01,01)
        elif date=='//': continue
        else:
            try: item[u'date']= datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try: item[u'date']= datetime.strptime(date, u"%m/%d/%Y")
                except: pass
        if len(tail):
            item['note']=tail
        try: eurlex['dates'].append(item)
        except: eurlex['dates']=[item]

def getAddress(root): res={} for div in root.xpath('//div[@id="contextzone"]//div[@class="ep_title"]'): # getAddress(map(strip, div.xpath("../..//div[@class='ep_elementcontact']/ul"))) key=unws(''.join(div.xpath('.//text()'))) if key not in ['Bruxelles', 'Strasbourg', 'Postal address', 'Luxembourg']: continue res[key]={} if key in ['Bruxelles', 'Strasbourg', 'Luxembourg']: tmp=div.xpath('../..//li[@class="ep_phone"]/div/text()') if tmp: res[key]['Phone'] = unws(tmp[0]).replace('(0)','') tmp=div.xpath('../..//li[@class="ep_fax"]/div/text()') if tmp: res[key]['Fax'] = unws(tmp[0]).replace('(0)','') tmp=[unws(x) for x in div.xpath('../..//li[@class="ep_address"]/div/text()') if len(unws(x))] if key=='Strasbourg': res[key].update(dict(zip(['Organization','Building', 'Office', 'Street','Zip1', 'Zip2'],tmp))) res[key]['City']=res[key]['Zip2'].split()[1] res[key]['Zip2']=res[key]['Zip2'].split()[0] res[key]['building_code']=buildings[res[key]['Building']] elif key=='Bruxelles': res[key].update(dict(zip(['Organization','Building', 'Office', 'Street','Zip'],tmp))) res[key]['City']=res[key]['Zip'].split()[1] res[key]['Zip']=res[key]['Zip'].split()[0] res[key]['building_code']=buildings[res[key]['Building']] elif key=='Luxembourg': res[key]['Address']=tmp elif key=='Postal address': res[key]=tmp else: logger.error("wtf %s" % key) return res
def getactors(node):
    res={}
    ax=[None,[]]
    for row in node.xpath('.//tr'):
        cells=row.xpath('./td/p')
        if not cells: continue
        # get role Rapporteur|Responsible|Rapporteur for the opinion|Opinions
        role=cells[0].xpath('text()')
        if role and unws(role[0]):
            if ax[0] and ax[1]:
                res[ax[0]]=sorted(ax[1])
            tmp=unws(role[0])[:-1]
            if tmp=="Rapporteur for the opinion":
                tmp="Rapporteur"
            ax=[tmp,[]]
        tmp=unws((cells[1].xpath('text()') or [None])[0])
        if ax[0] in ["Rapporteur", "Rapporteur for the opinion"] and tmp:
            name=' '.join(tmp.split()[:-1])
            item={u'group': tmp.split()[-1][1:-1],
                  u'name': name,
                  u'mepref': getMEPRef(name) }
            if len(cells)>2:
                item[u'docs']=getdoclist(cells[2])
            ax[1].append(item)
            continue
        if ax[0] in ["Opinions", "Responsible"] and tmp:
            tmp1=tmp.split(u' –',1)
            if len(tmp1)==2:
                (comid, rest)=tmp1
            elif len(tmp1)==1:
                skip=False
                for com in tmp.split(' ,'):
                    if com in COMMITTEE_MAP and len(com)==4:
                        ax[1].append({u'comid': com})
                        skip=True
                if skip:
                    continue
            else:
                logger.warn("[!] unknown committee: %s" % tmp)
                raise
            item={u'comid': comid}
            if rest==' Decision: no opinion':
                item[u'response']=u'Decision: no opinion'
            if not rest and len(comid)>4:
                for com in comid.split(', '):
                    ax[1].append({u'comid': com})
                continue
            if len(cells)>2:
                tmp=unws((cells[2].xpath('text()') or [None])[0])
                if tmp:
                    name=' '.join(tmp.split()[:-1])
                    item.update({u'group': tmp.split()[-1][1:-1],
                                 u'name': name,
                                 u'mepref': getMEPRef(name)})
                if len(cells)>3:
                    item[u'docs']=getdoclist(cells[3])
            ax[1].append(item)
    if ax[0] and ax[1]:
        res[ax[0]]=sorted(ax[1])
    return res

def getAddress(root):
    res={}
    for div in root.xpath('../following-sibling::div[@class="boxcontent " or @class="boxcontent nobordertop"]/ul[@class="contact"]'):
        key=unws(''.join(div.xpath('./preceding-sibling::h4/text()')))
        if key not in ['Bruxelles', 'Strasbourg', 'Postal address', 'Luxembourg']:
            continue
        if key=='Bruxelles': key=u'Brussels'
        elif key=='Postal address': key=u'Postal'
        res[key]={}
        if key in ['Brussels', 'Strasbourg', 'Luxembourg']:
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="phone"]/text()')
            if tmp:
                res[key][u'Phone'] = unws(tmp[0]).replace('(0)','')
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="fax"]/text()')
            if tmp:
                res[key][u'Fax'] = unws(tmp[0]).replace('(0)','')
        tmp=[unws(x) for x in div.xpath('./li[@class="address"]//text()') if len(unws(x))]
        if key=='Strasbourg':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip1', u'Zip2'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip2'].split()[1]
            res[key][u'Address']['Zip2']=res[key]['Address']['Zip2'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Brussels':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip'].split()[1]
            res[key][u'Address']['Zip']=res[key]['Address']['Zip'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Luxembourg':
            res[key][u'Address']=tmp
        elif key=='Postal':
            res[key]=tmp
        else:
            logger.error("wtf %s" % key)
    return res

def getInOut(term=current_term, dir="in", res={}):
    # returns dict of new incoming meps. this is being checked when
    # crawling, to set more accurate groups and constituency info
    i = 0
    page = fetch("http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=%s" % dir, ignore=[500])
    last = None
    while True:
        meps = []
        for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]'):
            mepid = int(urljoin(BASE_URL, x.get("href")).split("/")[-2])
            const = {u"country": unws((x.xpath('..//span[@class="ep_country"]/text()') or [""])[0])}
            if dir == "out":
                const["start"], const["end"] = [datetime.strptime(d, "%B %d, %Y")
                                                for d in unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [""])[0]).split(" - ")]
            else:
                const["start"] = datetime.strptime(unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [""])[0]), "%B %d, %Y")
            if not mepid in res:
                res[mepid] = [const]
            else:
                res[mepid].append(const)
            meps.append((mepid, const))
        if meps == last:
            break
        last = meps
        i += 1
        page = fetch("http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=%s&filter=" % (i, term, dir),
                     ignore=[500])
    return res

def toLinks(node): if node is None: return for br in node.xpath("br"): br.text="\n" ret=[] for line in node.xpath(".//text()"): if len(unws(line))<1: continue if line.getparent().tag=='a': ret.append({u'title': unws(line), 'url': unicode(urljoin(BASE_URL,line.getparent().get('href')),'utf8')}) else: ret.append({u'title': unws(line)}) return ret
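# Illustrative check of toLinks() on a hand-built fragment (assumption: BASE_URL and unws are
# the module-level helpers used above); the HTML snippet is made up for the example.
def _demo_toLinks():
    from lxml.html import fromstring
    node = fromstring('<p>Opinion<br/><a href="/doc/1">Draft report</a></p>')
    # -> [{'title': u'Opinion'}, {'title': u'Draft report', 'url': <BASE_URL>/doc/1}]
    return toLinks(node)
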
def toLinks(node): if node is None: return for br in node.xpath("br"): br.text = "\n" ret = [] for line in node.xpath(".//text()"): if len(unws(line)) < 1: continue if line.getparent().tag == "a": ret.append({u"title": unws(line), "url": unicode(urljoin(BASE_URL, line.getparent().get("href")), "utf8")}) else: ret.append({u"title": unws(line)}) return ret
def getComAgendas():
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    postdata="docType=AGEN&leg=8&miType=text&tabActif=tabResult#sidesForm"
    #nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        root=fetch(url, params=postdata)
        prev=[]
        while True:
            logger.info("%s %s" % (datetime.now().isoformat(), url))
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp or prev==tmp: break
            prev=tmp
            for u,title in tmp:
                if title.startswith('DRAFT AGENDA'):
                    yield (u,com)
            i+=1
            url=nexttpl % (com,i)
            root=fetch(url)

def istype(text):
    # get type
    found = False
    for t in types:
        if unws(text).lower().startswith(t.lower()):
            found = True
            break
    return found

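# Tiny illustration of istype() (assumption: `types` is the module-level list of amendment
# location keywords such as u'Article' or u'Recital' that parse_block() relies on below).
def _demo_istype():
    # expected: (True, False) when u'Article' is in types
    return istype(u'Article 5 – paragraph 1'), istype(u'Justification')
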
def getIncomming(term=7):
    # returns dict of new incoming meps. this is being checked when
    # crawling, to set more accurate groups and constituency info
    i=0
    page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=in')
    last=None
    res={}
    while True:
        meps=[((u'name', unws(x.xpath('text()')[0])),
               (u'meta', {u'url': urljoin(urljoin(BASE_URL,x.get('href')),'get.html')}),
               (u'Constituencies', {u'start': datetime.strptime(unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0]), "%B %d, %Y"),
                                    u'country': unws((x.xpath('..//span[@class="ep_country"]/text()') or [''])[0])}),
               (u'Groups', {u'start': datetime.strptime(unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0]), "%B %d, %Y"),
                            u'group': unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0]),
                            u'groupid': group_map[unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0])],
                            u'role': unws((x.xpath('..//span[@class="ep_group"]/span[@class="ep_title"]/text()') or [''])[0])}),
               )
              for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        last=meps
        for mep in meps:
            res[int(mep[1][1]['url'].split('/')[-2])]=dict(mep[1:])
        i+=1
        page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=in&filter=' % (i, term))
    return res

def getdoclist(node):
    txt=[x for x in node.xpath('.//text()') if unws(x)]
    i=0
    res=[]
    while i+1 < len(txt):
        if unws(txt[i])[-1]==u"\u2013":
            res.append({u'type': unws(txt[i])[:-2],
                        u'title': unws(txt[i+1]),
                        u'url': urljoin(BASE_URL, txt[i+1].getparent().get('href'))})
            i+=2
        elif len(unws(txt[i]).split(u" \u2013 "))>1:
            res.append({u'type': unws(txt[i].split(u" \u2013 ")[0]),
                        u'title': unws(txt[i].split(u" \u2013 ")[1] if len(txt[i].split(u" \u2013 "))>1 else u'')})
            i+=1
        else:
            i+=1
    if i < len(txt) and len(unws(txt[i]).split(u" \u2013 "))>1:
        res.append({u'type': unws(txt[i]).split(u" \u2013 ")[0],
                    u'title': unws(txt[i]).split(u" \u2013 ")[1]})
    return res

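# Hand-rolled fragment showing what getdoclist() extracts (assumption: BASE_URL/unws as above);
# the markup mimics the agenda cells this is normally called on, but is invented for the example.
def _demo_getdoclist():
    from lxml.html import fromstring
    cell = fromstring(u'<p>Amendments \u2013 <a href="/doc/am.pdf">PE123.456v01-00</a></p>')
    # -> [{'type': u'Amendments', 'title': u'PE123.456v01-00', 'url': <BASE_URL>/doc/am.pdf}]
    return getdoclist(cell)
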
def get_meps(term='7'):
    i=0
    page=fetch("http://www.europarl.europa.eu/meps/en/performsearch.html?webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=ALL&bodyValue=&type=&filter=&search=Show+result" % (term))
    last=None
    while True:
        meps=[(x.get('href'), unws(x.xpath('text()')[0]))
              for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        for url,name in meps:
            yield (urljoin(urljoin(BASE_URL,url),'get.html'), name)
        last=meps
        i+=1
        page=fetch("http://www.europarl.europa.eu/meps/en/performsearch.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=ALL&bodyValue=&type=&filter=" % (i, term))

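# Sketch of how the get_meps() generator is typically consumed (assumption: the real crawler
# does something similar elsewhere); it only shows the (url, name) tuples the generator yields.
def _demo_get_meps():
    for url, name in get_meps(term='7'):
        logger.info("queued %s <%s>" % (name, url))
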
def getAddress(root): res = {} for div in root.xpath('//div[@id="contextzone"]//div[@class="ep_title"]'): # getAddress(map(strip, div.xpath("../..//div[@class='ep_elementcontact']/ul"))) key = unws("".join(div.xpath(".//text()"))) if key not in ["Bruxelles", "Strasbourg", "Postal address", "Luxembourg"]: continue if key == "Bruxelles": key = u"Brussels" elif key == "Postal address": key = u"Postal" res[key] = {} if key in ["Brussels", "Strasbourg", "Luxembourg"]: tmp = div.xpath('../..//li[@class="ep_phone"]/div/text()') if tmp: res[key][u"Phone"] = unws(tmp[0]).replace("(0)", "") tmp = div.xpath('../..//li[@class="ep_fax"]/div/text()') if tmp: res[key][u"Fax"] = unws(tmp[0]).replace("(0)", "") tmp = [unws(x) for x in div.xpath('../..//li[@class="ep_address"]/div/text()') if len(unws(x))] if key == "Strasbourg": res[key][u"Address"] = dict( zip([u"Organization", u"Building", u"Office", u"Street", u"Zip1", u"Zip2"], tmp) ) res[key][u"Address"]["City"] = res[key]["Address"]["Zip2"].split()[1] res[key][u"Address"]["Zip2"] = res[key]["Address"]["Zip2"].split()[0] res[key][u"Address"]["building_code"] = buildings[res[key]["Address"]["Building"]] elif key == "Brussels": res[key][u"Address"] = dict(zip([u"Organization", u"Building", u"Office", u"Street", u"Zip"], tmp)) res[key][u"Address"]["City"] = res[key]["Address"]["Zip"].split()[1] res[key][u"Address"]["Zip"] = res[key]["Address"]["Zip"].split()[0] res[key][u"Address"]["building_code"] = buildings[res[key]["Address"]["Building"]] elif key == "Luxembourg": res[key][u"Address"] = tmp elif key == "Postal": res[key] = tmp else: logger.error("wtf %s" % key) return res
def getOutgoing(term=7):
    # returns an iter over ex meps from the current term, these are
    # missing from the get_meps result
    i=0
    page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=out')
    last=None
    while True:
        meps=[((u'url', urljoin(BASE_URL,x.get('href'))),
               (u'name', unws(x.xpath('text()')[0])),
               ('dates', unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0])),
               ('country', unws((x.xpath('../span[@class="ep_country"]/text()') or [''])[0])),
               ('group', unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0])),
               ('role', unws((x.xpath('..//span[@class="ep_group"]/span[@class="ep_title"]/text()') or [''])[0])),
               )
              for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        last=meps
        for mep in meps:
            mep=dict(mep)
            tmp=mep['dates'].split(' - ')
            if tmp:
                mep[u'Constituencies']={u'start': datetime.strptime(tmp[0], "%B %d, %Y"),
                                        u'end': datetime.strptime(tmp[1], "%B %d, %Y"),
                                        u'country': mep['country']}
                mep[u'Groups']={u'start': datetime.strptime(tmp[0], "%B %d, %Y"),
                                u'end': datetime.strptime(tmp[1], "%B %d, %Y"),
                                u'group': mep['group'],
                                u'role': mep['role']}
                del mep['dates']
                del mep['country']
                del mep['group']
                del mep['role']
                yield (urljoin(urljoin(BASE_URL,mep['url']),'get.html'), mep)
        i+=1
        page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=out&filter=' % (i, term))

def getComAgendas():
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root=fetch(url)
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp: break
            for u,_ in tmp:
                yield (u,com)
            i+=10
            url=nexttpl % (com,i)

def splitNames(text):
    text = text.split(' on behalf ',1)[0]
    res=[]
    for delim in (', ', ' and ', ' & ', '; ', ','):
        if not res:
            res=filter(None,[item[:-1] if item[-1] in [',', "'", ';'] else item
                             for item in unws(text).split(delim) if item])
            continue
        res=filter(None,[item[:-1] if item[-1] in [',', "'", ';'] else item
                         for elem in res
                         for item in elem.split(delim) if item])
    # only for devel.
    # for mep in res:
    #     if mep.startswith('on behalf of'): continue
    #     if mep.endswith('Shadow)'):
    #         logger.info('shadow: %s' % mep)
    res=[mep if not mep.endswith('Shadow)') else mep[:mep.rfind(' (')]
         for mep in res
         if not mep.startswith('on behalf of')]
    res=[y for x in res for y in mansplits.get(x,[x])]
    return [mepmaps.get(x,x) for x in res]

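# Example of splitNames() on a typical amendment authors line (assumption: mansplits/mepmaps
# are the module-level correction tables and hold no entry for these names, so they pass through).
def _demo_splitNames():
    # -> [u'Alexandra Thein', u'Evelyn Regner', u'Cecilia Wikström']
    return splitNames(u"Alexandra Thein, Evelyn Regner and Cecilia Wikström on behalf of the ALDE Group")
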
def parse_block(block, url, reference, date, committee, rapporteur):
    am = { u'src': url, u'reference': reference, u'date': date, u'committee': committee }
    #logger.info(block)
    # get title
    try:
        am[u'seq'] = int(unws(block[0]).split()[1])
    except ValueError:
        am[u'seq'] = unws(block[0]).split()[1]
    except IndexError:
        logger.warn("%s wrong seq %s" % (datetime.now().isoformat(), block[0]))
        am[u'seq'] = unws(block[0])
    del block[0]
    strip(block)

    # find and strip justification
    i = len(block) - 1
    while i > 2 and not (unws(block[i]) == "Justification" and block[i].startswith(' ' * 6)):
        i -= 1
    if i > 2:
        if i < len(block) - 1 and (not unws(block[i + 1]) or not block[i + 1].startswith(' ')):
            am['justification'] = '\n'.join(block[i + 2:])
            del block[i:]
            strip(block)
        else:
            logger.warn("%s wrong justification\n%s" % (datetime.now().isoformat(), '\n'.join(block[i:])))

    # get original language
    if 4 < len(unws(block[-1])) <= 6 and unws(block[-1]).startswith('Or.'):
        am['orig_lang'] = unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    i = len(block) - 1
    while (i > 2 and
           not ((block[i].endswith(" Amendment") or
                 block[i].endswith(" PARTICULARS") or
                 block[i].endswith(" Remedy") or
                 block[i].endswith(" Amended text") or
                 block[i].endswith(" Amendement") or
                 block[i].endswith(" Amendments by Parliament") or
                 block[i].endswith(" Proposal for rejection") or
                 block[i].endswith(" Proposal for a rejection") or
                 block[i].endswith(" Does not affect English version") or
                 block[i].endswith(" (Does not affect English version)") or
                 block[i].endswith(" Amendment by Parliament")) and
                len(block[i]) > 33) and
           not (unws(block[i]) == 'Text proposed by the Commission' or unws(block[i]) in types)):
        i -= 1
    if i > 2:
        #if block[i].endswith(" Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq = False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceeding original text
            j = i
            while (j > 2 and not (unws(block[j]) in types or unws(block[j]) == 'Text proposed by the Commission')):
                j -= 1
            if j > 2:
                i = j
            seq = True
            key = 'old'
        elif unws(block[i]) == 'Text proposed by the Commission' or block[i].strip() in types:
            seq = True
            key = 'old'
        # throw headers
        del block[i]
        while i < len(block) and not unws(block[i]):
            del block[i]  # skip blank lines
        mid = max([len(x) for x in block]) / 2
        while i < len(block):
            if seq:
                if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key = 'new'
                    del block[i]
                    continue
                try:
                    am[key].append(block[i])
                except KeyError:
                    am[key] = [block[i]]
                del block[i]
                continue
            # only new, old is empty
            if block[i].startswith(' '):
                try:
                    am['new'].append(unws(block[i]))
                except KeyError:
                    am['new'] = [unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind(' ')
            # only old, new is empty
            if newstart < 6:
                try:
                    am['old'].append(unws(block[i]))
                except KeyError:
                    am['old'] = [unws(block[i])]
                del block[i]
                continue
            #mid=len(block[i])/2
            #mid=40
            lsep = block[i].rfind(' ', 0, mid)
            # todo calculate both, and use the one closer to the center
            rsep = block[i].find(' ', mid)
            sep = None
            if abs(lsep - mid) < abs(rsep - mid):
                if abs(lsep - mid) < 15:
                    sep = lsep
            else:
                if abs(rsep - mid) < 15:
                    sep = rsep
            if sep:
                try:
                    am['old'].append(unws(block[i][:sep]))
                except KeyError:
                    am['old'] = [unws(block[i][:sep])]
                try:
                    am['new'].append(unws(block[i][sep:]))
                except KeyError:
                    am['new'] = [unws(block[i][sep:])]
            else:
                # no sane split found
                #logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try:
                    am['old'].append(unws(block[i][:newstart]))
                except KeyError:
                    am['old'] = [unws(block[i][:newstart])]
                try:
                    am['new'].append(unws(block[i][newstart:]))
                except KeyError:
                    am['new'] = [unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        logger.warn("%s no table\n%s" % (datetime.now().isoformat(), '\n'.join(block[i:])))
        am['content'] = block[i:]
        return am

    i = 0
    # find end of authors
    while (i < len(block) and
           unws(block[i]) and
           not unws(block[i]).lower().startswith('compromise') and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts):
        i += 1
    if i < len(block):
        if i > 0:
            names = ' '.join(block[:i])
            am['authors'] = names
            #logger.info("names \n%s" % names)
            # convert to pt mep _ids
            for text in filter(None, splitNames(names)):
                mep = getMep(text, None, False)
                if mep:
                    try:
                        am['meps'].append(mep['UserID'])
                    except KeyError:
                        am['meps'] = [mep['UserID']]
                else:
                    logger.info("fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am['authors'] = rapporteur
            for text in filter(None, splitNames(rapporteur)):
                mep = getMep(text, None, False)
                if mep:
                    try:
                        am['meps'].append(mep['UserID'])
                    except KeyError:
                        am['meps'] = [mep['UserID']]
                else:
                    logger.info("fix %s" % text)
        else:
            logger.info("%s no authors in Amendment %s" % (datetime.now().isoformat(), am['seq']))
    else:
        logger.warn("%s no boundaries in Amendment %s\n%s" % (datetime.now().isoformat(), am['seq'], '\n'.join(block)))
        am['rest'] = block
        return am

    # handle compromise info
    i = 0
    while (i < len(block) and
           unws(block[i]) and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts):
        i += 1
    if i < len(block) and i > 0:
        am['compromise'] = block[:i]
        del block[:i]
        strip(block)

    i = 0
    while (i < len(block) and unws(block[i])):
        if unws(block[i]).split()[0] in locstarts:
            try:
                am['location'].append((' '.join(block[:i]), unws(block[i])))
            except KeyError:
                am['location'] = [(' '.join(block[:i]), unws(block[i]))]
            del block[:i + 1]
            i = 0
        else:
            i += 1
    if len(block) > 0 and ((len(block) == 1 or not unws(block[1])) and unws(block[0]) != '1' and 'location' in am):
        am['location'][-1] = (am['location'][-1][0], "%s %s" % (am['location'][-1][1], block[0]))
        del block[0]
        strip(block)

    if block:
        if not ((len(block) == 3 and unws(block[0]) == '1' and not unws(block[1]) and block[2].startswith(" ")) or
                (len(block) == 2 and unws(block[0]) == '1' and block[1].startswith(" "))):
            # ignore obvious footnotes
            logger.info("rest in Amendment %s\n%s" % (am['seq'], '\n'.join(block)))
    return am

def scrape_epagents(table):
    heading=''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible=None
    if heading in ["Committee responsible", "Former committee responsible"]:
        responsible=True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible=False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems=table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips=[t.xpath('text()')[0]
          if len(t.xpath('text()'))>0
          else groupurlmap[t.xpath("a")[0].get('href')]
               if len(t.xpath("a"))>0
               else groupurlmap[t.xpath("img")[0].get('src')]
          for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
    shadows={}
    for shadow, group in izip_longest(shadowelems, tips):
        committee=shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee]=[]
        if group=='NI': group=u'NI'
        mep={u'name': unicode(shadow.xpath('text()')[0]),
             u'group': unicode(group)}
        tmp=getMEPRef(shadow.xpath('text()')[0])
        if tmp:
            mep[u'mepref']=tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the unnecessary shadow elements - so the following regular lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent=todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents=[]
    for agent in lst2obj(table, epagents, 1):
        agent[u'responsible']=responsible
        agent[u'body']=u'EP'
        if agent.get('rapporteur'):
            meps=[]
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion']=None
                    continue
                tmp=getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp,
                                 u'group': mep['group'],
                                 u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'],
                                 u'name': mep['name']})
            agent[u'rapporteur']=meps

        abbr=agent['committee'][:4]
        if abbr=='BUDE': abbr='BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] unknown committee abbrev %s" % abbr)
            agent[u'committee_full']=agent['committee']
            if agent['committee'][4]==' ' and abbr.isalpha():
                agent[u'committee']=abbr
        else:
            agent[u'committee_full']=agent['committee'][5:]
            agent[u'committee']=abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows']=shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents

def scrape(url, rapporteur=None):
    if (url in ['http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-483.680%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.387%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-456.679%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-494.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.705%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.767%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.385%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-465.012%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-496.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.724%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.721%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.723%2b03%2bDOC%2bPDF%2bV0%2f%2fEN']
        or not url.endswith('EN')):
        logger.info("skipping unparsable url")
        return []
    prolog=True
    res=[]
    block=None
    reference=None
    date=None
    committee=[]
    text=getraw(url).split('\n')
    for line in text:
        if prolog:
            if amstart.match(line):
                if reference==None:
                    logger.warn("%s [!] couldn't find ref: %s" %
                                (datetime.now().isoformat(),
                                 unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    db.ep_ams.save({'src': url, 'error': "couldn't find reference in source pdf"})
                    return []
                if date==None or committee==[]:
                    return []
                    #raise ValueError
                block=[line]
                prolog=False
            continue
        line=unws(line)
        if not line: continue
        if line in COMMITTEE_MAP:
            committee.append(COMMITTEE_MAP[line])
            continue
        if (committee and not reference and re.match(refre, line)):
            reference=line
            if url == 'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN':
                logger.info("adjusting reference to eudatap")
                reference="2012/0011(COD)"
            continue
        if (reference and not date):
            try:
                date = parse(unws(line), dayfirst=True)
            except ValueError:
                pass
            continue
        if amstart.match(line):
            # parse block
            res.append(parse_block(block, url, reference, date, committee, rapporteur))
            block=[line]
            continue
        block.append(line)
    if block and filter(None, block):
        res.append(parse_block(block, url, reference, date, committee, rapporteur))
    return res

def scrape(url, comid):
    root=fetch(url)
    lines=[x for x in root.xpath('//td[@class="contents"]/div/*') if unws(' '.join(x.xpath('.//text()')))]
    if not len(lines): return
    if not unws(' '.join(lines[2].xpath('.//text()')))=='DRAFT AGENDA':
        logger.error("NOT DRAFT AGENDA %s" % unws(' '.join(lines[2].xpath('.//text()'))))
    agenda={u'committee': comid,
            u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
            u'src': url,
            }
    i=1
    if unws(' '.join(lines[3].xpath('.//text()')))=="INTERPARLIAMENTARY COMMITTEE MEETING":
        logger.warn("skipping interparl com meet")
        return
    if unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
                       u'type': unws(' '.join(lines[3].xpath('.//text()'))),
                       u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
                       u'city': unws(' '.join(lines[5].xpath('.//text()'))),
                       u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
                       })
        i=7
    itemcnt=0
    item={}
    schedule=None
    res=[]
    while i < len(lines):
        line=lines[i]
        i+=1
        txt=unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp=toTime(txt)
        if tmp:
            schedule=tmp
            if i<len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera']=True
                i+=1
            continue

        if line.tag=='div':
            item[u'actors']=getactors(line)
            continue
        firsttoken=txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1]=='.' and firsttoken[:-1].isdigit() and itemcnt+1==int(firsttoken[:-1]):
            if item: res.append(item)
            itemcnt+=1
            item=copy.deepcopy(agenda)
            item.update({u'title': ' '.join(txt.split()[1:]),
                         u'seq_no': itemcnt,})
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken==u"·":
            if not 'list' in item: item[u'list']=[]
            tmp=' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d.%m.%Y at %H.%M")
                    except:
                        logger.warn('[$] unknown tabling deadline format %s' % unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt)==12:
            item[u'comdossier']=txt
            continue
        # ***I 2011/0373(COD) COM(2011)0793 – C7-0454/2011
        tmp=getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        logger.debug("(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    return res

def parseMember(userid): url = "http://www.europarl.europa.eu/meps/en/%s/get.html" % userid logger.info("scraping %s" % url) root = fetch(url, ignore=[500]) data = {u"active": False, "meta": {u"url": url}} # return {'active': False} mepdiv = root.xpath('//div[@class="ep_elementpeople2"]') if len(mepdiv) == 1: mepdiv = mepdiv[0] else: logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv))) data[u"Name"] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0])) data[u"Photo"] = unicode(urljoin(BASE_URL, mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get("src")), "utf8") borntxt = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()') if len(borntxt) > 0: (d, p) = borntxt[0].split(",", 1) try: data[u"Birth"] = {u"date": datetime.strptime(unws(d), u"Born on %d %B %Y"), u"place": unws(p)} except ValueError: logger.warn("[!] failed to scrape birth data %s" % url) logger.warn(traceback.format_exc()) else: logger.warn("[!] no birth data %s" % url) const = {u"country": unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0]), u"start": datetime(2009, 7, 14)} data[u"Constituencies"] = [const] try: data[u"party"] = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1]) except IndexError: pass else: group = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0]) try: role = unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]) except IndexError: role = u"Member" data[u"Groups"] = [{u"role": role, u"Organization": group, u"groupid": group_map[group]}] cdiv = root.xpath('//div[@class="ep_elementcontact"]') if len(cdiv): addif( data, u"RSS", [unicode(urljoin(BASE_URL, x.get("href")), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')], ) addif( data, u"Homepage", [unicode(x.get("href"), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')] ) addif( data, u"Mail", [decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))], ) for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'): title = unws("".join(span.xpath(".//text()"))) if title in ["Accredited assistants", "Local assistants"]: if not "assistants" in data: data["assistants"] = {} addif( data["assistants"], title.lower().split()[0], [unws(x) for x in span.xpath("../../..//li/div/text()")] ) addif(data, u"Addresses", getAddress(root)) for div in root.xpath('//div[@class="ep_content"]'): key = unws(u"".join(div.xpath('.//span[@class="ep_title"]/text()'))) if not len(key): continue elif key.lower() == "curriculum vitae": data[u"CV"] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')] elif key in ["Member", "Substitute", "Chair", "Vice-Chair", "Co-President", "President", "Vice-President"]: for span in div.xpath('.//span[@class="commission_label"]'): item = {u"role": key, u"abbr": unws("".join(span.xpath(".//text()"))), u"Organization": unws(span.tail)} for start, field in orgmaps: if item["abbr"] in COMMITTEE_MAP or item["Organization"].startswith(start): if not field in data: data[field] = [] if field == "Committees" and item["Organization"] in COMMITTEE_MAP: item[u"committee_id"] = COMMITTEE_MAP[item["Organization"]] data[field].append(item) break else: logger.error("[!] unknown field %s" % key) return data
            res[key][u'Address']=tmp
        elif key=='Postal':
            res[key]=tmp
        else:
            logger.error("wtf %s" % key)
    return res

def getMEPGender(id):
    try:
        mepraw=fetch("http://www.europarl.europa.eu/meps/fr/%s/_home.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
    borntxt=mepraw.xpath('//div[@class="zone_info_mep_transparent_mep_details"]//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith(u'décédé'):
            hint=borntxt[-2].replace(u"\u00A0",' ').split()[0]
        else:
            hint=borntxt[-1].replace(u"\u00A0",' ').split()[0]
        if hint==u"Née":
            return "F"
        elif hint==u"Né":
            return "M"
    logger.warn('[!] no birth/gender data http://www.europarl.europa.eu/meps/fr/%s/get.html' % id)
    return 'n/a'

def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)

def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url)
    data = {u'active': True, 'meta': {u'url': url}} # return {'active': False}
    mepdiv=root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL,mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = { u'date': datetime.strptime(unws(d), "Born on %d %B %Y"),
                           u'place': unws(p) }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const={u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0])}
    data[u'Constituencies']=[const]
    try:
        # note: the original had a stray trailing comma here, which stored a 1-tuple
        const[u'party']=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active']=False
    else:
        group=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        data[u'Groups'] = [{ u'role': unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]),
                             u'group': group,
                             u'groupid': group_map[group]}]
    cdiv=root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                           for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data,u'Homepage',[unicode(x.get('href'),'utf8')
                                for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data,u'Mail',[decodemail(unws(x))
                            for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()')
                            if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title=unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            addif(data,title,[unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data,u'Addresses',getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key=unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower()=='curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item={u'role': key,
                      u'abbr': unws(''.join(span.xpath('text()'))),
                      u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data

def parseMember(userid):
    url = 'http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {u'active': False,
            u'Photo': unicode(urljoin(BASE_URL, "/mepphoto/%s.jpg" % userid)),
            u'meta': {u'url': url}}
    mepdiv = root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt = mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt) > 0:
        if unws(borntxt[-1]).startswith('died on '):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"died on %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape birth data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp) == 2:
            (d, p) = tmp
        else:
            d, p = tmp[0], None
        try:
            data[u'Birth'] = {u'date': datetime.strptime(unws(d), u"Born on %d %B %Y")}
        except ValueError:
            logger.warn('[!] failed to scrape birth data %s' % url)
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data, u'RSS',
          [unicode(urljoin(BASE_URL, x.get('href')), 'utf8')
           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data, u'Homepage',
          [x.get('href') for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data, u'Twitter',
          [x.get('href') for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data, u'Facebook',
          [x.get('href') for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data, u'Mail',
          [x.get('href')[7:].replace('[dot]', '.').replace('[at]', '@')[::-1]
           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])

    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title = unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data:
                data['assistants'] = {}
            addif(data['assistants'], title.lower().split()[0],
                  [unws(x) for x in span.xpath('../following-sibling::div[@class="boxcontent"][1]//li/text()')])
        elif title == "Contacts":
            addif(data, u'Addresses', getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key = unws(''.join(section.xpath('.//text()')))
        if key == "National parties":
            # constituencies
            key = 'Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line = unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ', 1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data:
                    data[key] = []
                if len(tmp) == 2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart + 2:-1] in SEIRTNUOC:
                    country = party[cstart + 2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart + 2:-1])
                    country = 'unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({u'party': party,
                                  u'country': country,
                                  u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                                  u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                                  })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line = unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ', 1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp) == 2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item = {u'role': key,
                        u'abbr': COMMITTEE_MAP.get(org),
                        u'Organization': org,
                        u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                        u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                        }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data:
                            data[field] = []
                        if field == 'Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id'] = COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line = unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ', 1)
                tmp = org.split(u' - ')
                if len(tmp) > 1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org = org[:-2]
                    role = ''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp) == 2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data:
                    data[u'Groups'] = []
                data[u'Groups'].append({u'role': role,
                                        u'Organization': org,
                                        u'country': COUNTRIES.get(unws(constlm.get('class')).upper(),
                                                                  'unknown country: %s' % unws(constlm.get('class'))),
                                        u'groupid': group_map[org],
                                        u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                                        u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                                        })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data:
            continue
        data[fld] = sorted(data[fld],
                           key=lambda x: x.get('end', x['start']),
                           reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl = 'http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV'] = [unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]
    return data

def parseMember(userid):
    url = 'http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {u'active': True, 'meta': {u'url': url}}  # return {'active': False}
    mepdiv = root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL, mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')), 'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = {u'date': datetime.strptime(unws(d), u"Born on %d %B %Y"),
                          u'place': unws(p)}
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const = {u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0]),
             u'start': datetime(2009, 7, 14)}
    data[u'Constituencies'] = [const]
    try:
        const[u'party'] = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active'] = False
    else:
        group = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        try:
            role = unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1])
        except IndexError:
            role = u"Member"
        data[u'Groups'] = [{u'role': role,
                            u'Organization': group,
                            u'groupid': group_map[group]}]
    cdiv = root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data, u'RSS',
              [unicode(urljoin(BASE_URL, x.get('href')), 'utf8') for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data, u'Homepage',
              [unicode(x.get('href'), 'utf8') for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data, u'Mail',
              [decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title = unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data:
                data['assistants'] = {}
            addif(data['assistants'], title.lower().split()[0],
                  [unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data, u'Addresses', getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key = unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower() == 'curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item = {u'role': key,
                        u'abbr': unws(''.join(span.xpath('.//text()'))),
                        u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data:
                            data[field] = []
                        if field == 'Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id'] = COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data

def splitNames(text): text = text.split(" on behalf ", 1)[0] res = [] for delim in (", ", " and ", " & ", "; ", ","): if not res: res = filter( None, [item[:-1] if item[-1] in [",", "'", ";"] else item for item in unws(text).split(delim) if item] ) continue res = filter( None, [item[:-1] if item[-1] in [",", "'", ";"] else item for elem in res for item in elem.split(delim) if item], ) # only for devel. # for mep in res: # if mep.startswith('on behalf of'): continue # if mep.endswith('Shadow)'): # logger.info('shadow: %s' % mep) res = [ mep if not mep.endswith("Shadow)") else mep[: mep.rfind(" (")] for mep in res if not mep.startswith("on behalf of") ] res = [y for x in res for y in mansplits.get(x, [x])] return [mepmaps.get(x, x) for x in res]
def scrape(celexid, path): logger.info("scraping %s%s:NOT" % (EURLEXURL, celexid)) path.reverse() (code, lang) = celexid.split(":")[1:3] st = 6 if len(code) > 6: if code[6].isalpha(): st = 7 eurlex = { 'id': { u'celexid': celexid, u'sector': code[0], u'year': code[1:5], u'doctype': code[5:st], u'refno': code[st:], u'lang': lang, u'chapter': path, } } else: eurlex = { 'id': { u'celexid': celexid, u'sector': code[0], u'year': code[1:5], u'doctype': code[5:6], u'lang': lang, u'chapter': path, } } try: eurlex['id'][u'typeDesc'] = CELEXCODES[code[0]]['Document Types'][ code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector'] except: eurlex['id'][u'typeDesc'] = u"Unknown" logger.warn("[!] unknown typedesc %s" % celexid) eurlex['meta'] = {u'src': "%s%s:NOT" % (EURLEXURL, celexid)} root = fetch("%s%s:NOT" % (EURLEXURL, celexid)) if len(root.xpath('//h1[text()="No documents matching criteria."]')) > 0: logger.warn('[!] nothing to scrape here: %s', "%s%s:NOT" % (EURLEXURL, celexid)) return eurlex[u'title'] = root.xpath( '//h2[text()="Title and reference"]/following-sibling::p/text()')[0] # dates dates = root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()') for y in dates: if not unws(y): continue title, rest = unws(y).split(": ", 1) item = {u'type': title} date = rest[:10] tail = rest[10:] if tail.startswith('; '): tail = tail[2:] if date == '99/99/9999': item[u'date'] = datetime(9999, 12, 31) elif date == '00/00/0000': item[u'date'] = datetime(0001, 01, 01) elif date == '//': continue else: try: item[u'date'] = datetime.strptime(date, u"%d/%m/%Y") except ValueError: try: item[u'date'] = datetime.strptime(date, u"%m/%d/%Y") except: pass if len(tail): item['note'] = tail try: eurlex['dates'].append(item) except: eurlex['dates'] = [item]
try: item[u'date']= datetime.strptime(date, u"%m/%d/%Y") except: pass if len(tail): item['note']=tail try: eurlex['dates'].append(item) except: eurlex['dates']=[item] for t,l in GENERIC_FIELDS: try: s=root.xpath('//h2[text()="%s"]/following-sibling::ul' % t)[0] except: continue if not len(s): continue tmp=dict([(field, [unws(x) if x.getparent().tag!='a' else {u'text': unws(x), u'url': x.getparent().get('href')} for x in s.xpath('./li/strong[text()="%s"]/..//text()' % field) if unws(x) and unws(x)!='/'][1:]) for field in l]) # merge multi-text items into one dict for k in ['Amended by:', "Legal basis:", 'Amendment to:']: tmp1={} for v in tmp.get(k,[]): if type(v)==type(dict()): if not v['url'] in tmp1: tmp1[v['url']]={u'url': v['url'], u'text': [v['text']]} elif not v['text'] in tmp1[v['url']]['text']: tmp1[v['url']]['text'].append(v['text']) if tmp1:
def parse_block(block, url, reference, date, committee, rapporteur): am = {u"src": url, u"reference": reference, u"date": date, u"committee": committee} # logger.info(block) # get title try: am[u"seq"] = int(unws(block[0]).split()[1]) except ValueError: am[u"seq"] = unws(block[0]).split()[1] except IndexError: logger.warn("%s wrong seq %s" % (datetime.now().isoformat(), block[0])) am[u"seq"] = unws(block[0]) del block[0] strip(block) # find and strip justification i = len(block) - 1 while i > 2 and not (unws(block[i]) == "Justification" and block[i].startswith(" " * 6)): i -= 1 if i > 2: if i < len(block) - 1 and (not unws(block[i + 1]) or not block[i + 1].startswith(" ")): am["justification"] = "\n".join(block[i + 2 :]) del block[i:] strip(block) else: logger.warn("%s wrong justification\n%s" % (datetime.now().isoformat(), "\n".join(block[i:]))) # get original language if 4 < len(unws(block[-1])) <= 6 and unws(block[-1]).startswith("Or."): am["orig_lang"] = unws(block[-1])[4:] del block[-1] strip(block) # find split column new/old heading i = len(block) - 1 while ( i > 2 and not ( ( block[i].endswith(" Amendment") or block[i].endswith(" PARTICULARS") or block[i].endswith(" Remedy") or block[i].endswith(" Amended text") or block[i].endswith(" Amendement") or block[i].endswith(" Amendments by Parliament") or block[i].endswith(" Proposal for rejection") or block[i].endswith(" Proposal for a rejection") or block[i].endswith(" Does not affect English version") or block[i].endswith(" (Does not affect English version)") or block[i].endswith(" Amendment by Parliament") ) and len(block[i]) > 33 ) and not (unws(block[i]) == "Text proposed by the Commission" or unws(block[i]) in types) ): i -= 1 if i > 2: # if block[i].endswith(" Proposal for rejection"): # pass # location will be possibly '-' seq = False if unws(block[i]) in ["Amendment", "Amendment by Parliament"]: # sequential format, search for preceeding original text j = i while j > 2 and not (unws(block[j]) in types or unws(block[j]) == "Text proposed by the Commission"): j -= 1 if j > 2: i = j seq = True key = "old" elif unws(block[i]) == "Text proposed by the Commission" or block[i].strip() in types: seq = True key = "old" # throw headers del block[i] while i < len(block) and not unws(block[i]): del block[i] # skip blank lines mid = max([len(x) for x in block]) / 2 while i < len(block): if seq: if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]: key = "new" del block[i] continue try: am[key].append(block[i]) except KeyError: am[key] = [block[i]] del block[i] continue # only new, old is empty if block[i].startswith(" "): try: am["new"].append(unws(block[i])) except KeyError: am["new"] = [unws(block[i])] del block[i] continue newstart = block[i].rstrip().rfind(" ") # only old, new is empty if newstart < 6: try: am["old"].append(unws(block[i])) except KeyError: am["old"] = [unws(block[i])] del block[i] continue # mid=len(block[i])/2 # mid=40 lsep = block[i].rfind(" ", 0, mid) # todo calculate both, and use the one closer to the center rsep = block[i].find(" ", mid) sep = None if abs(lsep - mid) < abs(rsep - mid): if abs(lsep - mid) < 15: sep = lsep else: if abs(rsep - mid) < 15: sep = rsep if sep: try: am["old"].append(unws(block[i][:sep])) except KeyError: am["old"] = [unws(block[i][:sep])] try: am["new"].append(unws(block[i][sep:])) except KeyError: am["new"] = [unws(block[i][sep:])] else: # no sane split found # logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(), # (sep, mid, len(block[i]), newstart, 
block[i]), # block[i][mid-1:mid+2])) # fallback to naive splitting try: am["old"].append(unws(block[i][:newstart])) except KeyError: am["old"] = [unws(block[i][:newstart])] try: am["new"].append(unws(block[i][newstart:])) except KeyError: am["new"] = [unws(block[i][newstart:])] del block[i] strip(block) else: logger.warn("%s no table\n%s" % (datetime.now().isoformat(), "\n".join(block[i:]))) am["content"] = block[i:] return am i = 0 # find end of authors while ( i < len(block) and unws(block[i]) and not unws(block[i]).lower().startswith("compromise") and not istype(block[i]) and not unws(block[i]).split()[0] in locstarts ): i += 1 if i < len(block): if i > 0: names = " ".join(block[:i]) am["authors"] = names # logger.info("names \n%s" % names) # convert to pt mep _ids for text in filter(None, splitNames(names)): mep = getMep(text, None, False) if mep: try: am["meps"].append(mep["UserID"]) except KeyError: am["meps"] = [mep["UserID"]] else: logger.info("fix %s" % text) del block[:i] strip(block) elif rapporteur: am["authors"] = rapporteur for text in filter(None, splitNames(rapporteur)): mep = getMep(text, None, False) if mep: try: am["meps"].append(mep["UserID"]) except KeyError: am["meps"] = [mep["UserID"]] else: logger.info("fix %s" % text) else: logger.info("%s no authors in Amendment %s" % (datetime.now().isoformat(), am["seq"])) else: logger.warn("%s no boundaries in Amendment %s\n%s" % (datetime.now().isoformat(), am["seq"], "\n".join(block))) am["rest"] = block return am # handle compromise info i = 0 while i < len(block) and unws(block[i]) and not istype(block[i]) and not unws(block[i]).split()[0] in locstarts: i += 1 if i < len(block) and i > 0: am["compromise"] = block[:i] del block[:i] strip(block) i = 0 while i < len(block) and unws(block[i]): if unws(block[i]).split()[0] in locstarts: try: am["location"].append((" ".join(block[:i]), unws(block[i]))) except KeyError: am["location"] = [(" ".join(block[:i]), unws(block[i]))] del block[: i + 1] i = 0 else: i += 1 if len(block) > 0 and ((len(block) == 1 or not unws(block[1])) and unws(block[0]) != "1" and "location" in am): am["location"][-1] = (am["location"][-1][0], "%s %s" % (am["location"][-1][1], block[0])) del block[0] strip(block) if block: if not ( (len(block) == 3 and unws(block[0]) == "1" and not unws(block[1]) and block[2].startswith(" ")) or (len(block) == 2 and unws(block[0]) == "1" and block[1].startswith(" ")) ): # ignore obvious footnotes logger.info("rest in Amendment %s\n%s" % (am["seq"], "\n".join(block))) return am
                try:
                    item[u'date'] = datetime.strptime(date, u"%m/%d/%Y")
                except:
                    pass
        if len(tail):
            item['note'] = tail
        try:
            eurlex['dates'][title] = item
        except:
            eurlex['dates'] = {title: item}
    for t, l in GENERIC_FIELDS:
        try:
            s = root.xpath('//h2[text()="%s"]/following-sibling::ul' % t)[0]
        except:
            continue
        if not len(s):
            continue
        tmp = dict([(field,
                     [{u'text': unws(x), u'url': x.getparent().get('href')}
                      for x in s.xpath('./li/strong[text()="%s"]/..//text()' % field)[2:]
                      if unws(x) and unws(x) != '/'])
                    for field in l if field != "Directory code:"])
        # merge multi-text items into one dict
        for k in ['Amended by:', "Legal basis:", 'Amendment to:']:
            tmp1 = {}
            for v in tmp.get(k, []):
                if type(v) == type(dict()):
                    if not v['url'] in tmp1:
                        tmp1[v['url']] = {u'url': v['url'], u'text': [v['text']]}
                    elif not v['text'] in tmp1[v['url']]['text']:
                        tmp1[v['url']]['text'].append(v['text'])
            if tmp1:
def strip(block):
    while len(block) and not unws(block[0]):
        del block[0]
    while len(block) and not unws(block[-1]):
        del block[-1]
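# Example (assuming unws() returns '' for whitespace-only strings, as used above):
#   block = ['', '   ', 'Amendment 1', 'some text', '  ']
#   strip(block)        # trims leading/trailing blank lines in place
#   block == ['Amendment 1', 'some text']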
def scrape(url, rapporteur=None):
    if url in [
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-483.680%2b02%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.387%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-456.679%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-494.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.705%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.767%2b02%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.385%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-465.012%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-496.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.724%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.721%2b02%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.723%2b03%2bDOC%2bPDF%2bV0%2f%2fEN",
    ] or not url.endswith("EN"):
        logger.info("skipping unparsable url")
        return []
    prolog = True
    res = []
    block = None
    reference = None
    date = None
    committee = []
    text = getraw(url).split("\n")
    for line in text:
        if prolog:
            if amstart.match(line):
                if reference == None:
                    logger.warn("%s [!] couldn't find ref: %s" %
                                (datetime.now().isoformat(),
                                 unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    db.ep_ams.save({"src": url, "error": "couldn't find reference in source pdf"})
                    return []
                if date == None or committee == []:
                    return []
                    # raise ValueError
                block = [line]
                prolog = False
                continue

            line = unws(line)
            if not line:
                continue

            if line in COMMITTEE_MAP:
                committee.append(COMMITTEE_MAP[line])
                continue

            if committee and not reference and re.match(refre, line):
                reference = line
                if url == "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN":
                    logger.info("adjusting reference to eudatap")
                    reference = "2012/0011(COD)"
                continue

            if reference and not date:
                try:
                    date = parse(unws(line), dayfirst=True)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue

        if amstart.match(line):
            # parse block
            res.append(parse_block(block, url, reference, date, committee, rapporteur))
            block = [line]
            continue
        block.append(line)
    if block and filter(None, block):
        res.append(parse_block(block, url, reference, date, committee, rapporteur))
    return res
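# Rough flow of scrape() above, for orientation (a summary of the code, not
# additional logic): the raw PDF text is walked line by line in two phases.
# During the prolog the scraper collects the committee (via COMMITTEE_MAP), the
# procedure reference (matched by the refre regex, e.g. a value such as
# "2012/0011(COD)") and the tabling date; the first line matching amstart ends
# the prolog. After that, each further amstart match closes the current block
# and hands it to parse_block(), so every amendment is parsed together with the
# reference, date and committee found earlier.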
def parseMember(userid):
    url = 'http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL, "/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
    }
    mepdiv = root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt = mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt) > 0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape birth data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp) == 2:
            (d, p) = tmp
        else:
            d, p = tmp[0], None
        try:
            data[u'Birth'] = {u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data, u'RSS',
          [unicode(urljoin(BASE_URL, x.get('href')), 'utf8')
           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data, u'Homepage',
          [x.get('href')
           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data, u'Twitter',
          [x.get('href')
           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data, u'Facebook',
          [x.get('href')
           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data, u'Mail',
          [x.get('href')[7:].replace('[dot]', '.').replace('[at]', '@')[::-1]
           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])

    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title = unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data, u'Addresses', getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key = unws(''.join(section.xpath('.//text()')))
        if key == "National parties":
            # constituencies
            key = 'Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line = unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ', 1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data:
                    data[key] = []
                if len(tmp) == 2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart + 2:-1] in SEIRTNUOC:
                    country = party[cstart + 2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart + 2:-1])
                    country = 'unknown'
                # print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party': party,
                    u'country': country,
                    u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line = unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ', 1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp) == 2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item = {u'role': key,
                        u'abbr': COMMITTEE_MAP.get(org),
                        u'Organization': org,
                        u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                        u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                        }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data:
                            data[field] = []
                        if field == 'Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id'] = COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line = unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ', 1)
                tmp = org.split(u' - ')
                if len(tmp) > 1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org = org[:-2]
                    role = ''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp) == 2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data:
                    data[u'Groups'] = []
                data[u'Groups'].append(
                    {u'role': role,
                     u'Organization': org,
                     u'country': COUNTRIES.get(unws(constlm.get('class')).upper(),
                                               'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid': group_map[org],
                     u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data:
            continue
        data[fld] = sorted(data[fld],
                           key=lambda x: x.get('end', x['start']),
                           reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl = 'http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV'] = [unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]

    # get assistants also on a separate page :/
    assurl = 'http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title = unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data:
                data['assistants'] = {}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data:
                data['assistants'] = {}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
    return data
def getactors(node):
    res = {}
    ax = [None, []]
    for row in node.xpath('.//tr'):
        cells = row.xpath('./td/p')
        if not cells:
            continue

        # get role Rapporteur|Responsible|Rapporteur for the opinion|Opinions
        role = cells[0].xpath('text()')
        if role and unws(role[0]):
            if ax[0] and ax[1]:
                res[ax[0]] = sorted(ax[1])
            tmp = unws(role[0])[:-1]
            if tmp == "Rapporteur for the opinion":
                tmp = "Rapporteur"
            ax = [tmp, []]

        tmp = unws((cells[1].xpath('text()') or [None])[0])
        if ax[0] in ["Rapporteur", "Rapporteur for the opinion"] and tmp:
            name = ' '.join(tmp.split()[:-1])
            item = {u'group': tmp.split()[-1][1:-1],
                    u'name': name,
                    u'mepref': getMEPRef(name)}
            if len(cells) > 2:
                item[u'docs'] = getdoclist(cells[2])
            ax[1].append(item)
            continue
        if ax[0] in ["Opinions", "Responsible"] and tmp:
            tmp1 = tmp.split(u' –', 1)
            if len(tmp1) == 2:
                (comid, rest) = tmp1
            elif len(tmp1) == 1:
                skip = False
                for com in tmp.split(' ,'):
                    if com in COMMITTEE_MAP and len(com) == 4:
                        ax[1].append({u'comid': com})
                        skip = True
                if skip:
                    continue
            else:
                logger.warn("[!] unknown committee: %s" % tmp)
                raise
            item = {u'comid': comid}
            if rest == ' Decision: no opinion':
                item[u'response'] = u'Decision: no opinion'
            if not rest and len(comid) > 4:
                for com in comid.split(', '):
                    ax[1].append({u'comid': com})
                continue
            if len(cells) > 2:
                tmp = unws((cells[2].xpath('text()') or [None])[0])
                if tmp:
                    name = ' '.join(tmp.split()[:-1])
                    item.update({u'group': tmp.split()[-1][1:-1],
                                 u'name': name,
                                 u'mepref': getMEPRef(name)})
            if len(cells) > 3:
                item[u'docs'] = getdoclist(cells[3])
            ax[1].append(item)
    if ax[0] and ax[1]:
        res[ax[0]] = sorted(ax[1])
    return res
def scrape(url, comid):
    root = fetch(url)
    lines = [x for x in root.xpath('//td[@class="contents"]/div/*')
             if unws(' '.join(x.xpath('.//text()')))]
    if not len(lines):
        return
    if not unws(' '.join(lines[2].xpath('.//text()'))) == 'DRAFT AGENDA':
        logger.error("NOT DRAFT AGENDA %s" % unws(' '.join(lines[2].xpath('.//text()'))))
    agenda = {
        u'committee': comid,
        u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
        u'src': url,
    }
    i = 1
    if unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({
            u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
            u'type': unws(' '.join(lines[3].xpath('.//text()'))),
            u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
            u'city': unws(' '.join(lines[5].xpath('.//text()'))),
            u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
        })
        i = 7
    itemcnt = 0
    item = {}
    schedule = None
    res = []
    while i < len(lines):
        line = lines[i]
        i += 1
        txt = unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue  # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp = toTime(txt)
        if tmp:
            schedule = tmp
            if i < len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera'] = True
                i += 1
            continue

        if line.tag == 'div':
            item[u'actors'] = getactors(line)
            continue
        firsttoken = txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1] == '.' and firsttoken[:-1].isdigit() and itemcnt + 1 == int(firsttoken[:-1]):
            if item:
                res.append(item)
            itemcnt += 1
            item = copy.deepcopy(agenda)
            item.update({
                u'title': ' '.join(txt.split()[1:]),
                u'seq_no': itemcnt,
            })
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken == u"·":
            if not 'list' in item:
                item[u'list'] = []
            tmp = ' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline'] = datetime.strptime(tmp.split(':')[1].strip(), "%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline'] = datetime.strptime(tmp.split(':')[1].strip(), "%d.%m.%Y at %H.%M")
                    except:
                        logger.warn('[$] unknown tabling deadline format %s' % tmp.split(':')[1].strip())
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt) == 12:
            item[u'comdossier'] = txt
            continue
        # ***I 2011/0373(COD) COM(2011)0793 – C7-0454/2011
        tmp = getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        logger.debug("(fallthrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item:
        res.append(item)
    return res
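# For reference, the two tabling-deadline formats handled above parse like this
# (illustrative values, not taken from a real agenda):
#   datetime.strptime("10 September 2012, 12.00", "%d %B %Y, %H.%M")
#   datetime.strptime("10.09.2012 at 12.00", "%d.%m.%Y at %H.%M")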
def scrape_epagents(table):
    heading = ''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible = None
    if heading in ["Committee responsible", "Former committee responsible"]:
        responsible = True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible = False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems = table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips = [t.xpath('text()')[0]
            if len(t.xpath('text()')) > 0
            else groupurlmap[t.xpath("a")[0].get('href')]
            for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
    shadows = {}
    for shadow, group in izip_longest(shadowelems, tips):
        committee = shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows:
            shadows[committee] = []
        if group == 'NI':
            group = u'NI'
        mep = {u'name': unicode(shadow.xpath('text()')[0]),
               u'group': unicode(group)}
        tmp = getMEPRef(shadow.xpath('text()')[0])
        if tmp:
            mep[u'mepref'] = tmp
        # else:
        #     raise IndexError
        shadows[committee].append(mep)
    # delete the unnecessary shadow elements - so the following regular lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent = todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents = []
    for agent in lst2obj(table, epagents, 1):
        agent[u'responsible'] = responsible
        agent[u'body'] = u'EP'
        if agent.get('rapporteur'):
            meps = []
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion'] = None
                    continue
                tmp = getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp,
                                 u'group': mep['group'],
                                 u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'],
                                 u'name': mep['name']})
            agent[u'rapporteur'] = meps

        abbr = agent['committee'][:4]
        if abbr == 'BUDE':
            abbr = 'BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] unknown committee abbrev %s" % abbr)
            agent[u'committee_full'] = agent['committee']
            if agent['committee'][4] == ' ' and abbr.isalpha():
                agent[u'committee'] = abbr
        else:
            agent[u'committee_full'] = agent['committee'][5:]
            agent[u'committee'] = abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows'] = shadows[agent['committee']]

        if not agent in agents:
            agents.append(agent)
    return agents
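# Example of the committee-name handling above (hypothetical input): an agent
# whose 'committee' field reads "ENVI Environment, Public Health and Food Safety"
# ends up with committee='ENVI' and committee_full='Environment, Public Health
# and Food Safety'; an unknown four-letter prefix keeps the full string in
# committee_full and only falls back to the abbreviation when it is alphabetic
# and followed by a space.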
    return res

def getMEPGender(id):
    try:
        mepraw = fetch("http://www.europarl.europa.eu/meps/fr/%s/_home.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
    borntxt = mepraw.xpath('//div[@class="zone_info_mep_transparent_mep_details"]//span[@class="more_info"]/text()')
    if len(borntxt) > 0:
        if unws(borntxt[-1]).startswith(u'décédé'):
            hint = borntxt[-2].replace(u"\u00A0", ' ').split()[0]
        else:
            hint = borntxt[-1].replace(u"\u00A0", ' ').split()[0]
        if hint == u"Née":
            return "F"
        elif hint == u"Né":
            return "M"
    logger.warn('[!] no birth/gender data http://www.europarl.europa.eu/meps/fr/%s/get.html' % id)
    return 'n/a'

def getMEPDeclarations(id):
    try: