def get_all_dossiers(**kwargs):
    for year in range(datetime.date.today().year, 1971, -1):
        tree = fetch('https://oeil.secure.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/*\(*\)'
                     % (year))
        tmp = tree.xpath('//span[@class="ep_name" and (starts-with(normalize-space(),"Results found :") or starts-with(normalize-space(),"Result found :"))]/text()')
        if not tmp:
            log(1, "no dossiers found for %d" % year)
            raise ValueError("failed to find number of dossiers for year %d" % year)
        tmp = unws(tmp[0])
        count = int(tmp[tmp.index(":") + 1:])
        log(4, "year %d, count %d" % (year, count))
        #tree=fetch('https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/????\(*\)&lang=en&s1&all&limit=%s&lang=en'
        #           % (year, count), prune_xml=True)
        tree = fromstring(fetch_raw('https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/*\(*\)&lang=en&s1&all&limit=%s&lang=en'
                                    % (year, count)).encode("utf8"))
        items = tree.xpath('//item')
        i = 0
        for item in items:
            url = html.unescape(urljoin(BASE_URL, str(item.xpath('./link/text()')[0])))
            ref = unws(item.xpath('./reference/text()')[0])
            if '*' in ref:
                ref = ref[:ref.index('*')]
            log(4, 'adding dossier scraping job %s' % url)
            payload = dict(kwargs)
            payload['url'] = url
            add_job('dossier', payload=payload)
            i += 1
        if i != count:
            log(1, "total %d, expected %d" % (i, count))
def splitNames(text):
    text = text.split(' on behalf ', 1)[0]
    res = []
    for delim in (', ', ' and ', ' & ', '; ', ','):
        if not res:
            res = filter(None, [item[:-1] if item[-1] in [',', "'", ';'] else item
                                for item in unws(text).split(delim)
                                if item])
            continue
        res = filter(None, [item[:-1] if item[-1] in [',', "'", ';'] else item
                            for elem in res
                            for item in elem.split(delim)
                            if item])
    # only for devel.
    # for mep in res:
    #     if mep.startswith('on behalf of'): continue
    #     if mep.endswith('Shadow)'):
    #         logger.info('shadow: %s' % mep)
    res = [mep if not mep.endswith('Shadow)') else mep[:mep.rfind(' (')]
           for mep in res
           if not mep.startswith('on behalf of')]
    res = [unws(y) for x in res for y in mansplits.get(x, [x])]
    return [mepmaps.get(x, x) for x in res]
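# Illustrative usage sketch (not part of the original module): given the
# module-level mansplits/mepmaps tables, splitNames() splits a free-text
# author line into individual names, dropping "on behalf of" tails and
# "(Shadow)" markers. The input below is hypothetical:
#
#   splitNames("Jane Doe, John Smith and Erika Mustermann (Shadow)")
#   # -> ['Jane Doe', 'John Smith', 'Erika Mustermann']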
def getAddress(root):
    res={}
    for div in root.xpath('../following-sibling::div[@class="boxcontent " or @class="boxcontent nobordertop"]/ul[@class="contact"]'):
        key=unws(''.join(div.xpath('./preceding-sibling::h4/text()')))
        if key not in ['Bruxelles', 'Strasbourg', 'Postal address', 'Luxembourg']:
            continue
        if key=='Bruxelles': key=u'Brussels'
        elif key=='Postal address': key=u'Postal'
        res[key]={}
        if key in ['Brussels', 'Strasbourg', 'Luxembourg']:
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="phone"]/text()')
            if tmp:
                res[key][u'Phone'] = unws(tmp[0]).replace('(0)','')
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="fax"]/text()')
            if tmp:
                res[key][u'Fax'] = unws(tmp[0]).replace('(0)','')
        tmp=[unws(x) for x in div.xpath('./li[@class="address"]//text()') if len(unws(x))]
        if key=='Strasbourg':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip1', u'Zip2'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip2'].split()[1]
            res[key][u'Address']['Zip2']=res[key]['Address']['Zip2'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Brussels':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip'].split()[1]
            res[key][u'Address']['Zip']=res[key]['Address']['Zip'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Luxembourg':
            res[key][u'Address']=tmp
        elif key=='Postal':
            res[key]=tmp
        else:
            logger.error("wtf %s" % key)
    return res
def scrape_basic(tree):
    res=form2obj((tree.xpath('//table[@id="technicalInformations"]') or [None])[0],detailsheaders) or {}
    if 'dossier_of_the_committee' in res:
        res['dossier_of_the_committee']=';'.join(sorted((unws(x) for x in res['dossier_of_the_committee'].split(';'))))
    table=(tree.xpath('//table[@id="basic_information"]') or [None])[0]
    if table is None:
        return res
    res.update({'stage_reached': (table.xpath('.//p[@class="pf_stage"]/text()') or [''])[0].strip(),
                'reference': (table.xpath('.//span[@class="basic_reference"]/text()') or [''])[0].strip(),
                'type': (table.xpath('.//p[@class="basic_procedurefile"]/text()') or [''])[0].strip(),
                'title': (table.xpath('.//p[@class="basic_title"]/text()') or [''])[0].strip(),
                })
    if '' in res:
        del res['']
    if 'legal_basis' in res:
        res[u'legal_basis']=sorted((unws(x) for x in res['legal_basis'].split(';')))
    fields=table.xpath('.//p[@class="basic_content"]/*')
    firstline=u' '.join((table.xpath('.//p[@class="basic_content"]/text()') or [''])[0].split())
    attrib=u'summary'
    if len(firstline):
        if not attrib in res:
            res[attrib]=[]
        res[attrib]=[firstline]
    for elem in fields:
        if elem.tag=='br' and elem.tail and elem.tail.strip():
            if not attrib in res:
                res[attrib]=[]
            res[attrib].append(u' '.join(elem.tail.split()))
        elif elem.tag=='strong':
            if attrib in res and res[attrib]:
                res[attrib].sort()
            attrib=u' '.join(elem.xpath('text()')[0].split())
            attrib=detailsheaders.get(attrib,attrib).lower().replace(u" ",u"_")
            if attrib:
                res[attrib]=[]
    return res
def getdoclist(node):
    txt = [x for x in node.xpath('.//text()') if unws(x)]
    i = 0
    res = []
    while i + 1 < len(txt):
        if unws(txt[i])[-1] == u"\u2013":
            res.append({u'type': unws(txt[i])[:-2],
                        u'title': unws(txt[i + 1]),
                        u'url': urljoin(BASE_URL, txt[i + 1].getparent().get('href'))})
            i += 2
        elif len(unws(txt[i]).split(u" \u2013 ")) > 1:
            res.append({u'type': unws(txt[i].split(u" \u2013 ")[0]),
                        u'title': unws(txt[i].split(u" \u2013 ")[1] if len(txt[i].split(u" \u2013 ")) > 1 else u'')})
            i += 1
        else:
            i += 1
    if i < len(txt) and len(unws(txt[i]).split(u" \u2013 ")) > 1:
        res.append({u'type': unws(txt[i]).split(u" \u2013 ")[0],
                    u'title': unws(txt[i]).split(u" \u2013 ")[1]})
    return res
def parse_hist_date(txt):
    tmp = txt.split(' / ')
    if len(tmp) == 2:
        (start, end) = tmp
    elif len(tmp) == 1:
        start = txt.split()[0]
        end = "31-12-9999"
    else:
        raise ValueError
    return datetime.strptime(unws(start), u"%d-%m-%Y"), datetime.strptime(unws(end), u"%d-%m-%Y")
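# Illustrative usage sketch (not part of the original module): parse_hist_date()
# expects "dd-mm-yyyy / dd-mm-yyyy" intervals as scraped from the MEP history
# pages; an open-ended interval yields the 31-12-9999 sentinel:
#
#   parse_hist_date("02-07-2019 / 15-01-2022")
#   # -> (datetime(2019, 7, 2, 0, 0), datetime(2022, 1, 15, 0, 0))
#   parse_hist_date("02-07-2019")
#   # -> (datetime(2019, 7, 2, 0, 0), datetime(9999, 12, 31, 0, 0))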
def parse_addr(root):
    # addresses
    addrs = {}
    for li in root.xpath('//section[@id="contacts"]//div[@class="card-body"]'):
        key = unws(''.join(li.xpath('./div[1]//text()')))
        if key == 'Bruxelles':
            key = 'Brussels'
        addrs[key] = {}
        if key in ['Brussels', 'Strasbourg']:
            phone = li.xpath('.//li/i[@class="erpl_icon erpl_icon-phone"]/../a/@href')
            if phone:
                addrs[key]['Phone'] = phone[0][4:].replace("+33(0)388", "+333 88").replace("+32(0)228", "+322 28")
            fax = li.xpath('.//li/i[@class="erpl_icon erpl_icon-fax"]/../a/@href')
            if fax:
                addrs[key]['Fax'] = fax[0][4:].replace("+33(0)388", "+333 88").replace("+32(0)228", "+322 28")
        #tmp=[unws(x) for x in li.xpath('.//li[1]//text()') if len(unws(x))]
        tmp = [unws(x) for x in li.xpath('.//div[@class="erpl_contact-card-list"]/span/text()') if len(unws(x))]
        if key == 'Strasbourg':
            addrs[key][u'Address'] = dict(zip([u'Organization', u'Building', u'Office', u'Street', u'Zip1', u'Zip2'], tmp))
            addrs[key][u'Address']['City'] = addrs[key]['Address']['Zip2'].split()[1]
            addrs[key][u'Address']['Zip2'] = addrs[key]['Address']['Zip2'].split()[0]
            addrs[key][u'Address']['building_code'] = buildings.get(addrs[key]['Address']['Building'])
        elif key == u'Brussels':
            addrs[key][u'Address'] = dict(zip([u'Organization', u'Building', u'Office', u'Street', u'Zip'], tmp))
            addrs[key][u'Address']['City'] = addrs[key]['Address']['Zip'].split()[1]
            addrs[key][u'Address']['Zip'] = addrs[key]['Address']['Zip'].split()[0]
            addrs[key][u'Address']['building_code'] = buildings.get(addrs[key]['Address']['Building'])
        elif key == 'Luxembourg':
            addrs[key][u'Address'] = tmp
        elif key == 'Postal address':
            addrs['Postal'] = tmp
    return addrs
def toLinks(node):
    if node is None:
        return
    for br in node.xpath("br"):
        br.text="\n"
    ret=[]
    for line in node.xpath(".//text()"):
        if len(unws(line))<1:
            continue
        if line.getparent().tag=='a':
            ret.append({u'title': unws(line),
                        'url': unicode(urljoin(BASE_URL,line.getparent().get('href')),'utf8')})
        else:
            ret.append({u'title': unws(line)})
    return ret
def scan(d, node):
    """ helper for dump_schema"""
    if not 'types' in node:
        node['types'] = {}
    if isinstance(d, dict):
        for k, v in d.items():
            if not 'items' in node:
                node['items'] = {}
            if not k in node['items']:
                node['items'][k] = {'name': k}
            node['items'][k] = scan(v, node['items'][k])
    elif isinstance(d, list):
        if not 'elems' in node:
            node['elems'] = {}
        for v in d:
            stype = type(v)
            node['elems'][stype] = scan(v, node['elems'].get(stype, {}))
    if isinstance(d, str):
        d = unws(d) or None
    mtype = type(d)
    tmp = node['types'].get(mtype, {'count': 0, 'example': None})
    tmp['count'] += 1
    if d and not tmp['example'] and not isinstance(d, dict):
        tmp['example'] = d
    node['types'][mtype] = tmp
    return node
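# Illustrative usage sketch (not part of the original module): scan() folds a
# sample document into a type/count schema tree; roughly:
#
#   scan({'title': 'foo', 'tags': ['a', 'b']}, {})
#   # -> {'types': {dict: {'count': 1, 'example': None}},
#   #     'items': {'title': {'name': 'title', 'types': {str: {'count': 1, 'example': 'foo'}}},
#   #               'tags': {'name': 'tags', 'types': {list: ...}, 'elems': {str: ...}}}}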
def extract_table(table, url, date=None):
    trs = table.xpath('.//tr')
    header = trs[0]
    tds = header.xpath('.//td')
    if len(tds) < 2:
        log(1, "vote table has less than two columns in the header: %s %s" % (url, tostring(trs[0])))
        raise ValueError
    type = junws(tds[1])
    if type not in {"+", "-", "0"}:
        log(1, "vote header type is unexpected value %s in %s" % (repr(type), url))
        raise ValueError
    res = {'total': int(junws(tds[0])), 'type': type, 'meps': []}
    for tr in trs[1:]:
        tds = tr.xpath('.//td')
        if len(tds) < 2:
            log(1, "vote table has less than two columns in the body: %s %s" % (url, tostring(tr)))
            raise ValueError
        #grp = junws(tds[0]).split()
        for meps in tds[1].xpath(".//p"):
            meps = junws(meps)
            if not meps:
                continue
            for m in meps.split(','):
                m = unws(m)
                if not m:
                    continue
                mepid = db.getMep(m, date=date)
                if not mepid:
                    log(2, "could not resolve MEP name: %s" % m)
                res['meps'].append(mepid or m)
    return res
def istype(text):
    # get type
    found=False
    for t in types:
        if unws(text).lower().startswith(t.lower()):
            found=True
            break
    return found
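# Illustrative usage sketch (not part of the original module): istype() checks
# a line against the module-level `types` tuple of known headings, so e.g.
# assuming 'Text proposed by the Commission' is listed in `types`:
#
#   istype("Text proposed by the Commission")  # -> True
#   istype("Justification")                    # -> False (unless listed in types)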
def sidebar_check(root, url):
    sidebar = root.xpath('//div[@id="sectionsNavPositionInitial"]//div[@class="erpl_side-navigation"]/div/ul')
    if len(sidebar) != 1:
        log(1, "sidebar has not 1 element: %s" % url)
        raise ValueError
    for li in sidebar[0].xpath('./li'):
        title = li.xpath('./a/span[@class="t-x"]/text()')
        if len(title) != 1:
            log(1, "title has not 1 element: %s" % url)
            raise ValueError
        title = unws(title[0])
        if title not in known_sidebar:
            log(2, '"%s" not in known_sidebar items, in %s' % (title, url))
        subtitles = li.xpath('.//div/ul/li/a/span[@class="t-x"]/text()')
        for s in subtitles:
            s = unws(s)
            # unknown titles are only logged above, so guard the lookup
            if s not in known_sidebar.get(title, []):
                log(2, '"%s" -> "%s" not in known_sidebar items, in %s' % (title, s, url))
def addchangednames(mep):
    mepid = mep['UserID']
    m = db.get('ep_meps', mepid)
    if not m:
        return mep
    prevnames = [c['data'][0]
                 for changes in m.get('changes', {}).values()
                 for c in changes
                 if c['path'] == ['Name', 'full']]
    aliases = set(mep['Name']['aliases'])
    for name in prevnames:
        aliases |= set(mangleName(name, mepid)['aliases'])
    mep['Name']['aliases'] = sorted([x for x in set(unws(n) for n in aliases) if x])
    return mep
def mangleName(name, id):
    sur = []
    family = []
    tmp = name.split(' ')
    title = None
    for i, token in enumerate(tmp):
        if ((token.isupper() and not isabbr(token))
                or token in ['de', 'van', 'von', 'del']
                or (token == 'in' and tmp[i + 1] == "'t")
                or (token[:2] == 'Mc' and token[2:].isupper())):
            family = tmp[i:]
            break
        else:
            sur.append(token)
    sur = u' '.join(sur)
    family = u' '.join(family)
    for t in TITLES:
        if sur.endswith(t):
            sur = sur[:-len(t)]
            title = t
            break
        if sur.startswith(t):
            sur = sur[len(t) + 1:]
            title = t
            break
    res = {u'full': name, u'sur': sur, u'family': family}
    aliases = set([family, name, u"%s %s" % (sur, family), u"%s %s" % (family, sur)])
    if title:
        res[u'title'] = title
        aliases |= set([(u"%s %s" % (title, family)),
                        (u"%s %s %s" % (title, family, sur)),
                        (u"%s %s %s" % (title, sur, family)),
                        (u"%s %s %s" % (sur, title, family)),
                        (u"%s %s %s" % (sur, family, title)),
                        (u"%s %s %s" % (family, sur, title)),
                        (u"%s %s %s" % (family, title, sur))])
    if id in MEPS_ALIASES:
        aliases |= set(MEPS_ALIASES[id])
    res[u'aliases'] = sorted([x for x in set(unws(n) for n in aliases) if x])
    return res
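# Illustrative usage sketch (not part of the original module): the leading
# mixed-case tokens land in 'sur', the all-caps run in 'family'. The name and
# id below are made up (the id only matters if it appears in MEPS_ALIASES):
#
#   mangleName("Jane DOE", 99999)
#   # -> {'full': 'Jane DOE', 'sur': 'Jane', 'family': 'DOE',
#   #     'aliases': ['DOE', 'DOE Jane', 'Jane DOE']}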
def scrape(url, **kwargs):
    log(3,"scraping %s" % (url))
    root = getXML(url)
    if root is None:
        log(1,"could not get votes for %s" % url)
        return # angrily o/
    log(3, "processing plenary votes xml from %s" % url)
    # root is:
    # PV.RollCallVoteResults EP.Number="PE 533.923" EP.Reference="P7_PV(2014)04-17" Sitting.Date="2014-04-17" Sitting.Identifier="1598443"
    votes=[]
    for vote in root.xpath('//RollCallVote.Result'):
        # hrmpf, some EP seriously used the braindead Y-d-m format sometimes in vote timestamps :/
        time = vote.get('Date')
        if len(time.split()) == 2:
            ts = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
        else:
            ts = datetime.strptime(time, "%Y-%m-%d")
        tmp=vote.get('Identifier')
        if tmp:
            voteid = int(tmp)
        else:
            tmp = vote.get('Number')
            if not tmp:
                log(1, "blimey, could not deduce an id for the vote in %s" % url)
                raise ValueError("no id for vote in %s" % url)
            voteid = "%s-%s" % (ts,tmp)
        title = vote.xpath("RollCallVote.Description.Text")
        if len(title) != 1:
            log(2, "holy ambiguity Batman! This vote doesn't have one title, but %d: %s %s" % (len(title), voteid, url))
            title="!unknown!"
        else:
            title=junws(title[0])
        v={u"ts": ts, u"url": url, u"voteid": voteid, u"title": title, 'votes':{}}
        v.update(votemeta(v['title'], v['ts']))
        if 'epref' not in v:
            ref = vote.xpath("RollCallVote.Description.Text/a/text()")
            if ref:
                v['epref']=unws(ref[0])
        for type, stype in [('Result.For','+'), ('Result.Against','-'), ('Result.Abstention','0')]:
            type = vote.xpath(type)
            if not type: continue
            if len(type)>1:
                log(2, "[pff] more than one %s entry in vote (id:%s) in %s" % (stype, v['voteid'], url))
            type = type[0]
            v['votes'][stype]={'total': int(type.get('Number')),
                               'groups': {}}
            for group in type.xpath('Result.PoliticalGroup.List'):
                g = str(group.get('Identifier'))
                if not g in v['votes'][stype]['groups']:
                    v['votes'][stype]['groups'][g]=[]
                for tag in ['Member.Name', 'PoliticalGroup.Member.Name']:
                    for mep in group.xpath(tag):
                        m = {}
                        name = junws(mep)
                        mepid = mep.get("PersId")
                        if mepid:
                            mepid = int(mepid)
                        else:
                            mepid = db.getMep(name, v['ts'], abbr=g)
                        if mepid:
                            m['mepid']= mepid
                            #if int(mep.get('MepId')) in ambiguous_meps:
                            #    oid = int(mep.get('MepId'))
                            #    ambiguous_meps.remove(oid)
                            #    log(2,'found mepid for previously ambiguous obscure_id: "%s": %s' % (oid, mepid))
                        else:
                            mepid = lost_meps.get(mep.get('MepId'))
                            if mepid:
                                m['mepid']= mepid
                            else:
                                m['name']= name
                                m['obscure_id']=int(mep.get('MepId'))  # it's a totally useless and confusing id that is nowhere else used
                        v['votes'][stype]['groups'][g].append(m)
        # save
        process(v, v['voteid'], db.vote, 'ep_votes', v['title'])
        votes.append(v)
    return votes
def scrape(id, terms, mepname, **kwargs):
    activity_types = (('plenary-speeches', 'CRE'),
                      ('reports', "REPORT"),
                      ('reports-shadow', "REPORT-SHADOW"),
                      ('opinions', "COMPARL"),
                      ('opinions-shadow', "COMPARL-SHADOW"),
                      ('motions-instit', "MOTION"),
                      ('oral-questions', "OQ"),
                      # other activities
                      ('written-explanations', 'WEXP'),
                      ('major-interpellations', 'MINT'),
                      ('written-questions', "WQ"),
                      ('motions-indiv', "IMOTION"),
                      ('written-declarations', "WDECL"),
                      )
    activities = {}
    for type, TYPE in activity_types:
        for term in terms:
            page = 0
            cnt = 20
            url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (id, type, term, page, cnt)
            try:
                root = fetch(url)
            except:
                log(1, "failed to fetch {}".format(url))
                raise ValueError
                #continue
            #print(url, file=sys.stderr)
            while len(root.xpath('//div[@class="erpl_document"]')) > 0:
                for node in root.xpath('//div[@class="erpl_document"]'):
                    if type == 'written-explanations':
                        item = {'title': unws(''.join(node.xpath('./div/h3/span[@class="t-item"]//text()'))),
                                'date': datetime.strptime(node.xpath('./div[1]/div[1]/span[1]/text()')[0], u"%d-%m-%Y"),
                                'text': unws(''.join(node.xpath('./div[2]/div//text()')))}
                    elif type == 'written-declarations':
                        if len(node.xpath('./div[1]/div')) != 3:
                            log(2, "written decl item has not 3 divs but %d %s" % (len(node.xpath('./div[1]/div')), url))
                            continue
                        if len(node.xpath('./div[1]/div[1]/span')) != 3:
                            log(2, "written decl item has not 3 but %d spans in the 1st div at %s" % (len(node.xpath('./div[1]/div[1]/span')), url))
                            continue
                        item = {'title': unws(''.join(node.xpath('./div/h3/span[@class="t-item"]//text()'))),
                                'date': datetime.strptime(node.xpath('./div[1]/div[1]/span[1]/text()')[0], u"%d-%m-%Y"),
                                'id': unws(''.join(node.xpath('./div[1]/div[1]/span[2]/text()')[0])),
                                'status': unws(''.join(node.xpath('./div[1]/div[1]/span[3]/text()')[0])),
                                'formats': [{'type': unws(fnode.xpath('./span/text()')[0]),
                                             'url': str(fnode.xpath('./@href')[0]),
                                             'size': unws(fnode.xpath('./span/span/text()')[0])}
                                            for fnode in node.xpath('./div[1]/div[2]/div[@class="d-inline"]/a')],
                                'authors': [{'name': name.strip(), "mepid": db.mepid_by_name(name.strip())}
                                            for name in node.xpath('./div[1]/div[3]/span/text()')],
                                }
                        for info in node.xpath('./div[2]/div'):
                            label = unws(''.join(info.xpath('./text()')))[:-2]
                            value = unws(''.join(info.xpath('./span/text()')))
                            if 'date' in label.lower():
                                value = datetime.strptime(value, u"%d-%m-%Y")
                            if label == 'Number of signatories':
                                number, date = value.split(' - ')
                                value = int(number)
                                item["No of sigs date"] = datetime.strptime(date, u"%d-%m-%Y")
                            item[label] = value
                    else:
                        #from lxml.etree import tostring
                        #print('\n'.join(tostring(e).decode() for e in node.xpath('./div/div[1]')))
                        # all other activities share the following scraper
                        ref = unws(''.join(node.xpath('./div[1]/div[1]/span[2]/text()')))
                        if ref.startswith('- '):
                            ref = ref[2:]
                        if ref.endswith(' -'):
                            ref = ref[:-2]
                        item = {'date': datetime.strptime(node.xpath('./div[1]/div[1]/span[1]/text()')[0], u"%d-%m-%Y"),
                                'reference': ref,
                                }
                        if type not in ['written-questions', 'oral-questions']:
                            ref = unws(''.join(node.xpath('./div[1]/div[1]/span[3]/text()')))
                            if ref:
                                if not pere.match(ref):
                                    log(2, "pe, has not expected format: '%s'" % ref)
                                else:
                                    item['pe'] = ref
                        # opinions don't have title urls... why would they?
                        refurl = node.xpath('./div[1]/h3/a/@href')
                        if refurl:
                            item['url'] = str(refurl[0])
                        item['title'] = unws(''.join(node.xpath('./div/h3//span[@class="t-item"]//text()')))
                        abbr = node.xpath('./div[1]/div[1]/span/span[contains(concat(" ",normalize-space(@class)," ")," erpl_badge-committee ")]/text()')
                        if len(abbr):
                            item['committee'] = [a for a in [unws(c) for c in abbr] if a]
                        formats = []
                        for fnode in node.xpath('./div[1]/div[2]/div[@class="d-inline"]/a'):
                            elem = {'type': unws(fnode.xpath('./span/text()')[0]),
                                    'url': str(fnode.xpath('./@href')[0])}
                            tmp = fnode.xpath('./span/span/text()')
                            if len(tmp) > 0:
                                elem['size'] = unws(tmp[0])
                            formats.append(elem)
                        if formats:
                            item['formats'] = formats
                        authors = [{'name': name.strip(), "mepid": db.mepid_by_name(name.strip())}
                                   for name in node.xpath('./div[1]/div[3]/span/text()')]
                        if authors:
                            item['authors'] = authors
                        if type in ['opinions-shadow', 'opinions']:
                            for f in item['formats']:
                                if f['type'] == 'PDF':
                                    ref = pdf2ref(f['url'])
                                    if ref is not None:
                                        item['dossiers'] = [ref]
                                    break
                            else:
                                # try to deduce dossier from document reference
                                dossiers = db.get('dossiers_by_doc', item['reference']) or []
                                if len(dossiers) > 0:
                                    item['dossiers'] = [d['procedure']['reference'] for d in dossiers]
                                elif not '+DOC+PDF+' in item['url']:
                                    # try to figure out the associated dossier by making an (expensive) http request to the ep
                                    log(4, "fetching primary activity page %s" % item['url'])
                                    try:
                                        refroot = fetch(item['url'])
                                    except:
                                        refroot = None
                                    if refroot is not None:
                                        if '/doceo/' in item['url']:  # stupid new EP site removed the span with the procedure, bastards.
                                            fulla = refroot.xpath('//table[@class="buttondocwin"]//a/img[@src="/doceo/data/img/navi_moredetails.gif"]/..')
                                            if fulla:
                                                fullurl = fulla[0].get('href')
                                                if fullurl.endswith('.html'):
                                                    if fullurl[-7:-5] != 'EN':
                                                        fullurl = fullurl[:-7] + 'EN.html'
                                                    log(4, 'loading activity full text page %s' % fullurl)
                                                    if fullurl.startswith('/doceo'):
                                                        fullurl = 'https://www.europarl.europa.eu' + fullurl
                                                    if fullurl != item['url']:
                                                        refroot = fetch(fullurl)
                                            else:
                                                log(4, 'no fulla for %s' % item['url'])
                                        anchor = refroot.xpath('//span[@class="contents" and text()="Procedure : " and not(ancestor::div[@style="display:none"])]')
                                        if len(anchor) == 1:
                                            dossier = anchor[0].xpath("./following-sibling::a/text()")
                                            if len(dossier) == 1:
                                                item['dossiers'] = [unws(dossier[0])]
                                            elif len(dossier) > 1:
                                                log(2, "more than one dossier in ep info page: %d %s" % (len(dossier), item['url']))
                                        elif len(anchor) > 1:
                                            log(2, "more than one anchor in ep info page: %d %s" % (len(anchor), item['url']))
                    item['term'] = term
                    if TYPE not in activities:
                        activities[TYPE] = []
                    activities[TYPE].append(item)
                if len(root.xpath('//div[@class="erpl_document"]')) < cnt:
                    break
                page += 1
                url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (id, type, term, page, cnt)
                try:
                    root = fetch(url)
                except:
                    log(1, "failed to fetch {}".format(url))
                    #raise ValueError
                    break
                #print(url, file=sys.stderr)
        if TYPE in activities:
            activities[TYPE] = sorted(activities[TYPE], key=lambda x: x['date'])
    activities['mep_id'] = id
    if len(activities.keys()) > 1:
        process(activities, id, db.activities, 'ep_mep_activities', mepname, nodiff=True)
        return activities
    return {}
def scrape(url, committee, **kwargs):
    comid = committee
    root = fetch(url)
    lines = [x for x in root.xpath('//td[@class="contents"]/div/*')
             if unws(' '.join(x.xpath('.//text()')))]
    lines = [x for x in lines
             if unws(' '.join(x.xpath('.//text()'))) not in ['<EPHeader>', '</EPHeader>']]
    if not len(lines):
        return
    if not unws(' '.join(lines[2].xpath('.//text()'))) in ['DRAFT AGENDA', '<TitreType> DRAFT AGENDA </TitreType>']:
        log(3, "not DRAFT AGENDA %s in %s" % (unws(' '.join(lines[2].xpath('.//text()'))), url))
    agenda = {u'committee': comid,
              u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
              u'src': url,
              }
    i = 1
    if unws(' '.join(lines[3].xpath('.//text()'))) == "INTERPARLIAMENTARY COMMITTEE MEETING":
        log(2, "skipping interparl com meet")
        return
    if len(lines) >= 7 and unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
                       u'type': unws(' '.join(lines[3].xpath('.//text()'))),
                       u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
                       u'city': unws(' '.join(lines[5].xpath('.//text()'))),
                       u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
                       })
        i = 7
    itemcnt = 0
    item = {}
    schedule = None
    res = []
    while i < len(lines):
        line = lines[i]
        i += 1
        txt = unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue  # skip end of schedule block
        # 20 December 2011, 16.00 – 16.30
        tmp = toTime(txt)
        if tmp:
            schedule = tmp
            if i < len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera'] = True
                i += 1
            continue
        if line.tag == 'div':
            item[u'actors'] = getactors(line)
            continue
        firsttoken = txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1] == '.' and firsttoken[:-1].isdigit() and itemcnt + 1 == int(firsttoken[:-1]):
            if item:
                res.append(item)
            itemcnt += 1
            item = copy.deepcopy(agenda)
            item.update({u'title': ' '.join(txt.split()[1:]),
                         u'seq_no': itemcnt,
                         })
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken == u"·":
            if not 'list' in item:
                item[u'list'] = []
            tmp = ' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline'] = datetime.strptime(tmp.split(':')[1].strip(), "%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline'] = datetime.strptime(tmp.split(':')[1].strip(), "%d.%m.%Y at %H.%M")
                    except:
                        log(2, '[$] unknown tabling deadline format %s' % unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt) == 12:
            item[u'comdossier'] = txt
            continue
        # ***I 2011/0373(COD) COM(2011)0793 – C7-0454/2011
        tmp = getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        log(4, "(fallthrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item:
        res.append(item)
    save(res)
    return res
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }
    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape death data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party': party,
                    u'country': country,
                    u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor', 'Substitute observer']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role': role,
                     u'Organization': org,
                     u'country': COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid': group_map[org],
                     u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']={}
    for sec in root.xpath('//h3[@class="collapsible"]'):
        section=unws(''.join(sec.xpath('.//text()')))
        data[u'CV'][section]=[]
        for line in sec.xpath('./following-sibling::div[1]//li'):
            data[u'CV'][section].append(unws(''.join(line.xpath('.//text()'))))

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
    return data
def scrape(url, meps=None, **kwargs):
    prolog=True
    res=[]
    block=None
    reference=None
    date=None
    committee=[]
    text, PE=getraw(url)
    motion = False
    for line in text:
        #log(4,'line is: "%s"' % line)
        if prolog:
            line=unws(line)
            if not line: continue
            if amstart.match(line):
                if PE is None:
                    log(1, "document has no PE id: %s" % url)
                if reference==None:
                    log(1,"[!] couldn't find ref: %s" % (unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    if not motion:
                        log(1, "couldn't find dossier reference in source pdf: %s" % url)
                        #raise ValueError("No dossier reference in amendment: %s" % url)
                        return
                    log(3, "couldn't find dossier reference in source pdf, but was marked as motion: %s" % url)
                    return
                if date==None or committee==[]:
                    log(1,"[!] couldn't find date or committee: %s" % url)
                    raise ValueError("No date or committee in amendment")
                block=[line]
                prolog=False
                continue
            if line == 'Draft motion for a resolution':
                log(4,"document is a draft motion for resolution")
                motion = True
            m = re.search(pere, line)
            if m:
                if PE is None: PE = m.group(0)
                log(4,"found PE reference: %s" % PE)
                line = unws(line.replace(PE,''))
                log(4,'updated line is: "%s"' % line)
            if line in COMMITTEE_MAP:
                log(4,'found committee: "%s"' % line)
                committee.append(COMMITTEE_MAP[line])
                continue
            m = re.search(refre, line)
            if (committee and not reference and m):
                reference=m.group(1)
                log(4,'found reference: "%s"' % reference)
                if url == 'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN':
                    log(3, "adjusting reference to eudatap")
                    reference="2012/0011(COD)"
                continue
            if (not date):
                try:
                    date = parse(unws(line), dayfirst=True)
                    log(4,'found date: "%s"' % line)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue
        if amstart.match(line):
            # parse block
            am=parse_block(block, url, reference, date, committee, meps, PE)
            if am is not None:
                process(am, am['id'], db.amendment, 'ep_amendments', am['reference']+' '+am['id'], nodiff=True)
                res.append(am)
            block=[line]
            continue
        block.append(line)
    if block and filter(None,block):
        am = parse_block(block, url, reference, date, committee, meps, PE)
        if am is not None:
            process(am, am['id'], db.amendment, 'ep_amendments', am['reference']+' '+am['id'], nodiff=True)
            res.append(am)
    log(3,"total amendments %d in %s" % (len(res),url))
    return res
def parse_block(block, url, reference, date, committee, rapporteur, PE):
    am={u'src': url,
        u'peid': PE,
        u'reference': reference,
        u'date': date,
        u'committee': committee}

    #logger.info(block)
    # get title
    try:
        am[u'seq']=int(unws(block[0]).split()[1])
    except ValueError:
        am[u'seq']=unws(block[0]).split()[1]
    except IndexError:
        log(2,"wrong seq %s" % (block[0]))
        am[u'seq']=unws(block[0])
    del block[0]
    pefix = PE.split('v')[0] # we strip off the v0[0-9]-[0-9]{1,2} part of the PEID
    am['id']="%s-%s" % (pefix,am['seq'])

    strip(block)

    # find and strip justification
    i=len(block)-1
    while i>2 and not (unws(block[i])=="Justification" and block[i].startswith(' ' * 6)):
        i-=1
    if i>2:
        if i<len(block)-1 and (not unws(block[i+1]) or not block[i+1].startswith(' ')):
            am['justification']='\n'.join(block[i+2:])
            del block[i:]
            strip(block)
        else:
            log(2, 'wrong justification in %s: "%s"' % (am['seq'], '\\n'.join(block[i:])))

    # get original language
    if 4<len(unws(block[-1]))<=6 and unws(block[-1]).startswith('Or.'):
        am['orig_lang']=unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    i=len(block)-1
    while (i>2 and
           not ((block[i].endswith(" Amendment") or
                 block[i].endswith(" PARTICULARS") or
                 block[i].endswith(" Remedy") or
                 block[i].endswith(" Amended text") or
                 block[i].endswith(" Amendement") or
                 block[i].endswith(" Amendments by Parliament") or
                 block[i].endswith(" Proposal for rejection") or
                 block[i].endswith(" Proposal for a rejection") or
                 block[i].endswith(" Does not affect English version") or
                 block[i].endswith(" (Does not affect English version)") or
                 block[i].endswith(" Amendment by Parliament")) and
                len(block[i])>33) and
           not (unws(block[i])=='Text proposed by the Commission' or
                unws(block[i]) in types)):
        i-=1
    if i>2:
        #if block[i].endswith(" Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq=False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceding original text
            j=i
            while (j>2 and not (unws(block[j]) in types or unws(block[j])=='Text proposed by the Commission')):
                j-=1
            if j>2: i=j
            seq=True; key='old'
        elif unws(block[i])=='Text proposed by the Commission' or block[i].strip() in types:
            seq=True; key='old'
        # throw headers
        del block[i]
        while i<len(block) and not unws(block[i]): del block[i]   # skip blank lines
        mid=max([len(x) for x in block])//2
        while i<len(block):
            if seq:
                if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key='new'
                    del block[i]
                    continue
                try: am[key].append(block[i])
                except KeyError: am[key]=[block[i]]
                del block[i]
                continue
            # only new, old is empty (deep indent means only the right column is present;
            # the multi-space literals here reconstruct whitespace collapsed in the source)
            if block[i].startswith('               '):
                try: am['new'].append(unws(block[i]))
                except KeyError: am['new']=[unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind('  ')  # last two-space gap = column separator
            # only old, new is empty
            if newstart < 6:
                try: am['old'].append(unws(block[i]))
                except KeyError: am['old']=[unws(block[i])]
                del block[i]
                continue
            #mid=len(block[i])/2
            #mid=40
            lsep=block[i].rfind('  ', 0, mid)
            rsep=block[i].find('  ', mid)
            sep=None
            if abs(lsep-mid)<abs(rsep-mid):
                if abs(lsep-mid)<15:
                    sep=lsep
            else:
                if abs(rsep-mid)<15:
                    sep=rsep
            if sep:
                try: am['old'].append(unws(block[i][:sep]))
                except KeyError: am['old']=[unws(block[i][:sep])]
                try: am['new'].append(unws(block[i][sep:]))
                except KeyError: am['new']=[unws(block[i][sep:])]
            else:
                # no sane split found
                #logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try: am['old'].append(unws(block[i][:newstart]))
                except KeyError: am['old']=[unws(block[i][:newstart])]
                try: am['new'].append(unws(block[i][newstart:]))
                except KeyError: am['new']=[unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        if not 'Does not affect English version.' in block[i:]:
            log(2, "no table\n%s" % ('\n'.join(block[i:])))
            return None
            #am['content']=block[i:]
            #return am

    i=0
    # find end of authors
    while (i<len(block) and
           unws(block[i]) and
           not unws(block[i]).lower().startswith('compromise') and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts):
        i+=1
    if i<len(block):
        if i>0:
            names=' '.join(block[:i])
            am['authors']=names
            #logger.info("names \n%s" % names)
            # convert to pt mep _ids
            for text in filter(None,splitNames(names)):
                mepid=db.getMep(text,date)
                if mepid:
                    try: am['meps'].append(mepid)
                    except KeyError: am['meps']=[mepid]
                else:
                    log(3, "fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am['authors']=rapporteur
            if isinstance(rapporteur,list):
                for text in rapporteur:
                    mepid=db.getMep(text,date)
                    if mepid:
                        try: am['meps'].append(mepid)
                        except KeyError: am['meps']=[mepid]
                    else:
                        log(3, "fix %s" % text)
            else:
                for text in filter(None,splitNames(rapporteur)):
                    mepid=db.getMep(text,date)
                    if mepid:
                        try: am['meps'].append(mepid)
                        except KeyError: am['meps']=[mepid]
                    else:
                        log(3, "fix %s" % text)
        else:
            log(3, "no authors in Amendment %s %s" % (am['seq'], url))
    else:
        log(2, "no boundaries in Amendment %s %s\n%s" % (am['seq'], url, '\n'.join(block)))
        am['rest']=block
        return am

    # handle compromise info
    i=0
    while (i<len(block) and
           unws(block[i]) and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts):
        i+=1
    if i<len(block) and i>0:
        if [unws(x) for x in block[:i]]!=["Draft proposal for a recommendation"]:
            am['compromise']=block[:i]
        del block[:i]
        strip(block)

    i=0
    while (i<len(block) and unws(block[i])):
        if unws(block[i]).split()[0] in locstarts:
            try: am['location'].append((' '.join(block[:i]),unws(block[i])))
            except KeyError: am['location']=[(' '.join(block[:i]),unws(block[i]))]
            del block[:i+1]
            i=0
        else:
            i+=1
    if len(block)>0 and ((len(block)==1 or not unws(block[1])) and unws(block[0])!='1' and 'location' in am):
        am['location'][-1]=(am['location'][-1][0],"%s %s" % (am['location'][-1][1],block[0]))
        del block[0]
        strip(block)

    if block:
        if not ((len(block)==3 and unws(block[0])=='1' and not unws(block[1]) and block[2].startswith(' ')) or
                (len(block)==2 and unws(block[0])=='1' and block[1].startswith(' '))):
            # ignore obvious footnotes
            log(3, "rest in Amendment %s\n%s" % (am['seq'],'\n'.join(block)))
    return am
def strip(block):
    while len(block) and not unws(block[0]):
        del block[0]
    while len(block) and not unws(block[-1]):
        del block[-1]
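# Illustrative usage sketch (not part of the original module): strip() trims
# whitespace-only lines from both ends of a block, in place:
#
#   block = ['', '  ', 'Amendment 1', '', 'text', '']
#   strip(block)
#   # block is now ['Amendment 1', '', 'text']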
def getactors(node):
    res = {}
    ax = [None, []]
    for row in node.xpath('.//tr'):
        cells = row.xpath('./td/p')
        if not cells:
            continue
        # get role Rapporteur|Responsible|Rapporteur for the opinion|Opinions
        role = cells[0].xpath('text()')
        if role and unws(role[0]):
            #print(ax[1])
            if ax[0] and ax[1]:
                res[ax[0]] = sorted(ax[1], key=lambda x: x.get('name', ''))
            tmp = unws(role[0])[:-1]
            if tmp == "Rapporteur for the opinion":
                tmp = "Rapporteur"
            ax = [tmp, []]
        tmp = unws((cells[1].xpath('text()') or [''])[0])
        if ax[0] in ["Rapporteur", "Rapporteur for the opinion"] and tmp:
            name = ' '.join(tmp.split()[:-1])
            item = {u'group': tmp.split()[-1][1:-1],
                    u'name': name,
                    u'mepref': getMEPRef(name)}
            if len(cells) > 2:
                item[u'docs'] = getdoclist(cells[2])
            ax[1].append(item)
            continue
        if ax[0] in ["Opinions", "Responsible"] and tmp:
            comid = None  # guards the lookup below when no branch matched
            rest = ''
            tmp1 = tmp.split(u' –', 1)
            if len(tmp1) == 2:
                (comid, rest) = tmp1
            elif len(tmp1) == 1:
                # tmp1 is a list, so index into it before slicing the string
                if len(tmp1[0]) == 4 and tmp1[0].isupper():
                    (comid, rest) = (tmp1[0], '')
                elif len(tmp1[0]) > 4 and tmp1[0][4] in ['-', u'–', u':', u'*'] and tmp1[0][:4].isupper():
                    (comid, rest) = (tmp1[0][:4], tmp1[0][5:])
                else:
                    skip = False
                    for com in tmp.split(', '):
                        if com in COMMITTEE_MAP and len(com) == 4:
                            ax[1].append({u'comid': com})
                            skip = True
                    if skip:
                        continue
            else:
                log(2, "[!] unknown committee: %s" % tmp)
                raise
            if not comid:
                log(2, "[!] unknown committee: %s" % tmp)
            item = {u'comid': comid}
            if rest == ' Decision: no opinion':
                item[u'response'] = u'Decision: no opinion'
            if not rest and len(comid) > 4:
                for com in comid.split(', '):
                    ax[1].append({u'comid': com})
                continue
            if len(cells) > 2:
                tmp = unws((cells[2].xpath('text()') or [None])[0])
                if tmp:
                    name = ' '.join(tmp.split()[:-1])
                    item.update({u'group': tmp.split()[-1][1:-1],
                                 u'name': name,
                                 u'mepref': getMEPRef(name)})
            if len(cells) > 3:
                item[u'docs'] = getdoclist(cells[3])
            ax[1].append(item)
    if ax[0] and ax[1]:
        #print(ax[0], ax[1])
        #res[ax[0]]=ax[1]
        res[ax[0]] = sorted(ax[1], key=lambda x: x.get('name', ''))
    return res
def scrape(id, **kwargs):
    # we ignore the /meps/en/<id>/<name>/home path, since we can get all info also from other pages
    url = "http://www.europarl.europa.eu/meps/en/%s/name/cv" % id
    xml = fetch_raw(url)  # we have to patch up the returned html...
    xml = xml.replace("</br>", "<br/>")  # ...it contains some bad tags..
    root = fromstring(xml)  # ...which make the lxml soup parser drop some branches in the DOM
    sidebar_check(root, url)

    mep = {
        'UserID': id,
        'Name': mangleName(unws(' '.join(root.xpath('//span[@class="sln-member-name"]/text()'))), id),
        'Photo': "https://www.europarl.europa.eu/mepphoto/%s.jpg" % id,
        'meta': {'url': url},
        'Twitter': [unws(x.replace("http:// ", "")) for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Twitter"]/@href')],
        'Homepage': [unws(x.replace("http:// ", "")) for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Website"]/@href')],
        'Facebook': [unws(x.replace("http:// ", "")) for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Facebook"]/@href')],
        'Instagram': [unws(x.replace("http:// ", "")) for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="Instagram"]/@href')],
        'Mail': [deobfus_mail(x) for x in root.xpath('//section[@id="presentationmep"]//a[@data-original-title="E-mail"]/@href')],
        'Addresses': parse_addr(root),
        'active': False,
    }
    mep = addchangednames(mep)

    birthdate = root.xpath('//time[@id="birthDate"]/text()')
    if len(birthdate) > 0:
        mep['Birth'] = {'date': datetime.strptime(unws(birthdate[0]), u"%d-%m-%Y")}
        place = root.xpath('//time[@id="birthDate"]/following-sibling::text()')
        if len(place) > 0:
            tmp = unws(' '.join(place))
            if tmp.startswith(", "):
                tmp = tmp[2:]
            mep['Birth']['place'] = tmp

    death = root.xpath('//time[@id="deathDate"]/text()')
    if death:
        mep['Death'] = datetime.strptime(unws(death[0]), u"%d-%m-%Y")

    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if body.xpath('.//h1[text()="Curriculum vitae "]'):
        if not body.xpath('.//h3[@id="no_cv_available"]'):
            mep['CV'] = {'updated': datetime.strptime(unws(body.xpath('.//p[@class="small"]/strong[contains(text(),"Updated: ")]/text()')[0]), u"Updated: %d/%m/%Y")}
            mep['CV'].update({unws(''.join(title.xpath(".//text()"))): [unws(''.join(item.xpath(".//text()"))).replace("-...", "- ...")
                                                                        for item in title.xpath("following-sibling::ul/li")]
                              for title in body.xpath('.//h4')
                              #if not unws(''.join(title.xpath(".//text()"))).startswith("Original version : ")
                              })

    # assistants
    url = "http://www.europarl.europa.eu/meps/en/%s/name/assistants" % id
    root = fetch(url)
    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Assistants":
        for h4 in body.xpath('.//h4'):
            title = unws(''.join(h4.xpath(".//text()")))
            assistants = [unws(''.join(item.xpath(".//text()"))) for item in h4.xpath("../div//span")]
            if title in ['Accredited assistants', 'Local assistants']:
                if not 'assistants' in mep:
                    mep['assistants'] = {}
                title = title.lower().split()[0]
                if assistants:
                    mep['assistants'][title] = assistants
            elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                           'Service providers', 'Trainees', 'Paying agents (grouping)', 'Paying agents',
                           'Assistants to the Vice-Presidency/to the Quaestorate']:
                if not 'assistants' in mep:
                    mep['assistants'] = {}
                title = title.lower()
                if assistants:
                    mep['assistants'][title] = assistants
            else:
                log(2, 'unknown title for assistants "{}" {}'.format(title, url))
                raise ValueError

    # declarations
    root = fetch("http://www.europarl.europa.eu/meps/en/%s/name/declarations" % id)
    body = root.xpath('//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Declarations":
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key == 'Declaration of financial interests':
                key = 'Financial Declarations'
                mep[key] = []
                for pdf in title.xpath('./following-sibling::ul/li/a'):
                    url = pdf.xpath('./@href')[0]
                    try:
                        mep[key].append(findecl.scrape(url))
                    except:
                        log(1, "failed to extract findecl from %s" % url)
            elif key == 'Declarations of participation by Members in events organised by third parties':
                key = 'Declarations of Participation'
                mep[key] = []
                for pdf in title.xpath('./following-sibling::ul/li/a')[::-1]:  # reversed order, otherwise newer ones get prepended and mess up the diff
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            elif key in ['Declaration of good conduct', 'Voluntary confirmation on the use of the General Expenditure Allowance']:
                mep[key] = []
                for pdf in title.xpath('./following-sibling::ul/li/a')[::-1]:  # reversed order, otherwise newer ones get prepended and mess up the diff
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            else:
                log(2, 'unknown type of declaration: "%s" http://www.europarl.europa.eu/meps/en/%s/name/declarations' % (key, id))
                key = None
                raise ValueError

    # history
    parse_history(id, root, mep)
    process(mep, id, db.mep, 'ep_meps', mep['Name']['full'], nopreserve=(['Addresses'], ['assistants']), onchanged=onchanged)

    if __name__ == '__main__':
        return mep
    del mep
def parse_history(id, root, mep):
    for term in root.xpath('//div[@id="sectionsNavPositionInitial"]//div[@class="erpl_side-navigation"]/div/ul/li//span[text()="History of parliamentary service"]/../following-sibling::div//ul/li//a/span[@class="t-x"]/text()'):
        if not term.endswith("parliamentary term"):
            log(2, 'history menu item does not end as expected with "parliamentary term": %s http://www.europarl.europa.eu/meps/en/%s/name/declarations' % (term, id))
            raise ValueError
            #continue
        term = int(term[0])
        if (id, term) in {(124870, 9), (129141, 9)}:
            continue  # jeppe kofod, and frans timmermanns never really got started.
        root = fetch("http://www.europarl.europa.eu/meps/en/%s/name/history/%s" % (id, term))
        body = root.xpath('//div[@id="status"]')[0]
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key in [None, '']:
                log(2, "empty history section http://www.europarl.europa.eu/meps/en/%s/name/history/%s" % (id, term))
                raise ValueError
                #continue
            #mep[key] = []
            for item in title.xpath('./following-sibling::ul/li'):
                interval = unws(''.join(item.xpath('./strong/text()')))
                post = item.xpath('./strong/following-sibling::text()')[0][3:]
                if key in ["National parties", "Constituencies"]:
                    key = 'Constituencies'
                    # parse date interval
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(1, "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s" % (interval, id, term))
                        raise ValueError
                        #continue
                    # parse party and country
                    cstart = post.rfind(' (')
                    if post[cstart + 2:-1] in SEIRTNUOC:
                        country = post[cstart + 2:-1]
                        party = post[:cstart]
                    else:
                        log(2, '%s unknown country: %s' % (id, post[cstart + 2:-1]))
                        raise ValueError
                        party = 'unknown'
                        country = 'unknown'
                    if not key in mep:
                        mep[key] = []
                    mep[key].append({u'party': party,
                                     u'country': country,
                                     u'start': start,
                                     u'end': end,
                                     'term': term})
                    if end == datetime.strptime("31.12.9999", u"%d.%m.%Y"):
                        mep['active'] = True
                elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor', 'Substitute observer']:
                    # memberships in various committees, delegations and EP mgt
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(2, "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s" % (interval, id, term))
                        raise ValueError
                        #continue
                    item = {
                        u'role': key,
                        u'Organization': unws(post),
                        u'start': start,
                        u'end': end,
                        u'term': term,
                    }
                    for start, field in ORGMAPS:
                        if item['Organization'].startswith(start):
                            if field == 'Committees':
                                if item['Organization'] in COMMITTEE_MAP:
                                    item[u'abbr'] = COMMITTEE_MAP[item['Organization']]
                                else:
                                    log(5, "no abbr found for committee: %s" % item['Organization'])
                            if field == 'Delegations':
                                if item['Organization'] in DELEGATIONS:
                                    item[u'abbr'] = DELEGATIONS[item['Organization']]
                                else:
                                    log(5, "no abbr found for delegation: %s" % item['Organization'])
                            if not field in mep:
                                mep[field] = []
                            mep[field].append(item)
                            break
                elif key == u'Political groups':
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(1, "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s" % (interval, id, term))
                        raise ValueError
                        #continue
                    tmp = post.split(u' - ')
                    if len(tmp) > 1:
                        org = ' - '.join(tmp[:-1])
                        role = tmp[-1]
                    elif post.endswith(' -'):
                        org = post[:-2]
                        role = ''
                    elif post in ['Non-attached Members', 'Non-attached']:
                        org = post
                        role = 'Member'
                    else:
                        log(2, '[!] political group line "%s", http://www.europarl.europa.eu/meps/en/%s/name/history/%s' % (post, id, term))
                        raise ValueError
                        #continue
                    if not u'Groups' in mep:
                        mep[u'Groups'] = []
                    if not org in GROUP_MAP:
                        log(5, "no groupid found for group: %s" % org)
                    mep[u'Groups'].append({
                        u'role': role,
                        u'Organization': org,
                        # u'country': country,  # this value is missing from the latest EP website
                        u'groupid': GROUP_MAP.get(org, org),
                        u'start': start,
                        u'end': end,
                    })
                else:
                    log(2, '[!] unknown field "%s" http://www.europarl.europa.eu/meps/en/%s/name/history/%s' % (key, id, term))
                    raise ValueError

    # reorder historical lists in ascending order, so new entries are appended and don't mess up the diffs
    for k in ('Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff'):
        if not k in mep:
            continue
        mep[k] = [e for e in sorted(mep[k], key=lambda x: (x['start'], x['end'], x.get('Organization', x.get('party'))))]
def scrape_epagents(table):
    heading=''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible=None
    if heading in [ "Committee responsible", "Former committee responsible"]:
        responsible=True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible=False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems=table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips=[t.xpath('text()')[0]
          if len(t.xpath('text()'))>0
          else groupurlmap[t.xpath("a")[0].get('href')]
               if len(t.xpath("a"))>0
               else groupurlmap[t.xpath("img")[0].get('src')]
          for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
    shadows={}
    for shadow, group in izip_longest(shadowelems, tips):
        committee=shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee]=[]
        if group=='NI': group=u'NI'
        mep={u'name': unicode(shadow.xpath('text()')[0]),
             u'group': unicode(group)}
        tmp=getMEPRef(shadow.xpath('text()')[0])
        if tmp:
            mep[u'mepref']=tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the unnecessary shadow elements - so the following regular lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent=todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents=[]
    for agent in lst2obj(table,epagents,1):
        agent[u'responsible']=responsible
        agent[u'body']=u'EP'
        if agent.get('rapporteur'):
            meps=[]
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion']=None
                    continue
                tmp=getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp,
                                 u'group': mep['group'],
                                 u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'],
                                 u'name': mep['name']})
            agent[u'rapporteur']=meps

        abbr=agent['committee'][:4]
        if abbr=='BUDE': abbr='BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] unknown committee abbrev %s" % abbr)
            agent[u'committee_full']=agent['committee']
            if agent['committee'][4]==' ' and abbr.isalpha():
                agent[u'committee']=abbr
        else:
            agent[u'committee_full']=agent['committee'][5:]
            agent[u'committee']=abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows']=shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents
def unpaginate(text, url):
    lines = text.split('\n')
    # find end of 1st page
    eo1p = 0
    PE = None
    while not lines[eo1p].startswith('\x0c') and eo1p<len(lines):
        eo1p+=1
    if eo1p == len(lines):
        log(1, "could not find end of 1st page in %s" % url)
        raise ValueError("eo1p not found: %s" % url)

    i = len(lines) - 1
    while i>=0:
        if not lines[i].startswith('\x0c'):
            i -= 1
            continue
        # we found a line starting with pagebreak
        lines[i]=lines[i][1:]
        i -= 1
        fstart = i
        # skip empty lines before pagebreak
        while i>=0 and unws(lines[i])=='':
            i-=1
        # we expect i>0 and lines[i] == 'EN' (or variations)
        if i<=0:
            log(1, "could not find non-empty line above pagebreak in %s" % url)
            raise ValueError("no EN marker found: %s" % url)
        tmp = unws(lines[i])
        if tmp not in ["EN", "EN EN", "EN United in diversity EN",
                       "EN Unity in diversity EN",
                       "EN Unie dans la diversité EN",
                       "EN In Vielfalt geeint EN",
                       "ENEN United in diversity EN",
                       "XM United in diversity XM",
                       "XT United in diversity EN",
                       "XM", "XM XM", "XT", "XT XT"]:
            if tmp in ["FR",'NL','HU']:
                log(2,'Document has non-english language marker: "%s" %s' % (tmp, url))
                return [], None
            if tmp=="Or. en":
                # no footer in this page
                continue
            if tmp in ['AM_Com_NonLegCompr', 'AM_Com_NonLegReport','AM_Com_NonLegOpinion']:
                # no footer on this page (and probably neither on the previous one which should be the first)
                continue
            # an exceptional document
            if (url=='http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+COMPARL+PE-532.324+01+DOC+PDF+V0//EN&language=EN'
                    and tmp in ["Paragraph 8", "Pervenche Berès, Frédéric Daerden"]):
                continue
            if (url in ['http://www.europarl.europa.eu/doceo/document/CJ25-AM-593898_EN.pdf',
                        'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+COMPARL+PE-593.898+01+DOC+PDF+V0//EN&language=EN']
                    and tmp=="(2016/2204(INI))"):
                continue
            if (url=='http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+COMPARL+PE-594.137+01+DOC+PDF+V0//EN&language=EN'
                    and tmp=='(2016/2018(INI))'):
                continue
            if isfooter(tmp):
                if PE is None:
                    # try to figure out PE id
                    m = pere.match(tmp)
                    if m: PE = m.group(0)
                log(3, 'no EN marker found, but footer: "%s"' % tmp)
                i+=1 # neutralize the decrement after this block
            else:
                log(1, 'could not find EN marker above pagebreak: %d %d "%s"' % (i, eo1p, tmp))
                raise ValueError('no EN marker found "%s" in %s' % (tmp,url))
        if lines[i].startswith('\x0c'):
            # we found a ^LEN^L
            # we found an empty page.
            while fstart > i:
                del lines[fstart]
                fstart -= 1
            lines[i]="\x0c"
            continue
        i -= 1
        # find the next non-empty line above the EN marker
        while i>0 and unws(lines[i])=='':
            i-=1
        if i<=0:
            log(1, "could not find non-empty line above EN marker: %s" % url)
            raise ValueError("no next line above EN marker found: %s" % url)
        if not isfooter(lines[i]):
            tmp = unws(lines[i])
            if tmp=="Or. en":
                i+=1 # preserve this line - and cut off the rest
            elif tmp not in ['AM_Com_NonLegCompr', 'AM_Com_NonLegReport','AM_Com_NonLegOpinion']:
                log(1,'not a footer: "%s" line: %d in %s' % (repr(lines[i]),i,url))
                raise ValueError('not a footer: "%s" line: %d in %s' % (lines[i],i,url))
        elif PE is None:
            # try to figure out PE id
            m = pere.match(unws(lines[i]))
            if m: PE = m.group(0)
        if lines[i].startswith('\x0c'):
            # we found an empty page with only the footer
            lines[i]='\x0c'
            i+=1
        #else: # is a regular page
        #    i -= 1
        #    if unws(lines[i])!='':
        #        for j in range(-10,10):
        #            log(1, '"%s"' % (unws(lines[i+j])))
        #        log(1, 'line above footer is not an empty line: "%s"' % (unws(lines[i])))
        #        raise ValueError("no empty line above footer")
        # delete all lines between fstart and i
        while fstart >= i:
            del lines[fstart]
            fstart -= 1
    return lines, PE