def getAddress(root): res={} for div in root.xpath('//div[@id="contextzone"]//div[@class="ep_title"]'): # getAddress(map(strip, div.xpath("../..//div[@class='ep_elementcontact']/ul"))) key=unws(''.join(div.xpath('.//text()'))) if key not in ['Bruxelles', 'Strasbourg', 'Postal address', 'Luxembourg']: continue res[key]={} if key in ['Bruxelles', 'Strasbourg', 'Luxembourg']: tmp=div.xpath('../..//li[@class="ep_phone"]/div/text()') if tmp: res[key]['Phone'] = unws(tmp[0]).replace('(0)','') tmp=div.xpath('../..//li[@class="ep_fax"]/div/text()') if tmp: res[key]['Fax'] = unws(tmp[0]).replace('(0)','') tmp=[unws(x) for x in div.xpath('../..//li[@class="ep_address"]/div/text()') if len(unws(x))] if key=='Strasbourg': res[key].update(dict(zip(['Organization','Building', 'Office', 'Street','Zip1', 'Zip2'],tmp))) res[key]['City']=res[key]['Zip2'].split()[1] res[key]['Zip2']=res[key]['Zip2'].split()[0] res[key]['building_code']=buildings[res[key]['Building']] elif key=='Bruxelles': res[key].update(dict(zip(['Organization','Building', 'Office', 'Street','Zip'],tmp))) res[key]['City']=res[key]['Zip'].split()[1] res[key]['Zip']=res[key]['Zip'].split()[0] res[key]['building_code']=buildings[res[key]['Building']] elif key=='Luxembourg': res[key]['Address']=tmp elif key=='Postal address': res[key]=tmp else: logger.error("wtf %s" % key) return res
def getAddress(root):
    res={}
    for div in root.xpath('../following-sibling::div[@class="boxcontent " or @class="boxcontent nobordertop"]/ul[@class="contact"]'):
        key=unws(''.join(div.xpath('./preceding-sibling::h4/text()')))
        if key not in ['Bruxelles', 'Strasbourg', 'Postal address', 'Luxembourg']:
            continue
        if key=='Bruxelles':
            key=u'Brussels'
        elif key=='Postal address':
            key=u'Postal'
        res[key]={}
        if key in ['Brussels', 'Strasbourg', 'Luxembourg']:
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="phone"]/text()')
            if tmp:
                res[key][u'Phone'] = unws(tmp[0]).replace('(0)','')
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="fax"]/text()')
            if tmp:
                res[key][u'Fax'] = unws(tmp[0]).replace('(0)','')
        tmp=[unws(x) for x in div.xpath('./li[@class="address"]//text()') if len(unws(x))]
        if key=='Strasbourg':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip1', u'Zip2'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip2'].split()[1]
            res[key][u'Address']['Zip2']=res[key]['Address']['Zip2'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Brussels':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip'].split()[1]
            res[key][u'Address']['Zip']=res[key]['Address']['Zip'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Luxembourg':
            res[key][u'Address']=tmp
        elif key=='Postal':
            res[key]=tmp
        else:
            logger.error("wtf %s" % key)
    return res
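# unws() is used by every scraper in this file but is not defined here.
# A minimal sketch of what it is assumed to do -- collapse runs of
# whitespace (newlines, tabs, repeated blanks) and strip the ends;
# the real helper may differ:
def unws(txt):
    return u' '.join(txt.split())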
def getMEPGender(id):
    try:
        mepraw = fetch("http://www.europarl.europa.eu/meps/fr/%s/get.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)
        return []
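# fetch() is the common helper behind all of these scrapers; it is not
# defined in this file. It evidently GETs a url and returns an lxml root
# node (everything downstream calls .xpath() on the result), with an
# ignore= list of HTTP status codes that should not abort the scrape.
# A minimal sketch under those assumptions, using urllib2 and lxml:
import urllib2
from lxml.html import parse

def fetch(url, ignore=()):
    try:
        return parse(urllib2.urlopen(url)).getroot()
    except urllib2.HTTPError, e:
        if e.code in ignore:
            # the error response is file-like; parse whatever came back
            return parse(e).getroot()
        raise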
def getAddress(root):
    res = {}
    for div in root.xpath('//div[@id="contextzone"]//div[@class="ep_title"]'):
        # getAddress(map(strip, div.xpath("../..//div[@class='ep_elementcontact']/ul")))
        key = unws("".join(div.xpath(".//text()")))
        if key not in ["Bruxelles", "Strasbourg", "Postal address", "Luxembourg"]:
            continue
        if key == "Bruxelles":
            key = u"Brussels"
        elif key == "Postal address":
            key = u"Postal"
        res[key] = {}
        if key in ["Brussels", "Strasbourg", "Luxembourg"]:
            tmp = div.xpath('../..//li[@class="ep_phone"]/div/text()')
            if tmp:
                res[key][u"Phone"] = unws(tmp[0]).replace("(0)", "")
            tmp = div.xpath('../..//li[@class="ep_fax"]/div/text()')
            if tmp:
                res[key][u"Fax"] = unws(tmp[0]).replace("(0)", "")
        tmp = [unws(x) for x in div.xpath('../..//li[@class="ep_address"]/div/text()') if len(unws(x))]
        if key == "Strasbourg":
            res[key][u"Address"] = dict(
                zip([u"Organization", u"Building", u"Office", u"Street", u"Zip1", u"Zip2"], tmp)
            )
            res[key][u"Address"]["City"] = res[key]["Address"]["Zip2"].split()[1]
            res[key][u"Address"]["Zip2"] = res[key]["Address"]["Zip2"].split()[0]
            res[key][u"Address"]["building_code"] = buildings[res[key]["Address"]["Building"]]
        elif key == "Brussels":
            res[key][u"Address"] = dict(zip([u"Organization", u"Building", u"Office", u"Street", u"Zip"], tmp))
            res[key][u"Address"]["City"] = res[key]["Address"]["Zip"].split()[1]
            res[key][u"Address"]["Zip"] = res[key]["Address"]["Zip"].split()[0]
            res[key][u"Address"]["building_code"] = buildings[res[key]["Address"]["Building"]]
        elif key == "Luxembourg":
            res[key][u"Address"] = tmp
        elif key == "Postal":
            res[key] = tmp
        else:
            logger.error("wtf %s" % key)
    return res
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }
    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape death date %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])

    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':   party,
                    u'country': country,
                    u'start':   datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':     datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld], key=lambda x: x.get('end',x['start']), reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']=[unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
    return data
def scrape(url):
    try:
        logger.info('scrape ' + url)
        tree = fetch(url)
        agents, committees = scrape_actors(tree)
        forecasts = lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0], forecastFields)
        events = scrape_events(tree)
        procedure = scrape_basic(tree)
        ipext = []
        for ipexd in (IPEXMAP[procedure['reference']] or {}).get('Dates', []):
            skip = False
            for event in forecasts + events:
                if event['type'] == ipexevents.get(ipexd['type'], {}).get('oeil', 'asdf') and event['date'] == ipexd['date']:
                    skip = True
                    break
            if skip:
                continue
            ipext.append(ipexd)
        allevents = agents + scrape_docs(tree) + events + forecasts + ipext
        other = [x for x in allevents if not x.get('date')]
        allevents = sorted([x for x in allevents if x.get('date')], key=itemgetter('date'))
        allevents = merge_events(allevents, committees)
        res = {
            u'meta': {
                'source': url,
                'id': int(url.split('id=')[1]),
                'timestamp': datetime.datetime.utcnow()
            },
            u'procedure': procedure,
            u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
            u'committees': committees,
            u'activities': sorted(allevents, key=itemgetter('date')),
            u'other': other,
        }
        # check for "final act"
        finalas = tree.xpath('//div[@id="final_act"]//a')
        final = {}
        for link in finalas:
            if link.get('class') == 'sumbutton':
                try:
                    summary = fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except:
                    continue
                final['text'] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not 'docs' in final:
                    final['docs'] = []
                final['docs'].append({'title': link.xpath('text()')[0].strip(),
                                      'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final'] = final.get('docs', [{}])[0]
            for item in res['activities']:
                if item.get('type') == u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text'] = final['text']
                    if len(final.get('docs')) > 1:
                        if not 'docs' in item:
                            item[u'docs'] = final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url, traceback.format_exc()))
        return
def scrape(url): try: logger.info("scrape " + url) tree = fetch(url) agents, committees = scrape_actors(tree) forecasts = lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0], forecastFields) events = scrape_events(tree) procedure = scrape_basic(tree) ipext = [] for ipexd in (IPEXMAP[procedure["reference"]] or {}).get("Dates", []): skip = False for event in forecasts + events: if ( event["type"] == ipexevents.get(ipexd["type"], {}).get("oeil", "asdf") and event["date"] == ipexd["date"] ): skip = True break if skip: continue ipext.append(ipexd) allevents = agents + scrape_docs(tree) + events + forecasts + ipext other = [x for x in allevents if not x.get("date")] allevents = sorted([x for x in allevents if x.get("date")], key=itemgetter("date")) allevents = merge_events(allevents, committees) res = { u"meta": {"source": url, "id": int(url.split("id=")[1]), "timestamp": datetime.datetime.utcnow()}, u"procedure": procedure, u"links": form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]), u"committees": committees, u"activities": sorted(allevents, key=itemgetter("date")), u"other": other, } # check for "final act" finalas = tree.xpath('//div[@id="final_act"]//a') final = {} for link in finalas: if link.get("class") == "sumbutton": try: summary = fetch("http://www.europarl.europa.eu%s" % link.get("href")) except: continue final["text"] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')] else: if not "docs" in final: final["docs"] = [] final["docs"].append({"title": link.xpath("text()")[0].strip(), "url": link.get("href")}) if final and final.get("docs"): res[u"procedure"][u"final"] = final.get("docs", [{}])[0] for item in res["activities"]: if item.get("type") == u"Final act published in Official Journal": if final.get("text"): item[u"text"] = final["text"] if len(final.get("docs")) > 1: if not "docs" in item: item[u"docs"] = final["docs"] else: item[u"docs"].extend(final["docs"]) break return res except: logger.error("%s\n%s" % (url, traceback.format_exc())) return
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url)
    data = {u'active': True, 'meta': {u'url': url}} # return {'active': False}
    mepdiv=root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL,mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = { u'date': datetime.strptime(unws(d), "Born on %d %B %Y"),
                           u'place': unws(p) }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const={u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0])}
    data[u'Constituencies']=[const]
    try:
        # note: the original line had a stray trailing comma here, which
        # silently wrapped the party name in a 1-tuple
        const[u'party']=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active']=False
    else:
        group=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        data[u'Groups'] = [{ u'role':    unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]),
                             u'group':   group,
                             u'groupid': group_map[group]}]
    cdiv=root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data,u'Homepage',[unicode(x.get('href'),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data,u'Mail',[decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title=unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            addif(data,title,[unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data,u'Addresses',getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key=unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower()=='curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item={u'role': key,
                      u'abbr': unws(''.join(span.xpath('text()'))),
                      u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data
def getMEPDeclarations(id): try: dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500]) except Exception, e: logger.error("mepdeclaration %s" % e) return []
def parseMember(userid): url = "http://www.europarl.europa.eu/meps/en/%s/get.html" % userid logger.info("scraping %s" % url) root = fetch(url, ignore=[500]) data = {u"active": False, "meta": {u"url": url}} # return {'active': False} mepdiv = root.xpath('//div[@class="ep_elementpeople2"]') if len(mepdiv) == 1: mepdiv = mepdiv[0] else: logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv))) data[u"Name"] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0])) data[u"Photo"] = unicode(urljoin(BASE_URL, mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get("src")), "utf8") borntxt = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()') if len(borntxt) > 0: (d, p) = borntxt[0].split(",", 1) try: data[u"Birth"] = {u"date": datetime.strptime(unws(d), u"Born on %d %B %Y"), u"place": unws(p)} except ValueError: logger.warn("[!] failed to scrape birth data %s" % url) logger.warn(traceback.format_exc()) else: logger.warn("[!] no birth data %s" % url) const = {u"country": unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0]), u"start": datetime(2009, 7, 14)} data[u"Constituencies"] = [const] try: data[u"party"] = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1]) except IndexError: pass else: group = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0]) try: role = unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]) except IndexError: role = u"Member" data[u"Groups"] = [{u"role": role, u"Organization": group, u"groupid": group_map[group]}] cdiv = root.xpath('//div[@class="ep_elementcontact"]') if len(cdiv): addif( data, u"RSS", [unicode(urljoin(BASE_URL, x.get("href")), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')], ) addif( data, u"Homepage", [unicode(x.get("href"), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')] ) addif( data, u"Mail", [decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))], ) for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'): title = unws("".join(span.xpath(".//text()"))) if title in ["Accredited assistants", "Local assistants"]: if not "assistants" in data: data["assistants"] = {} addif( data["assistants"], title.lower().split()[0], [unws(x) for x in span.xpath("../../..//li/div/text()")] ) addif(data, u"Addresses", getAddress(root)) for div in root.xpath('//div[@class="ep_content"]'): key = unws(u"".join(div.xpath('.//span[@class="ep_title"]/text()'))) if not len(key): continue elif key.lower() == "curriculum vitae": data[u"CV"] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')] elif key in ["Member", "Substitute", "Chair", "Vice-Chair", "Co-President", "President", "Vice-President"]: for span in div.xpath('.//span[@class="commission_label"]'): item = {u"role": key, u"abbr": unws("".join(span.xpath(".//text()"))), u"Organization": unws(span.tail)} for start, field in orgmaps: if item["abbr"] in COMMITTEE_MAP or item["Organization"].startswith(start): if not field in data: data[field] = [] if field == "Committees" and item["Organization"] in COMMITTEE_MAP: item[u"committee_id"] = COMMITTEE_MAP[item["Organization"]] data[field].append(item) break else: logger.error("[!] unknown field %s" % key) return data
def scrape(decl):
    mep_id = decl.split('/')[-1].split('_')[0]
    data = {'mep_id': mep_id, 'url': unicode(decl), 'date': ''}
    logger.info("findecl scraping %s" % mep_id)
    text = getraw(decl).split('\n')
    state = 0
    ptr = 0
    while ptr < len(text):
        # bg: "А Б В Г Д Е Ж З И"
        # el: "A B Γ Δ E ΣΤ Ζ H Θ"
        if (issectionhead(decl, text, ptr, state, 0, ('A', u'А', 'A')) or
            issectionhead(decl, text, ptr, state, 2, ('C', u'В', u'Γ')) or
            issectionhead(decl, text, ptr, state, 3, ('D', u'Г', u'Δ')) or
            issectionhead(decl, text, ptr, state, 4, ('E', u'Д', u'E')) or
            issectionhead(decl, text, ptr, state, 5, ('F', u'Е', u'ΣΤ'))):
            # skip to table
            while (text[ptr].split()[-4:] != ['1', '2', '3', '4']):
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] %s table not found' % state)
                    raise IndexError
            start = ptr
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] %s fail skip empty lines' % state)
                    raise IndexError
            while True:
                if ptr > len(text):
                    logger.error('[meh] fail past end of block %s' % state)
                    raise IndexError
                if (text[ptr].strip() == '' and
                    (text[ptr + 1] in ['1', ''] or text[ptr + 1].strip()[:3] == '1/6')):
                    break
                if text[ptr].startswith(' ' * 20) and (
                        text[ptr].strip()[1] == '/' and
                        text[ptr].strip()[0] in ['2', '3', '4']):
                    break
                ptr += 1
            end = ptr
            state += 1
            #print >> sys.stderr, text[start:end]
            if state == 6:
                t = parse_table_f(text[start:end])
            else:
                t = parse_table(text[start:end])
            data[state_map[state]] = t
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
        elif issectionhead(decl, text, ptr, state, 1, ('B', u'Б', u'B')):
            while len([x for x in text[ptr].split(' ' * 10) if x]) != 2:
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] table B not found')
                    raise IndexError
            start = ptr
            # skip empty lines
            while ptr < len(text) and not text[ptr].split():
                ptr += 1
            while True:
                if ptr > len(text):
                    logger.error('[meh] fail skip empty lines in B')
                    raise IndexError
                if [text[ptr].strip(), text[ptr + 1]] in (['', '1'], ['', '']):
                    break
                if text[ptr].startswith(' ' * 20) and (
                        text[ptr].strip()[1] == '/' and
                        text[ptr].strip()[0] in ['2', '3', '4']):
                    break
                ptr += 1
            end = ptr
            state += 1
            t = parse_table_b(text[start:end])
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
            data[state_map[state]] = t
        elif state == 6:
            while not issectionhead(decl, text, ptr, state, 6, ('G', u'Ж', u'Ζ')):
                ptr += 1
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] continuation in G fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail skip empty lines in G')
                    raise IndexError
            gstart = ptr
            state += 1
            while not issectionhead(decl, text, ptr, state, 7, ('H', u'З', u'H')):
                ptr += 1
            gend = ptr - 1
            if DEBUG:
                print "\t", text[gstart:gend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[gstart:gend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] continuation in H fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail skip empty lines in H')
                    raise IndexError
            hstart = ptr
            state += 1
            while not issectionhead(decl, text, ptr, state, 8, ('I', u'И', u'Θ')):
                ptr += 1
            hend = ptr - 1
            if DEBUG:
                print "\t", text[hstart:hend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[hstart:hend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] continuation in I fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail skip empty lines in I')
                    raise IndexError
            istart = ptr
            while True:
                tmp = text[ptr].split()
                if len(tmp) == 3:
                    data['date'] = tmp[1]
                    del tmp[1]
                    if tmp in iendsigs:
                        break
                elif len(tmp) == 5:
                    # date=tmp[2] could be preserved in data
                    del tmp[2]
                    if tmp in [['Date', ':', 'Signature', ':']]:
                        break
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail find end in I')
                    if DEBUG:
                        print 'meh\n>>>%s' % '\n>>>'.join(text[istart:istart + 14]).encode('utf8')
                    raise IndexError
            state += 1
            if DEBUG:
                print >> sys.stderr, state
            #print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
        #    print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr += 1
    if state != 9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        return data
def scrape(decl): mep_id = decl.split("/")[-1].split("_")[0] data = {"mep_id": mep_id, "url": unicode(decl), "date": ""} logger.info("findecl scraping %s" % mep_id) text = getraw(decl).split("\n") state = 0 ptr = 0 while ptr < len(text): # bg: "А Б В Г Д Е Ж З И" # el: "A B Γ Δ E ΣΤ Ζ H Θ" if ( issectionhead(decl, text, ptr, state, 0, ("A", u"А", "A")) or issectionhead(decl, text, ptr, state, 2, ("C", u"В", u"Γ")) or issectionhead(decl, text, ptr, state, 3, ("D", u"Г", u"Δ")) or issectionhead(decl, text, ptr, state, 4, ("E", u"Д", u"E")) or issectionhead(decl, text, ptr, state, 5, ("F", u"Е", u"ΣΤ")) ): # skip to table while text[ptr].split()[-4:] != ["1", "2", "3", "4"]: ptr += 1 if ptr >= len(text): logger.error("[meh] %s table not found" % state) raise IndexError start = ptr # skip empty lines while not text[ptr].split(): ptr += 1 if ptr >= len(text): logger.error("[meh] %s fail skip empty lines" % state) raise IndexError while True: if ptr > len(text): logger.error("[meh] fail past end of block %s" % state) raise IndexError if text[ptr].strip() == "" and (text[ptr + 1] in ["1", ""] or text[ptr + 1].strip()[:3] == "1/6"): break if text[ptr].startswith(" " * 20) and ( text[ptr].strip()[1] == "/" and text[ptr].strip()[0] in ["2", "3", "4"] ): break ptr += 1 end = ptr state += 1 # print >> sys.stderr, text[start:end] if state == 6: t = parse_table_f(text[start:end]) else: t = parse_table(text[start:end]) data[state_map[state]] = t if DEBUG: print "\t%s" % ("\n\t".join((repr(x) for x in t)) or "none"), state elif issectionhead(decl, text, ptr, state, 1, ("B", u"Б", u"B")): while len([x for x in text[ptr].split(" " * 10) if x]) != 2: ptr += 1 if ptr >= len(text): logger.error("[meh] table B not found") raise IndexError start = ptr # skip empty lines while ptr < len(text) and not text[ptr].split(): ptr += 1 while True: if ptr > len(text): logger.error("[meh] fail skip empty lines in B") raise IndexError if [text[ptr].strip(), text[ptr + 1]] in (["", "1"], ["", ""]): break if text[ptr].startswith(" " * 20) and ( text[ptr].strip()[1] == "/" and text[ptr].strip()[0] in ["2", "3", "4"] ): break ptr += 1 end = ptr state += 1 t = parse_table_b(text[start:end]) if DEBUG: print "\t%s" % ("\n\t".join((repr(x) for x in t)) or "none"), state data[state_map[state]] = t elif state == 6: while not issectionhead(decl, text, ptr, state, 6, ("G", u"Ж", u"Ζ")): ptr += 1 # skip continuation lines while text[ptr].split(): ptr += 1 if ptr >= len(text): logger.error("[meh] continuation in G fail") raise IndexError # skip empty lines while not text[ptr].split(): ptr += 1 if ptr >= len(text): logger.error("[meh] fail skip empty lines in G") raise IndexError gstart = ptr state += 1 while not issectionhead(decl, text, ptr, state, 7, ("H", u"З", u"H")): ptr += 1 gend = ptr - 1 if DEBUG: print "\t", text[gstart:gend], state data[state_map[state]] = "\n".join(x for x in map(unicode.strip, text[gstart:gend]) if x) # skip continuation lines while text[ptr].split(): ptr += 1 if ptr >= len(text): logger.error("[meh] continuation in H fail") raise IndexError # skip empty lines while not text[ptr].split(): ptr += 1 if ptr >= len(text): logger.error("[meh] fail skip empty lines in H") raise IndexError hstart = ptr state += 1 while not issectionhead(decl, text, ptr, state, 8, ("I", u"И", u"Θ")): ptr += 1 hend = ptr - 1 if DEBUG: print "\t", text[hstart:hend], state data[state_map[state]] = "\n".join(x for x in map(unicode.strip, text[hstart:hend]) if x) # skip continuation lines while text[ptr].split(): ptr += 1 if ptr >= 
len(text): logger.error("[meh] continuation in I fail") raise IndexError # skip empty lines while not text[ptr].split(): ptr += 1 if ptr >= len(text): logger.error("[meh] fail skip empty lines in I") raise IndexError istart = ptr while True: tmp = text[ptr].split() if len(tmp) == 3: data["date"] = tmp[1] del tmp[1] if tmp in iendsigs: break elif len(tmp) == 5: # date=tmp[2] could be preserved in data tmpdate = tmp[2] del tmp[2] if tmp in [["Date", ":", "Signature", ":"]]: data["date"] = tmpdate break ptr += 1 if ptr >= len(text): logger.error("[meh] fail find end in I") if DEBUG: print "meh\n>>>%s" % "\n>>>".join(text[istart : istart + 14]).encode("utf8") raise IndexError state += 1 if DEBUG: print >> sys.stderr, state # print >> sys.stderr, "\t", text[istart:ptr], state data[state_map[state]] = "\n".join(x for x in map(unicode.strip, text[istart:ptr]) if x) # else: # print >> sys.stderr, '>>>>>>>>', line.encode('utf8') ptr += 1 if state != 9: print >> sys.stderr, ">>>>>>>>", "wtfwtf", state logger.error("[wtf] did not reach final state %s" % state) return {} else: return data
"http://www.europarl.europa.eu/meps/en/28269/Jerzy_BUZEK.html" ), None) print jdump( scrape( "http://www.europarl.europa.eu/meps/en/1186/Astrid_LULLING.html" ), None) elif sys.argv[1] == 'url' and sys.argv[2]: print jdump(scrape(sys.argv[2])).encode('utf8') sys.exit(0) # handle opts if 'current' in args: newbies = getIncomming() meps = get_meps elif 'outgoing' in args: meps = getOutgoing elif 'new' in args: newbies = getIncomming() meps = get_new else: logger.error('Need either <current|outgoing|new>') sys.exit(0) logger.info('\n\tsaver: %s\n\tmeps: %s\n\tseq: %s' % (saver, meps, 'seq' in args)) if 'seq' in args: res = seqcrawl(meps, saver=saver, null=null) if 'dry' in args: print "[%s]" % ',\n'.join(res).encode('utf8') else: crawler(meps, saver=saver)
    sys.exit(0)
    print jdump(scrape("http://www.europarl.europa.eu/meps/en/1934/get.html"),None)
    print jdump(scrape("http://www.europarl.europa.eu/meps/en/28576/get.html"), None)
    print jdump(scrape("http://www.europarl.europa.eu/meps/en/1263/Elmar_BROK.html"), None)
    print jdump(scrape("http://www.europarl.europa.eu/meps/en/96739/Reinhard_B%C3%9CTIKOFER.html"), None)
    print jdump(scrape("http://www.europarl.europa.eu/meps/en/28269/Jerzy_BUZEK.html"), None)
    print jdump(scrape("http://www.europarl.europa.eu/meps/en/1186/Astrid_LULLING.html"), None)
elif sys.argv[1]=='url' and sys.argv[2]:
    print jdump(scrape(sys.argv[2])).encode('utf8')
    sys.exit(0)

# handle opts
if 'current' in args:
    newbies=getIncomming()
    meps=get_meps
elif 'outgoing' in args:
    meps=getOutgoing
elif 'new' in args:
    newbies=getIncomming()
    meps=get_new
else:
    logger.error('Need either <current|outgoing|new>')
    sys.exit(0)
logger.info('\n\tsaver: %s\n\tmeps: %s\n\tseq: %s' % (saver, meps, 'seq' in args))
if 'seq' in args:
    res=seqcrawl(meps,saver=saver, null=null)
    if 'dry' in args:
        print "[%s]" % ',\n'.join(res).encode('utf8')
else:
    crawler(meps,saver=saver)
def scrape(decl):
    mep_id = decl.split('/')[-1].split('_')[0]
    data = {'mep_id': mep_id, 'url': unicode(decl), 'date': ''}
    logger.info("findecl scraping %s" % mep_id)
    text=getraw(decl).split('\n')
    state=0
    ptr=0
    while ptr<len(text):
        # bg: "А Б В Г Д Е Ж З И"
        # el: "A B Γ Δ E ΣΤ Ζ H Θ"
        if (issectionhead(decl, text,ptr,state,0,('A',u'А','A')) or
            issectionhead(decl, text,ptr,state,2,('C',u'В',u'Γ')) or
            issectionhead(decl, text,ptr,state,3,('D',u'Г',u'Δ')) or
            issectionhead(decl, text,ptr,state,4,('E',u'Д',u'E')) or
            issectionhead(decl, text,ptr,state,5,('F',u'Е',u'ΣΤ'))):
            # skip to table
            while (text[ptr].split()[-4:]!=['1','2','3','4']):
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s table not found' % state)
                    raise IndexError
            start=ptr
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s fail skip empty lines' % state)
                    raise IndexError
            while True:
                if ptr>len(text):
                    logger.error('[meh] fail past end of block %s' % state)
                    raise IndexError
                if (text[ptr].strip()=='' and (text[ptr+1] in ['1',''] or text[ptr+1].strip()[:3] == '1/6')):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            #print >> sys.stderr, text[start:end]
            if state == 6:
                t = parse_table_f(text[start:end])
            else:
                t = parse_table(text[start:end])
            data[state_map[state]] = t
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
        elif issectionhead(decl, text,ptr,state,1,('B',u'Б', u'B')):
            while len([x for x in text[ptr].split(' ' * 10) if x]) != 2:
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] table B not found')
                    raise IndexError
            start=ptr
            # skip empty lines
            while ptr<len(text) and not text[ptr].split():
                ptr+=1
            while True:
                if ptr>len(text):
                    logger.error('[meh] fail skip empty lines in B')
                    raise IndexError
                if [text[ptr].strip(), text[ptr+1]] in (['','1'], ['','']):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            t = parse_table_b(text[start:end])
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
            data[state_map[state]] = t
        elif state==6:
            while not issectionhead(decl, text,ptr,state,6,('G',u'Ж',u'Ζ')):
                ptr+=1
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in G fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in G')
                    raise IndexError
            gstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,7,('H',u'З',u'H')):
                ptr+=1
            gend=ptr-1
            if DEBUG:
                print "\t", text[gstart:gend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[gstart:gend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in H fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in H')
                    raise IndexError
            hstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,8,('I',u'И',u'Θ')):
                ptr+=1
            hend=ptr-1
            if DEBUG:
                print "\t", text[hstart:hend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[hstart:hend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in I fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in I')
                    raise IndexError
            istart=ptr
            while True:
                tmp = text[ptr].split()
                if len(tmp)==3:
                    data['date']=tmp[1]
                    del tmp[1]
                    if tmp in iendsigs:
                        break
                elif len(tmp)==5:
                    # preserve the date before matching the signature line
                    tmpdate=tmp[2]
                    del tmp[2]
                    if tmp in [['Date', ':','Signature', ':']]:
                        data['date']=tmpdate
                        break
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail find end in I')
                    if DEBUG:
                        print 'meh\n>>>%s' % '\n>>>'.join(text[istart:istart+14]).encode('utf8')
                    raise IndexError
            state+=1
            if DEBUG:
                print >> sys.stderr, state
            #print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
        #    print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr+=1
    if state!=9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        # drop a trailing "no occupation" placeholder row (various languages)
        if (len(data['occupation'])>1 and
            data['occupation'][-1][0] in [
                u"No occupation held during the three years preceding the current mandate",
                u"Καμία επαγγελματική δραστηριότητα κατά τη διάρκεια των τριών ετών που προηγήθηκαν της τρέχουσας εντολής",
                u"Atividade Liberal como autor/outras atividades artísticas (remuneração inferior a 500 € na totalidade dos 3 anos anteriores)",
                u"Brak działalności zawodowej w okresie trzech lat poprzedzających obecną kadencję",
                u"Geen beroep uitgeoefend gedurende de drie jaar voorafgaand aan de huidige zittingsperiode",
                u"Nessuna attività svolta durante i tre anni precedenti l'attuale mandato",
                u"Keine Berufstätigkeit während des Dreijahreszeitraums vor der laufenden Wahlperiode",
                ]):
            del data['occupation'][-1]
        return data
def getMEPGender(id):
    try:
        mepraw=fetch("http://www.europarl.europa.eu/meps/fr/%s/_home.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
def scrape(url, comid):
    root=fetch(url)
    lines=[x for x in root.xpath('//td[@class="contents"]/div/*') if unws(' '.join(x.xpath('.//text()')))]
    if not len(lines):
        return
    if not unws(' '.join(lines[2].xpath('.//text()')))=='DRAFT AGENDA':
        logger.error("NOT DRAFT AGENDA %s" % unws(' '.join(lines[2].xpath('.//text()'))))
    agenda={u'committee': comid,
            u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
            u'src': url,
            }
    i=1
    if unws(' '.join(lines[3].xpath('.//text()')))=="INTERPARLIAMENTARY COMMITTEE MEETING":
        logger.warn("skipping interparl com meet")
        return
    if unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
                       u'type': unws(' '.join(lines[3].xpath('.//text()'))),
                       u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
                       u'city': unws(' '.join(lines[5].xpath('.//text()'))),
                       u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
                       })
        i=7
    itemcnt=0
    item={}
    schedule=None
    res=[]
    while i < len(lines):
        line=lines[i]
        i+=1
        txt=unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp=toTime(txt)
        if tmp:
            schedule=tmp
            if i<len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera']=True
                i+=1
            continue

        if line.tag=='div':
            item[u'actors']=getactors(line)
            continue
        firsttoken=txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1]=='.' and firsttoken[:-1].isdigit() and itemcnt+1==int(firsttoken[:-1]):
            if item: res.append(item)
            itemcnt+=1
            item=copy.deepcopy(agenda)
            item.update({u'title': ' '.join(txt.split()[1:]),
                         u'seq_no': itemcnt,})
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken==u"·":
            if not 'list' in item: item[u'list']=[]
            tmp=' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d.%m.%Y at %H.%M")
                    except:
                        logger.warn('[$] unknown tabling deadline format %s' % unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt)==12:
            item[u'comdossier']=txt
            continue
        # ***I 2011/0373(COD) COM(2011)0793 – C7-0454/2011
        tmp=getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        logger.debug("(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    return res
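# toTime() is defined elsewhere; here it must turn schedule lines like
# "20 December 2011, 16.00 - 16.30" into a dict that can be merged into
# an agenda item (and be falsy when the line is not a schedule). A sketch
# under those assumptions -- the field names are guesses:
import re
from datetime import datetime

def toTime(txt):
    m = re.match(ur'(\d+ \w+ \d{4}), (\d+)\.(\d+)(?:\s*[-\u2013]\s*(\d+)\.(\d+))?$', txt)
    if not m:
        return None
    day = datetime.strptime(m.group(1), '%d %B %Y')
    res = {u'date': day.replace(hour=int(m.group(2)), minute=int(m.group(3)))}
    if m.group(4):
        res[u'end'] = day.replace(hour=int(m.group(4)), minute=int(m.group(5)))
    return res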
def scrape(url):
    try:
        logger.info('scrape '+url)
        tree=fetch(url)
        agents,committees=scrape_actors(tree)
        forecasts=lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0],forecastFields)
        events=scrape_events(tree)
        procedure=scrape_basic(tree)
        if not procedure:
            return
        ipext=[]
        for ipexd in IPEXMAP.get(procedure['reference'], {}).get('Dates',[]):
            skip=False
            for event in forecasts+events:
                if event['type'] in ipexevents.get(ipexd['type'],{}).get('oeil',[]) and event['date']==ipexd['date']:
                    skip=True
                    break
            if skip:
                continue
            ipext.append(ipexd)
        allevents=agents+scrape_docs(tree)+events+forecasts+ipext
        other=[x for x in allevents if not x.get('date')]
        allevents=sorted([x for x in allevents if x.get('date')],key=itemgetter('date'))
        allevents=merge_events(allevents,committees, agents)
        res={u'meta': {'source': url,
                       'timestamp': datetime.datetime.utcnow() },
             u'procedure': procedure,
             u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
             u'committees': committees,
             u'activities': sorted(allevents, key=itemgetter('date')),
             u'other': other,
             }
        tmp=url.split('id=')
        if len(tmp)>1:
            res['meta']['id']=int(tmp[1])
        # check for "final act"
        finalas=tree.xpath('//div[@id="final_act"]//a')
        final={}
        for link in finalas:
            if link.get('class')=='sumbutton':
                try:
                    summary=fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except:
                    continue
                final['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not 'docs' in final:
                    final['docs']=[]
                final['docs'].append({'title': link.xpath('text()')[0].strip(),
                                      'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final']=final.get('docs',[{}])[0]
            for item in res['activities']:
                if item.get('type')==u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text']=final['text']
                    if len(final.get('docs'))>1:
                        if not 'docs' in item:
                            item[u'docs']=final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url,traceback.format_exc()))
        return
def scrape(decl):
    mep_id = decl.split('/')[-1].split('_')[0]
    data = {'mep_id': mep_id, 'url': unicode(decl), 'date': ''}
    logger.info("findecl scraping %s" % mep_id)
    text=getraw(decl).split('\n')
    state=0
    ptr=0
    while ptr<len(text):
        # bg: "А Б В Г Д Е Ж З И"
        # el: "A B Γ Δ E ΣΤ Ζ H Θ"
        if (issectionhead(decl, text,ptr,state,0,('A',u'А','A')) or
            issectionhead(decl, text,ptr,state,2,('C',u'В',u'Γ')) or
            issectionhead(decl, text,ptr,state,3,('D',u'Г',u'Δ')) or
            issectionhead(decl, text,ptr,state,4,('E',u'Д',u'E')) or
            issectionhead(decl, text,ptr,state,5,('F',u'Е',u'ΣΤ'))):
            # skip to table
            while (text[ptr].split()[-4:]!=['1','2','3','4']):
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s table not found' % state)
                    raise IndexError
            start=ptr
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s fail skip empty lines' % state)
                    raise IndexError
            while True:
                if ptr>len(text):
                    logger.error('[meh] fail past end of block %s' % state)
                    raise IndexError
                if (text[ptr].strip()=='' and (text[ptr+1] in ['1',''] or text[ptr+1].strip()[:3] == '1/6')):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            #print >> sys.stderr, text[start:end]
            if state == 6:
                t = parse_table_f(text[start:end])
            else:
                t = parse_table(text[start:end])
            data[state_map[state]] = t
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
        elif issectionhead(decl, text,ptr,state,1,('B',u'Б', u'B')):
            while len([x for x in text[ptr].split(' ' * 10) if x]) != 2:
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] table B not found')
                    raise IndexError
            start=ptr
            # skip empty lines
            while ptr<len(text) and not text[ptr].split():
                ptr+=1
            while True:
                if ptr>len(text):
                    logger.error('[meh] fail skip empty lines in B')
                    raise IndexError
                if [text[ptr].strip(), text[ptr+1]] in (['','1'], ['','']):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            t = parse_table_b(text[start:end])
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
            data[state_map[state]] = t
        elif state==6:
            while not issectionhead(decl, text,ptr,state,6,('G',u'Ж',u'Ζ')):
                ptr+=1
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in G fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in G')
                    raise IndexError
            gstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,7,('H',u'З',u'H')):
                ptr+=1
            gend=ptr-1
            if DEBUG:
                print "\t", text[gstart:gend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[gstart:gend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in H fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in H')
                    raise IndexError
            hstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,8,('I',u'И',u'Θ')):
                ptr+=1
            hend=ptr-1
            if DEBUG:
                print "\t", text[hstart:hend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[hstart:hend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in I fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in I')
                    raise IndexError
            istart=ptr
            while True:
                tmp = text[ptr].split()
                if len(tmp)==3:
                    data['date']=tmp[1]
                    del tmp[1]
                    if tmp in iendsigs:
                        break
                elif len(tmp)==5:
                    # preserve the date before matching the signature line
                    tmpdate=tmp[2]
                    del tmp[2]
                    if tmp in [['Date', ':','Signature', ':']]:
                        data['date']=tmpdate
                        break
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail find end in I')
                    if DEBUG:
                        print 'meh\n>>>%s' % '\n>>>'.join(text[istart:istart+14]).encode('utf8')
                    raise IndexError
            state+=1
            if DEBUG:
                print >> sys.stderr, state
            #print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
        #    print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr+=1
    if state!=9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        # drop a trailing "no occupation" placeholder row (various languages)
        if (len(data['occupation'])>1 and
            data['occupation'][-1][0] in [
                u"No occupation held during the three years preceding the current mandate",
                u"Καμία επαγγελματική δραστηριότητα κατά τη διάρκεια των τριών ετών που προηγήθηκαν της τρέχουσας εντολής",
                u"Atividade Liberal como autor/outras atividades artísticas (remuneração inferior a 500 € na totalidade dos 3 anos anteriores)",
                u"Brak działalności zawodowej w okresie trzech lat poprzedzających obecną kadencję",
                u"Geen beroep uitgeoefend gedurende de drie jaar voorafgaand aan de huidige zittingsperiode",
                u"Nessuna attività svolta durante i tre anni precedenti l'attuale mandato",
                u"Keine Berufstätigkeit während des Dreijahreszeitraums vor der laufenden Wahlperiode",
                u"Aucune activité professionnelle au cours des trois années ayant précédé le présent mandat",
                u"Sin ocupación durante los tres años anteriores al actual mandato",
                u"Intet erhvervsarbejde i de tre år forud for det nuværende mandate",
                u"Nicio activitate profesională în ultimii trei ani dinaintea preluării mandatului actual",
                u"Har inte utövat någon yrkesmässig verksamhet under de tre år som föregick det nuvarande mandatet",
                u"Sem atividade profissional durante os três anos que precederam o atual mandato",
                u"Nepostojanje profesionalne djelatnosti tijekom tri godine prije aktualnog mandata",
                u"Ei ammatillista toimintaa kolmena nykyistä edustajantointa edeltävänä vuotena",
                u"A jelenlegi megbízatást megelőző három évben nem végzett foglalkozást.",
                u"Без професионална дейност по време на трите години, предшестващи текущия мандат",
                u"Během tří let před současným mandátem jsem nevykonával(a) žádnou profesní činnost.",
                ]):
            del data['occupation'][-1]
        return data