def save(data, stats):
    res = db.ep_meps2.find_one({"UserID": data["UserID"]}) or {}
    d = diff(
        dict([(k, v) for k, v in res.items() if not k in ["_id", "meta", "changes"]]),
        dict([(k, v) for k, v in data.items() if not k in ["_id", "meta", "changes"]]),
    )
    if d:
        now = datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info(("adding %s" % (data["Name"]["full"])).encode("utf8"))
            data["meta"]["created"] = now
            if stats:
                stats[0] += 1
        else:
            logger.info(("updating %s" % (data["Name"]["full"])).encode("utf8"))
            logger.warn(jdump(d))
            data["meta"]["updated"] = now
            if stats:
                stats[1] += 1
            data["_id"] = res["_id"]
        data["changes"] = res.get("changes", {})
        data["changes"][now.isoformat()] = d
        db.ep_meps2.save(data)
    if stats:
        return stats
    else:
        return data
def getComAms(leg=7, update=False):
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    # todo add to searchRPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata="clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
            url=urltpl % (com)
            i=0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root=fetch(url, params=postdata)
            prev=[]
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                tmp={a.get('href'): ' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()')) if doctype != 'AMCO' else None
                     for a in root.xpath('//a[@title="open this PDF in a new window"]')
                     if (len(a.get('href',''))>13)}
                if not tmp or prev==tmp:
                    break
                prev=tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i+=1
                url=nexttpl % (com,i)
                root=fetch(url)
def getComAgendas():
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    postdata="docType=AGEN&leg=8&miType=text&tabActif=tabResult#sidesForm"
    #nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        root=fetch(url, params=postdata)
        prev=[]
        while True:
            logger.info("%s %s" % (datetime.now().isoformat(), url))
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp or prev==tmp:
                break
            prev=tmp
            for u,title in tmp:
                if title.startswith('DRAFT AGENDA'):
                    yield (u,com)
            i+=1
            url=nexttpl % (com,i)
            root=fetch(url)
def save(data, stats):
    res=db.ep_meps2.find_one({ 'UserID' : data['UserID'] }) or {}
    if 'Gender' not in data and 'Gender' in res:
        data['Gender']=res['Gender']
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes', 'activities',]]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes', 'activities',]]))
    if d:
        now=datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info(('adding %s' % (data['Name']['full'])).encode('utf8'))
            data['meta']['created']=now
            if stats: stats[0]+=1
        else:
            logger.info(('updating %s' % (data['Name']['full'])).encode('utf8'))
            logger.warn(jdump(d))
            data['meta']['updated']=now
            if stats: stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now.isoformat()]=d
        db.ep_meps2.save(data)
    del res
    if stats:
        del data
        return stats
    else:
        return data
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL,celexid))
    path.reverse()
    (code,lang)=celexid.split(":")[1:3]
    st=6
    if len(code)>6:
        if code[6].isalpha(): st=7
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:st],
                       u'refno': code[st:],
                       u'lang': lang,
                       u'chapter': path,
                       }}
    else:
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:6],
                       u'lang': lang,
                       u'chapter': path,
                       }}
    try:
        eurlex['id'][u'typeDesc']= CELEXCODES[code[0]]['Document Types'][code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except:
        eurlex['id'][u'typeDesc']= u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta']={u'src': "%s%s:NOT" % (EURLEXURL,celexid)}
    root = fetch("%s%s:NOT" % (EURLEXURL,celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]'))>0:
        logger.warn('[!] nothing to scrape here: %s', "%s%s:NOT" % (EURLEXURL,celexid))
        return
    eurlex[u'title'] = root.xpath('//h2[text()="Title and reference"]/following-sibling::p/text()')[0]
    # dates
    dates=root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest=unws(y).split(": ",1)
        item={u'type': title}
        date=rest[:10]
        tail=rest[10:]
        if tail.startswith('; '):
            tail=tail[2:]
        if date=='99/99/9999': item[u'date']= datetime(9999,12,31)
        elif date=='00/00/0000': item[u'date']= datetime(1,1,1)
        elif date=='//': continue
        else:
            try: item[u'date']= datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try: item[u'date']= datetime.strptime(date, u"%m/%d/%Y")
                except: pass
        if len(tail):
            item['note']=tail
        try:
            eurlex['dates'].append(item)
        except:
            eurlex['dates']=[item]
def crawl_allseq(saver=jdump):
    seen=[]
    stats=[0,0]
    for term in xrange(1,8):
        for url, name in get_meps(term=term):
            if not url in seen:
                saver(scrape(url),stats)
                seen.append(url)
    logger.info('end of crawl')
def get_all(term=""): for term in xrange(1, current_term + 1): for url, name in get_meps(term=term): mep = db.ep_meps2.find_one({"Name.full": name}) if not mep: yield (urljoin(urljoin(BASE_URL, url), "get.html"), {}) else: mep["terms"] = list(set(mep.get("terms", []) + [term])) db.ep_meps2.save(mep) logger.info("updated %s" % name)
def seqcrawler(saver=jdump):
    stats=[0,0]
    for u, com in getComAgendas():
        try:
            saver(scrape(u,com), stats)
        except:
            # ignore failed scrapes
            logger.warn("[!] failed to scrape: %s" % u)
            logger.warn(traceback.format_exc())
    logger.info("[o] added/updated: %s/%s" % (stats[0],stats[1]))
def crawl_all(saver=jdump,threads=4):
    m=Multiplexer(scrape,saver,threads=threads)
    m.start()
    seen=[]
    for term in xrange(1,8):
        for url, name in get_meps(term=term):
            if not url in seen:
                m.addjob(url)
                seen.append(url)
    m.finish()
    logger.info('end of crawl')
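# The Multiplexer used by the threaded crawlers is not included in these
# snippets. The sketch below is only an assumption of its shape -- a small
# queue-fed thread pool whose workers run the scraper and hand each result to
# the saver callback -- not the project's actual implementation.
import threading
try:
    from queue import Queue          # Python 3
except ImportError:
    from Queue import Queue          # Python 2

class Multiplexer(object):
    def __init__(self, worker, writer, threads=4):
        self.worker = worker         # e.g. scrape
        self.writer = writer         # e.g. save or jdump
        self.q = Queue()
        self.pool = [threading.Thread(target=self._run) for _ in range(threads)]

    def start(self):
        for t in self.pool:
            t.daemon = True
            t.start()

    def addjob(self, *args):
        self.q.put(args)

    def _run(self):
        while True:
            args = self.q.get()
            try:
                self.writer(self.worker(*args), [0, 0])
            except Exception:
                pass                 # a failed job should not kill the worker thread
            finally:
                self.q.task_done()

    def finish(self):
        self.q.join()                # block until every queued job has been processed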
def fetch(url, **kwargs):
    timer=8
    while True:
        root=_fetch(url, **kwargs)
        fail=root.xpath('//h1[text()="The system could not serve your request in time because of a temporary problem; please try again shortly."]')
        if not len(fail):
            timer=8
            break
        logger.info('[i] getting "pls wait" msg, sleeping for %ss' % timer)
        time.sleep(timer)
        timer=timer*2
    return root
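# fetch() retries around an undefined _fetch() helper. Judging from its use
# (keyword arguments such as params/ignore, XPath calls on the result), _fetch
# presumably downloads a page and parses it into an lxml tree. A minimal sketch
# under that assumption -- not the project's actual helper:
import requests
from lxml import html

def _fetch(url, params=None, ignore=(), **kwargs):
    # POST when form data is given (the committee searches above post a query string)
    resp = requests.post(url, data=params) if params else requests.get(url)
    if resp.status_code not in ignore:   # e.g. ignore=[500] tolerates server errors
        resp.raise_for_status()
    return html.fromstring(resp.content)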
def sources(url, path):
    root=fetch(url)
    regexpNS = "http://exslt.org/regular-expressions"
    if len(path):
        logger.info("[i] crawler: %s" % ' '.join(path[-1]))
    for doc in root.xpath("//a[re:test(@href, 'LexUriServ[.]do[?]uri=[0-9A-Z:]*:NOT', 'i')]",
                          namespaces={'re':regexpNS}):
        yield (doc.get('href').split('uri=')[1][:-4], path)
    for c in root.xpath("//div[@id='content']//a[re:test(@href, 'chap[0-9]*.htm', 'i')]",
                        namespaces={'re':regexpNS}):
        for res in sources("%s/%s" % (crawlroot,c.get('href')), path+[tuple(c.text.split(' ',1))]):
            yield res
def save(data, stats):
    src = data['meta']['source']
    res = db.dossiers2.find_one({'meta.source': src}) or {}
    d = diff(
        dict([(k, v) for k, v in res.items() if not k in ['_id', 'meta', 'changes']]),
        dict([(k, v) for k, v in data.items() if not k in ['_id', 'meta', 'changes',]]))
    #logger.warn(d)
    if d:
        now = datetime.datetime.utcnow().replace(microsecond=0).isoformat()
        if not res:
            logger.info(('adding %s - %s' % (data['procedure']['reference'],
                                             data['procedure']['title'])).encode('utf8'))
            data['meta']['created'] = data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[0] += 1
        else:
            logger.info(('updating %s - %s' % (data['procedure']['reference'],
                                               data['procedure']['title'])).encode('utf8'))
            data['meta']['updated'] = data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[1] += 1
            data['_id'] = res['_id']
        #print >> sys.stderr, (d)
        m = db.notifications.find({'dossiers': data['procedure']['reference']}, ['active_emails'])
        for g in m:
            if len(g['active_emails']) == 0:
                continue
            msg = Message("[PT] %s %s" % (data['procedure']['reference'],
                                          data['procedure']['title']),
                          sender="*****@*****.**",
                          bcc=g['active_emails'])
            msg.html = htmldiff(data, d)
            msg.body = makemsg(data, d)
            mail.send(msg)
        data['changes'] = res.get('changes', {})
        data['changes'][now] = d
        db.dossiers2.save(data)
    return stats
def save(data, stats): src = data["meta"]["source"] res = db.dossiers2.find_one({"meta.source": src}) or {} d = diff( dict([(k, v) for k, v in res.items() if not k in ["_id", "meta", "changes"]]), dict([(k, v) for k, v in data.items() if not k in ["_id", "meta", "changes"]]), ) # logger.warn(d) if d: now = datetime.datetime.utcnow().replace(microsecond=0).isoformat() if not res: logger.info( ("adding %s - %s" % (data["procedure"]["reference"], data["procedure"]["title"])).encode("utf8") ) data["meta"]["created"] = data["meta"]["timestamp"] del data["meta"]["timestamp"] sys.stdout.flush() stats[0] += 1 else: logger.info( ("updating %s - %s" % (data["procedure"]["reference"], data["procedure"]["title"])).encode("utf8") ) data["meta"]["updated"] = data["meta"]["timestamp"] del data["meta"]["timestamp"] sys.stdout.flush() stats[1] += 1 data["_id"] = res["_id"] # print >> sys.stderr, (d) m = db.notifications.find({"dossiers": data["procedure"]["reference"]}, ["active_emails"]) for g in m: if len(g["active_emails"]) == 0: continue msg = Message( "[PT] %s %s" % (data["procedure"]["reference"], data["procedure"]["title"]), sender="*****@*****.**", bcc=g["active_emails"], ) msg.html = htmldiff(data, d) msg.body = makemsg(data, d) mail.send(msg) data["changes"] = res.get("changes", {}) data["changes"][now] = d db.dossiers2.save(data) return stats
def getComAgendas(): urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text" nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s" for com in (k for k in COMMITTEE_MAP.keys() if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']): url=urltpl % (com) i=0 agendas=[] logger.info('scraping %s' % com) while True: logger.info("crawling %s" % (url)) root=fetch(url) tmp=[(a.get('href'), unws(a.xpath('text()')[0])) for a in root.xpath('//p[@class="title"]/a') if len(a.get('href',''))>13] if not tmp: break for u,_ in tmp: yield (u,com) i+=10 url=nexttpl % (com,i)
def save(data, stats):
    res=db.ep_meps2.find_one({ 'UserID' : data['UserID'] }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes',]]))
    if d:
        now=unicode(datetime.utcnow().replace(microsecond=0).isoformat())
        if not res:
            logger.info(('adding %s' % (data['Name']['full'])).encode('utf8'))
            data['meta']['created']=now
            stats[0]+=1
        else:
            logger.info(('updating %s' % (data['Name']['full'])).encode('utf8'))
            logger.warn(d)
            data['meta']['updated']=now
            stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.ep_meps2.save(data)
    return stats
def html(self):
    res=[css]
    res.append("<p class='statut'>%s</p>" % self.statut)
    res.append("<p class='type'>%s</p>" % self.type)
    res.append("<h2><span id='reference'>%s</span> <span id='title'>%s</span></h2>" % (self.ref, self.title))
    res.append("<p class='institutions'>%s</p>" % self.institutions)
    res.extend(["<div class='explanation %s'>%s</div>" % (p.type,'\n'.join(p.html())) for p in self.explanation])
    res.extend(["<p class='preamble'>%s</p>" % '\n'.join(p.html()) for p in self.preamble])
    tmp=[u"<li class='recital' id='recital_%s'>%s</li>" % (i+1,r) for i, r in enumerate(self.recitals)]
    res.append("<ol class='number '>%s</ol>" % '\n'.join(tmp))
    res.append("<p class='adoption'>%s</p>" % self.adoption)
    for i, c in enumerate(self.chaps):
        res.append("<h3 class='chapter' id='chapter%s'>%s</h3>" % (i+1, c.title))
        res.append("<ol>")
        for x in c.nodes:
            res.extend(x.html())
        res.append("</ol>")
    logger.info(self.annexes)
    res.extend(["<h3>%s</h3><div class='annex'>%s</div>" % (p['title'], '\n'.join(['\n'.join(x.html()) for x in p['content']]))
                for p in self.annexes])
    res.append("<hr /><h3>Footnotes</h3>")
    res.extend([u"<p class='footnote'><a name='footnote%s'>[%s] %s</a></p>" % (i+1, i+1 ,r[1])
                for i, r in enumerate(self.footnotes)])
    return u'\n'.join(res).encode('utf8')
def save(data, stats):
    for item in data:
        if not 'committee' in item: continue
        query={'committee': item['committee'], 'src': item['src'], 'title': item['title']}
        if 'date' in item:
            query['date']= item['date']
            if 'end' in item:
                query['end']= item['end']
        else:
            query['seq_no']=item['seq_no']
        res=db.ep_comagendas.find_one(query) or {}
        d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
               dict([(k,v) for k,v in item.items() if not k in ['_id', 'meta', 'changes',]]))
        if d:
            now=datetime.utcnow().replace(microsecond=0)
            if not 'meta' in item: item[u'meta']={}
            if not res:
                logger.info((u'adding %s %s' % (item['committee'], item['title'])).encode('utf8'))
                item['meta']['created']=now
                if stats: stats[0]+=1
            else:
                logger.info((u'updating %s %s' % (item['committee'], item['title'])).encode('utf8'))
                logger.info(d)
                item['meta']['updated']=now
                if stats: stats[1]+=1
                item['_id']=res['_id']
            item['changes']=res.get('changes',{})
            item['changes'][now.isoformat()]=d
            db.ep_comagendas.save(item)
    if stats: return stats
    else: return data
def save(data, stats):
    if not data: return stats
    res=db.eurlex.find_one({ 'id.celexid' : data['id']['celexid'] }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes',]]))
    if d:
        now=unicode(datetime.utcnow().replace(microsecond=0).isoformat())
        if not res:
            logger.info(('adding %s' % (data['id']['celexid'])).encode('utf8'))
            data['meta']['created']=now
            if stats: stats[0]+=1
        else:
            logger.info(('updating %s' % (data['id']['celexid'])).encode('utf8'))
            logger.warn(d)
            data['meta']['updated']=now
            if stats: stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.eurlex.save(data)
    if stats: return stats
    else: return data
def getComAms(leg=TERM, update=False):
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    # todo add to searchRPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata = "clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
            url = urltpl % (com)
            i = 0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root = fetch(url, params=postdata)
            prev = []
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                tmp = {a.get('href'): ' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()')) if doctype != 'AMCO' else None
                       for a in root.xpath('//a[@title="open this PDF in a new window"]')
                       if (len(a.get('href', '')) > 13)}
                if not tmp or prev == tmp:
                    break
                prev = tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i += 1
                url = nexttpl % (com, i)
                root = fetch(url)
def save(data, stats):
    for item in data:
        if not 'committee' in item: continue
        query = {'committee': item['committee'],
                 'src': item['src'],
                 'title': item['title']}
        if 'date' in item:
            query['date'] = item['date']
            if 'end' in item:
                query['end'] = item['end']
        else:
            query['seq_no'] = item['seq_no']
        res = db.ep_comagendas.find_one(query) or {}
        d = diff(dict([(k, v) for k, v in res.items() if not k in ['_id', 'meta', 'changes']]),
                 dict([(k, v) for k, v in item.items() if not k in ['_id', 'meta', 'changes',]]))
        if d:
            now = datetime.utcnow().replace(microsecond=0)
            if not 'meta' in item: item[u'meta'] = {}
            if not res:
                logger.info((u'adding %s%s %s' % (u'%s ' % item['epdoc'] if 'epdoc' in item else '',
                                                  item['committee'],
                                                  item['title'])).encode('utf8'))
                item['meta']['created'] = now
                if stats: stats[0] += 1
                notify(item, None)
            else:
                logger.info((u'updating %s%s %s' % (u'%s ' % item['epdoc'] if 'epdoc' in item else '',
                                                    item['committee'],
                                                    item['title'])).encode('utf8'))
                logger.info(d)
                item['meta']['updated'] = now
                if stats: stats[1] += 1
                item['_id'] = res['_id']
                notify(item, d)
            item['changes'] = res.get('changes', {})
            item['changes'][now.isoformat()] = d
            db.ep_comagendas.save(item)
    if stats: return stats
    else: return data
def crawler(saver=jdump, update=False):
    stats = [0, 0]
    for pdf, rapporteur in getComAms(update=update):
        logger.info(datetime.now().isoformat() + " " + pdf)
        ctr = [0, 0]
        try:
            saver(scrape(pdf, rapporteur), ctr)
        except:
            # ignore failed scrapes
            logger.warn("[!] %s failed to scrape: %s" % (datetime.now().isoformat(), pdf))
            # logger.warn(traceback.format_exc())
            raise
        logger.info("%s [i] added/updated: %s/%s" % (datetime.now().isoformat(), ctr[0], ctr[1]))
        stats[0] += ctr[0]
        stats[1] += ctr[1]
    logger.info("%s [o] total added/updated: %s/%s" % (datetime.now().isoformat(), stats[0], stats[1]))
def save(data, stats):
    if not data: return stats
    src=data['meta']['source']
    res=db.dossiers2.find_one({ 'meta.source' : src }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes',]]))
    #logger.warn(pprint.pformat(d))
    if d:
        now=datetime.datetime.utcnow().replace(microsecond=0).isoformat()
        if not res:
            logger.info(('adding %s - %s' % (data['procedure']['reference'],data['procedure']['title'])).encode('utf8'))
            data['meta']['created']=data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[0]+=1
        else:
            logger.info(('updating %s - %s' % (data['procedure']['reference'],data['procedure']['title'])).encode('utf8'))
            data['meta']['updated']=data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[1]+=1
            data['_id']=res['_id']
        logger.info(jdump(d))
        if not NOMAIL:
            m=db.notifications.find({'dossiers': data['procedure']['reference']},['active_emails'])
            for g in m:
                if len(g['active_emails'])==0:
                    continue
                msg = Message("[PT] %s %s" % (data['procedure']['reference'],data['procedure']['title']),
                              sender = "*****@*****.**",
                              bcc = g['active_emails'])
                #msg.html = htmldiff(data,d)
                msg.body = makemsg(data,d)
                mail.send(msg)
        #logger.info(htmldiff(data,d))
        #logger.info(makemsg(data,d))
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.dossiers2.save(data)
    return stats
def crawlseq(urls):
    [save(scrape(url), [0, 0]) for url, title in urls]
    logger.info('end of crawl')
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url)
    data = {u'active': True, 'meta': {u'url': url}} # return {'active': False}
    mepdiv=root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL,mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = { u'date': datetime.strptime(unws(d), "Born on %d %B %Y"),
                           u'place': unws(p) }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const={u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0])}
    data[u'Constituencies']=[const]
    try:
        const[u'party']=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active']=False
    else:
        group=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        data[u'Groups'] = [{ u'role': unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]),
                             u'group': group,
                             u'groupid': group_map[group]}]
    cdiv=root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data,u'Homepage',[unicode(x.get('href'),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data,u'Mail',[decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title=unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            addif(data,title,[unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data,u'Addresses',getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key=unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower()=='curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item={u'role': key,
                      u'abbr': unws(''.join(span.xpath('text()'))),
                      u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data
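# unws() and addif() are small helpers used throughout these scrapers but not
# shown in the snippets. Their presumed behaviour (an assumption, for
# illustration only): unws collapses runs of whitespace, addif stores a value
# only when it is non-empty.
def unws(txt):
    return u' '.join(txt.split())

def addif(d, key, values):
    if values:
        d[key] = values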
def scrape(url):
    try:
        logger.info('scrape ' + url)
        tree = fetch(url)
        agents, committees = scrape_actors(tree)
        forecasts = lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0], forecastFields)
        events = scrape_events(tree)
        procedure = scrape_basic(tree)
        ipext = []
        for ipexd in (IPEXMAP[procedure['reference']] or {}).get('Dates', []):
            skip = False
            for event in forecasts + events:
                if event['type'] == ipexevents.get(ipexd['type'], {}).get('oeil', 'asdf') and event['date'] == ipexd['date']:
                    skip = True
                    break
            if skip:
                continue
            ipext.append(ipexd)
        allevents = agents + scrape_docs(tree) + events + forecasts + ipext
        other = [x for x in allevents if not x.get('date')]
        allevents = sorted([x for x in allevents if x.get('date')], key=itemgetter('date'))
        allevents = merge_events(allevents, committees)
        res = {
            u'meta': {'source': url,
                      'id': int(url.split('id=')[1]),
                      'timestamp': datetime.datetime.utcnow()},
            u'procedure': procedure,
            u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
            u'committees': committees,
            u'activities': sorted(allevents, key=itemgetter('date')),
            u'other': other,
        }
        # check for "final act"
        finalas = tree.xpath('//div[@id="final_act"]//a')
        final = {}
        for link in finalas:
            if link.get('class') == 'sumbutton':
                try:
                    summary = fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except:
                    continue
                final['text'] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not 'docs' in final:
                    final['docs'] = []
                final['docs'].append({'title': link.xpath('text()')[0].strip(),
                                      'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final'] = final.get('docs', [{}])[0]
            for item in res['activities']:
                if item.get('type') == u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text'] = final['text']
                    if len(final.get('docs')) > 1:
                        if not 'docs' in item:
                            item[u'docs'] = final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url, traceback.format_exc()))
        return
def crawl(urls, threads=4):
    m = Multiplexer(scrape, save, threads=threads)
    m.start()
    [m.addjob(url) for url, title in urls]
    m.finish()
    logger.info('end of crawl')
def scrape(url, rapporteur=None):
    if (url in ['http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-483.680%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.387%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-456.679%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-494.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.705%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.767%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.385%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-465.012%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-496.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.724%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.721%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.723%2b03%2bDOC%2bPDF%2bV0%2f%2fEN']
            or not url.endswith('EN')):
        logger.info("skipping unparsable url")
        return []
    prolog=True
    res=[]
    block=None
    reference=None
    date=None
    committee=[]
    text=getraw(url).split('\n')
    for line in text:
        if prolog:
            if amstart.match(line):
                if reference==None:
                    logger.warn("%s [!] couldn't find ref: %s" %
                                (datetime.now().isoformat(),
                                 unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    db.ep_ams.save({'src': url, 'error': "couldn't find reference in source pdf"})
                    return []
                if date==None or committee==[]:
                    return []
                    #raise ValueError
                block=[line]
                prolog=False
                continue
            line=unws(line)
            if not line: continue
            if line in COMMITTEE_MAP:
                committee.append(COMMITTEE_MAP[line])
                continue
            if (committee and not reference and re.match(refre, line)):
                reference=line
                if url == 'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN':
                    logger.info("adjusting reference to eudatap")
                    reference="2012/0011(COD)"
                continue
            if (reference and not date):
                try:
                    date = parse(unws(line), dayfirst=True)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue
        if amstart.match(line):
            # parse block
            res.append(parse_block(block, url, reference, date, committee, rapporteur))
            block=[line]
            continue
        block.append(line)
    if block and filter(None,block):
        res.append(parse_block(block, url, reference, date, committee, rapporteur))
    return res
def scrape(decl):
    mep_id = decl.split('/')[-1].split('_')[0]
    data = {'mep_id': mep_id, 'url': unicode(decl), 'date': ''}
    logger.info("findecl scraping %s" % mep_id)
    text=getraw(decl).split('\n')
    state=0
    ptr=0
    while ptr<len(text):
        # bg: "А Б В Г Д Е Ж З И"
        # el: "A B Γ Δ E ΣΤ Ζ H Θ"
        if (issectionhead(decl, text,ptr,state,0,('A',u'А','A')) or
            issectionhead(decl, text,ptr,state,2,('C',u'В',u'Γ')) or
            issectionhead(decl, text,ptr,state,3,('D',u'Г',u'Δ')) or
            issectionhead(decl, text,ptr,state,4,('E',u'Д',u'E')) or
            issectionhead(decl, text,ptr,state,5,('F',u'Е',u'ΣΤ'))):
            # skip to table
            while (text[ptr].split()[-4:]!=['1','2','3','4']):
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s table not found' % state)
                    raise IndexError
            start=ptr
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s fail skip empty lines' % state)
                    raise IndexError
            while True:
                if ptr>len(text):
                    logger.error('[meh] fail past end of block %s' % state)
                    raise IndexError
                if (text[ptr].strip()=='' and
                    (text[ptr+1] in ['1',''] or text[ptr+1].strip()[:3] == '1/6')):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            #print >> sys.stderr, text[start:end]
            if state == 6:
                t = parse_table_f(text[start:end])
            else:
                t = parse_table(text[start:end])
            data[state_map[state]] = t
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
        elif issectionhead(decl, text,ptr,state,1,('B',u'Б', u'B')):
            while len([x for x in text[ptr].split(' ' * 10) if x]) != 2:
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] table B not found')
                    raise IndexError
            start=ptr
            # skip empty lines
            while ptr<len(text) and not text[ptr].split():
                ptr+=1
            while True:
                if ptr>len(text):
                    logger.error('[meh] fail skip empty lines in B')
                    raise IndexError
                if [text[ptr].strip(), text[ptr+1]] in (['','1'], ['','']):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            t = parse_table_b(text[start:end])
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
            data[state_map[state]] = t
        elif state==6:
            while not issectionhead(decl, text,ptr,state,6,('G',u'Ж',u'Ζ')):
                ptr+=1
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in G fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in G')
                    raise IndexError
            gstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,7,('H',u'З',u'H')):
                ptr+=1
            gend=ptr-1
            if DEBUG:
                print "\t", text[gstart:gend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[gstart:gend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in H fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in H')
                    raise IndexError
            hstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,8,('I',u'И',u'Θ')):
                ptr+=1
            hend=ptr-1
            if DEBUG:
                print "\t", text[hstart:hend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[hstart:hend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in I fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in I')
                    raise IndexError
            istart=ptr
            while True:
                tmp = text[ptr].split()
                if len(tmp)==3:
                    data['date']=tmp[1]
                    del tmp[1]
                    if tmp in iendsigs:
                        break
                elif len(tmp)==5:
                    # date=tmp[2] could be preserved in data
                    tmpdate=tmp[2]
                    del tmp[2]
                    if tmp in [['Date', ':','Signature', ':']]:
                        data['date']=tmpdate
                        break
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail find end in I')
                    if DEBUG:
                        print 'meh\n>>>%s' % '\n>>>'.join(text[istart:istart+14]).encode('utf8')
                    raise IndexError
            state+=1
            if DEBUG:
                print >> sys.stderr, state
            #print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
        #    print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr+=1
    if state!=9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        if (len(data['occupation'])>1 and
            data['occupation'][-1][0] in [u"No occupation held during the three years preceding the current mandate",
                                          u"Καμία επαγγελματική δραστηριότητα κατά τη διάρκεια των τριών ετών που προηγήθηκαν της τρέχουσας εντολής",
                                          u"Atividade Liberal como autor/outras atividades artísticas (remuneração inferior a 500 € na totalidade dos 3 anos anteriores)",
                                          u"Brak działalności zawodowej w okresie trzech lat poprzedzających obecną kadencję",
                                          u"Geen beroep uitgeoefend gedurende de drie jaar voorafgaand aan de huidige zittingsperiode",
                                          u"Nessuna attività svolta durante i tre anni precedenti l'attuale mandato",
                                          u"Keine Berufstätigkeit während des Dreijahreszeitraums vor der laufenden Wahlperiode",
                                          ]):
            del data['occupation'][-1]
        return data
def crawler(saver=jdump,threads=4):
    m=Multiplexer(scrape,saver,threads=threads)
    m.start()
    [m.addjob(url, data) for url, data in getComAgendas()]
    m.finish()
    logger.info('end of crawl')
def parse_block(block, url, reference, date, committee, rapporteur):
    am={u'src': url, u'reference': reference, u'date': date, u'committee': committee}
    #logger.info(block)
    # get title
    try:
        am[u'seq']=int(unws(block[0]).split()[1])
    except ValueError:
        am[u'seq']=unws(block[0]).split()[1]
    except IndexError:
        logger.warn("%s wrong seq %s" % (datetime.now().isoformat(), block[0]))
        am[u'seq']=unws(block[0])
    del block[0]
    strip(block)
    # find and strip justification
    i=len(block)-1
    while i>2 and not (unws(block[i])=="Justification" and block[i].startswith(' ' * 6)):
        i-=1
    if i>2:
        if i<len(block)-1 and (not unws(block[i+1]) or not block[i+1].startswith(' ') ):
            am['justification']='\n'.join(block[i+2:])
            del block[i:]
            strip(block)
        else:
            logger.warn("%s wrong justification\n%s" % (datetime.now().isoformat(), '\n'.join(block[i:])))
    # get original language
    if 4<len(unws(block[-1]))<=6 and unws(block[-1]).startswith('Or.'):
        am['orig_lang']=unws(block[-1])[4:]
        del block[-1]
        strip(block)
    # find split column new/old heading
    i=len(block)-1
    while (i>2 and
           not ((block[i].endswith(" Amendment") or
                 block[i].endswith(" PARTICULARS") or
                 block[i].endswith(" Remedy") or
                 block[i].endswith(" Amended text") or
                 block[i].endswith(" Amendement") or
                 block[i].endswith(" Amendments by Parliament") or
                 block[i].endswith(" Proposal for rejection") or
                 block[i].endswith(" Proposal for a rejection") or
                 block[i].endswith(" Does not affect English version") or
                 block[i].endswith(" (Does not affect English version)") or
                 block[i].endswith(" Amendment by Parliament")) and
                len(block[i])>33) and
           not (unws(block[i])=='Text proposed by the Commission' or unws(block[i]) in types)):
        i-=1
    if i>2:
        #if block[i].endswith(" Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq=False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceeding original text
            j=i
            while (j>2 and not (unws(block[j]) in types or unws(block[j])=='Text proposed by the Commission')):
                j-=1
            if j>2: i=j
            seq=True; key='old'
        elif unws(block[i])=='Text proposed by the Commission' or block[i].strip() in types:
            seq=True; key='old'
        # throw headers
        del block[i]
        while i<len(block) and not unws(block[i]): del block[i] # skip blank lines
        mid=max([len(x) for x in block])/2
        while i<len(block):
            if seq:
                if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key='new'
                    del block[i]
                    continue
                try: am[key].append(block[i])
                except KeyError: am[key]=[block[i]]
                del block[i]
                continue
            # only new, old is empty
            if block[i].startswith(' '):
                try: am['new'].append(unws(block[i]))
                except KeyError: am['new']=[unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind(' ')
            # only old, new is empty
            if newstart < 6:
                try: am['old'].append(unws(block[i]))
                except KeyError: am['old']=[unws(block[i])]
                del block[i]
                continue
            #mid=len(block[i])/2
            #mid=40
            lsep=block[i].rfind(' ', 0, mid)
            # todo calculate both, and use the one closer to the center
            rsep=block[i].find(' ', mid)
            sep=None
            if abs(lsep-mid)<abs(rsep-mid):
                if abs(lsep-mid)<15:
                    sep=lsep
            else:
                if abs(rsep-mid)<15:
                    sep=rsep
            if sep:
                try: am['old'].append(unws(block[i][:sep]))
                except KeyError: am['old']=[unws(block[i][:sep])]
                try: am['new'].append(unws(block[i][sep:]))
                except KeyError: am['new']=[unws(block[i][sep:])]
            else:
                # no sane split found
                #logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try: am['old'].append(unws(block[i][:newstart]))
                except KeyError: am['old']=[unws(block[i][:newstart])]
                try: am['new'].append(unws(block[i][newstart:]))
                except KeyError: am['new']=[unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        logger.warn("%s no table\n%s" % (datetime.now().isoformat(), '\n'.join(block[i:])))
        am['content']=block[i:]
        return am
    i=0
    # find end of authors
    while (i<len(block) and
           unws(block[i]) and
           not unws(block[i]).lower().startswith('compromise') and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts):
        i+=1
    if i<len(block):
        if i>0:
            names=' '.join(block[:i])
            am['authors']=names
            #logger.info("names \n%s" % names)
            # convert to pt mep _ids
            for text in filter(None,splitNames(names)):
                mep=getMep(text,None,False)
                if mep:
                    try: am['meps'].append(mep['UserID'])
                    except KeyError: am['meps']=[mep['UserID']]
                else:
                    logger.info("fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am['authors']=rapporteur
            for text in filter(None,splitNames(rapporteur)):
                mep=getMep(text,None,False)
                if mep:
                    try: am['meps'].append(mep['UserID'])
                    except KeyError: am['meps']=[mep['UserID']]
                else:
                    logger.info("fix %s" % text)
        else:
            logger.info("%s no authors in Amendment %s" % (datetime.now().isoformat(), am['seq']))
    else:
        logger.warn("%s no boundaries in Amendment %s\n%s" % (datetime.now().isoformat(), am['seq'], '\n'.join(block)))
        am['rest']=block
        return am
    # handle compromise info
    i=0
    while (i<len(block) and
           unws(block[i]) and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts):
        i+=1
    if i<len(block) and i>0:
        am['compromise']=block[:i]
        del block[:i]
        strip(block)
    i=0
    while (i<len(block) and unws(block[i])):
        if unws(block[i]).split()[0] in locstarts:
            try: am['location'].append((' '.join(block[:i]),unws(block[i])))
            except KeyError: am['location']=[(' '.join(block[:i]),unws(block[i]))]
            del block[:i+1]
            i=0
        else:
            i+=1
    if len(block)>0 and ((len(block)==1 or not unws(block[1])) and unws(block[0])!='1' and 'location' in am):
        am['location'][-1]=(am['location'][-1][0],"%s %s" % (am['location'][-1][1],block[0]))
        del block[0]
        strip(block)
    if block:
        if not ((len(block)==3 and unws(block[0])=='1' and not unws(block[1]) and block[2].startswith(" ")) or
                (len(block)==2 and unws(block[0])=='1' and block[1].startswith(" "))):
            # ignore obvious footnotes
            logger.info("rest in Amendment %s\n%s" % (am['seq'],'\n'.join(block)))
    return am
def scrape(url):
    try:
        logger.info('scrape '+url)
        tree=fetch(url)
        agents,committees=scrape_actors(tree)
        forecasts=lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0],forecastFields)
        events=scrape_events(tree)
        procedure=scrape_basic(tree)
        if not procedure: return
        ipext=[]
        for ipexd in IPEXMAP.get(procedure['reference'], {}).get('Dates',[]):
            skip=False
            for event in forecasts+events:
                if event['type'] in ipexevents.get(ipexd['type'],{}).get('oeil',[]) and event['date']==ipexd['date']:
                    skip=True
                    break
            if skip: continue
            ipext.append(ipexd)
        allevents=agents+scrape_docs(tree)+events+forecasts+ipext
        other=[x for x in allevents if not x.get('date')]
        allevents=sorted([x for x in allevents if x.get('date')],key=itemgetter('date'))
        allevents=merge_events(allevents,committees, agents)
        res={u'meta': {'source': url,
                       'timestamp': datetime.datetime.utcnow() },
             u'procedure': procedure,
             u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
             u'committees': committees,
             u'activities': sorted(allevents, key=itemgetter('date')),
             u'other': other,
             }
        tmp=url.split('id=')
        if len(tmp)>1:
            res['meta']['id']=int(tmp[1])
        # check for "final act"
        finalas=tree.xpath('//div[@id="final_act"]//a')
        final={}
        for link in finalas:
            if link.get('class')=='sumbutton':
                try: summary=fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except: continue
                final['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not 'docs' in final: final['docs']=[]
                final['docs'].append({'title': link.xpath('text()')[0].strip(), 'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final']=final.get('docs',[{}])[0]
            for item in res['activities']:
                if item.get('type')==u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text']=final['text']
                    if len(final.get('docs'))>1:
                        if not 'docs' in item:
                            item[u'docs']=final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url,traceback.format_exc()))
        return
def crawlseq(urls, null=False):
    stats=[0,0]
    [save(scrape(url),stats)
     for url, title in urls
     if (null and db.dossiers2.find_one({'meta.source': url},['_id'])==None) or not null]
    logger.info('end of crawl %s' % stats)
def jdump(d, tmp=None):
    # simple json dumper default for saver (multiplexer related)
    logger.info(json.dumps(d, indent=1, default=dateJSONhandler, ensure_ascii=False).encode('utf-8'))
    return json.dumps(d, indent=1, default=dateJSONhandler, ensure_ascii=False)
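# jdump() relies on a dateJSONhandler default to serialize datetime values.
# That helper is not part of these snippets; a minimal sketch of what it
# presumably does (an assumption, not the project's actual code):
from datetime import datetime

def dateJSONhandler(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError("%r is not JSON serializable" % obj)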
if sys.argv[1]=="test": print jdump(scrape('28215')).encode('utf8') print jdump(scrape('113959')).encode('utf8') #print jdump(scrape('108570')).encode('utf8') #print jdump(scrape('1934')).encode('utf8') #print jdump(scrape('96919')).encode('utf8') #import code; code.interact(local=locals()); sys.exit(0) print jdump(scrape("http://www.europarl.europa.eu/meps/en/1934/get.html"),None) print jdump(scrape("http://www.europarl.europa.eu/meps/en/28576/get.html"), None) print jdump(scrape("http://www.europarl.europa.eu/meps/en/1263/Elmar_BROK.html"), None) print jdump(scrape("http://www.europarl.europa.eu/meps/en/96739/Reinhard_B%C3%9CTIKOFER.html"), None) print jdump(scrape("http://www.europarl.europa.eu/meps/en/28269/Jerzy_BUZEK.html"), None) print jdump(scrape("http://www.europarl.europa.eu/meps/en/1186/Astrid_LULLING.html"), None) elif sys.argv[1]=='mepid' and sys.argv[2]: #print saver(scrape(int(sys.argv[2]))).encode('utf8') print jdump(scrape(int(sys.argv[2]))).encode('utf8') sys.exit(0) elif sys.argv[1] in meplists.keys(): logger.info('\n\tsaver: %s\n\tseq: %s' % (saver, 'seq' in args)) meps=getmeps(sys.argv[1]) if 'seq' in args: res=seqcrawl(meps,saver=saver, null=null) if 'dry' in args: print "[%s]" % ',\n'.join(res).encode('utf8') else: crawler(meps,saver=saver)
def crawler(targets,saver=jdump,threads=4, term='7'):
    m=Multiplexer(scrape,saver,threads=threads)
    m.start()
    [m.addjob(url, data) for url, data in targets(term=term)]
    m.finish()
    logger.info('end of crawl')
def crawl(saver=jdump,threads=4):
    m=Multiplexer(scrape,saver,threads=threads)
    m.start()
    [m.addjob(url) for url, name in get_meps()]
    m.finish()
    logger.info('end of crawl')
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }
    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))
    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape birth data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)
    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))
    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party': party,
                    u'country': country,
                    u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role': role,
                     u'Organization': org,
                     u'country': COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid': group_map[org],
                     u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)
    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)
    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']=[unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]
    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
    return data
def scrape(url): try: logger.info("scrape " + url) tree = fetch(url) agents, committees = scrape_actors(tree) forecasts = lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0], forecastFields) events = scrape_events(tree) procedure = scrape_basic(tree) ipext = [] for ipexd in (IPEXMAP[procedure["reference"]] or {}).get("Dates", []): skip = False for event in forecasts + events: if ( event["type"] == ipexevents.get(ipexd["type"], {}).get("oeil", "asdf") and event["date"] == ipexd["date"] ): skip = True break if skip: continue ipext.append(ipexd) allevents = agents + scrape_docs(tree) + events + forecasts + ipext other = [x for x in allevents if not x.get("date")] allevents = sorted([x for x in allevents if x.get("date")], key=itemgetter("date")) allevents = merge_events(allevents, committees) res = { u"meta": {"source": url, "id": int(url.split("id=")[1]), "timestamp": datetime.datetime.utcnow()}, u"procedure": procedure, u"links": form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]), u"committees": committees, u"activities": sorted(allevents, key=itemgetter("date")), u"other": other, } # check for "final act" finalas = tree.xpath('//div[@id="final_act"]//a') final = {} for link in finalas: if link.get("class") == "sumbutton": try: summary = fetch("http://www.europarl.europa.eu%s" % link.get("href")) except: continue final["text"] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')] else: if not "docs" in final: final["docs"] = [] final["docs"].append({"title": link.xpath("text()")[0].strip(), "url": link.get("href")}) if final and final.get("docs"): res[u"procedure"][u"final"] = final.get("docs", [{}])[0] for item in res["activities"]: if item.get("type") == u"Final act published in Official Journal": if final.get("text"): item[u"text"] = final["text"] if len(final.get("docs")) > 1: if not "docs" in item: item[u"docs"] = final["docs"] else: item[u"docs"].extend(final["docs"]) break return res except: logger.error("%s\n%s" % (url, traceback.format_exc())) return
def crawler(meps,saver=jdump,threads=4):
    m=Multiplexer(scrape,saver,threads=threads)
    m.start()
    [m.addjob(mepid) for mepid in meps]
    m.finish()
    logger.info('end of crawl')
def crawlseq(urls): [save(scrape(url), [0, 0]) for url, title in urls] logger.info("end of crawl")
def crawler(meps, saver=jdump, threads=4, term=current_term):
    m = Multiplexer(scrape, saver, threads=threads)
    m.start()
    [m.addjob(url, data) for url, data in meps(term=term)]
    m.finish()
    logger.info('end of crawl')
if __name__ == "__main__":
    import pprint, sys
    if len(sys.argv)>1:
        if sys.argv[1]=='update':
            crawler(saver=save,update=True)
            sys.exit(0)
        debug=True
        ctr=[0,0]
        while len(sys.argv)>1:
            logger.info(sys.argv[1])
            save(scrape(sys.argv[1], sys.argv[2]), ctr)
            #pprint.pprint(scrape(sys.argv[1], sys.argv[2]))
            del sys.argv[2]
            del sys.argv[1]
        sys.exit(0)
    else:
        crawler(saver=save)
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL, celexid))
    path.reverse()
    (code, lang) = celexid.split(":")[1:3]
    st = 6
    if len(code) > 6:
        if code[6].isalpha(): st = 7
        eurlex = {'id': {u'celexid': celexid,
                         u'sector': code[0],
                         u'year': code[1:5],
                         u'doctype': code[5:st],
                         u'refno': code[st:],
                         u'lang': lang,
                         }}
    else:
        eurlex = {'id': {u'celexid': celexid,
                         u'sector': code[0],
                         u'year': code[1:5],
                         u'doctype': code[5:6],
                         u'lang': lang,
                         }}
    try:
        eurlex['id'][u'typeDesc'] = CELEXCODES[code[0]]['Document Types'][code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except:
        eurlex['id'][u'typeDesc'] = u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta'] = {u'src': "%s%s:NOT" % (EURLEXURL, celexid)}
    root = fetch("%s%s:NOT" % (EURLEXURL, celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]')) > 0:
        logger.warn('[!] nothing to scrape here: %s', "%s%s:NOT" % (EURLEXURL, celexid))
        return
    eurlex[u'title'] = unws(root.xpath('//h2[text()="Title and reference"]/following-sibling::p/text()')[0])
    # dates
    dates = root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest = unws(y).split(": ", 1)
        item = {}
        date = rest[:10]
        tail = rest[10:]
        if tail.startswith('; '):
            tail = tail[2:]
        if date == '99/99/9999': item[u'date'] = datetime(9999, 12, 31)
        elif date == '00/00/0000': item[u'date'] = datetime(1, 1, 1)
        elif date == '//': continue
        else:
            try: item[u'date'] = datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try: item[u'date'] = datetime.strptime(date, u"%m/%d/%Y")
                except: pass
        if len(tail):
            item['note'] = tail
        try:
            eurlex['dates'][title] = item
        except:
            eurlex['dates'] = {title: item}