Example #1
def crawl(term, test=[], **kwargs):
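    # Crawl the EP committee document search for draft agendas (documentTypeCode=AGEN)
    # of the given term, paging through the results per committee and queueing a
    # 'comagenda' job for every new document URL.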
    seen = set()
    url = "https://www.europarl.europa.eu/committees/en/documents/search?committeeMnemoCode=%s&textualSearchMode=TITLE&textualSearch=&documentTypeCode=AGEN&reporterPersId=&procedureYear=&procedureNum=&procedureCodeType=&peNumber=&aNumber=&aNumberYear=&documentDateFrom=&documentDateTo=&meetingDateFrom=&meetingDateTo=&performSearch=true&term=%s&page=%s&pageSize={}".format(
        itemsPerPage)
    jobs = []
    for com in (k for k in test or COMMITTEE_MAP.keys() if len(k) == 4):
        i = 0
        log(3, 'crawling %s, term: %s' % (com, term))
        try:
            root = fetch(url % (com, term, i))
        except requests.exceptions.HTTPError as e:
            #if e.response.status_code == 500:
            log(3, "failed to get list of draft agendas for %s in term %d, http error code: %s" % (com, term, e.response.status_code))
            continue
        prev = []
        while True:
            log(3, "crawling comagenda search page %s for %s term %s" % (i, com, term))
            tmp = []
            for a in root.xpath('//div[@class="erpl_document-header"]/h3/a'):
                u = a.get('href', '')
                if (len(u) <= 13):
                    log(2, 'url is too short, skipping: "%s"' % u)
                    continue
                if u in seen:
                    log(3, "skipping url: %s" % repr(u))
                    continue
                seen.add(u)
                tmp.append(u)
                try:
                    payload = dict(kwargs)
                    payload['url'] = u
                    payload['committee'] = com
                    if test:
                        print(payload)
                    else:
                        add_job('comagenda', payload=payload)
                except:
                    print(u)

            if not tmp or prev == tmp or len(tmp) < itemsPerPage:
                break
            prev = tmp
            i += 1
            try:
                root = fetch(url % (com, term, i))
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 500:
                    log(3, "failed to fetch page %s of draft agendas for %s in term %d" % (i, com, term))
                break
Example #2
def get_all_dossiers():
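    # Iterate over all years back to 1973, ask the OEIL result widget how many dossiers
    # exist for that year, re-fetch with that limit and yield a (url, title) pair for
    # every dossier link found.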
    for year in xrange(datetime.date.today().year, 1972, -1):
        tree=fetch('http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/????\(*\)'
                   % (year))
        count=int(tree.xpath('//span[@class="ep_title resultNum pdfHide"]/text()')[0].strip()[len('Results found: '):])
        tree=fetch('http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&limit=%s&noHeader=false&q=objectReferenceN:N-%s/????\(*\)'
                   % (count,year))
        links=tree.xpath('//a[@class="reference rssEntry_id rssEntry_title rssEntry_updated"]')
        for link in links:
            yield (urljoin(BASE_URL,link.get('href')),
                   (link.xpath('text()') or [''])[0])
Example #3
def scrape(target):
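    # target is a (url, title) pair. Scrape a single OEIL dossier page into a dict with
    # the procedure data, committees, date-sorted activities, external links and, if
    # present, the final act.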
    url,title=target
    try:
        logger.info('scrape '+url)
        tree=fetch(url)
        agents,committees=scrape_actors(tree)
        forecasts=lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0],forecastFields)
        events=scrape_events(tree)
        procedure=scrape_basic(tree)
        if not procedure: return
        allevents=agents+scrape_docs(tree)+events+forecasts
        other=[x for x in allevents if not x.get('date')]
        allevents=sorted([x for x in allevents if x.get('date')],key=itemgetter('date'))
        allevents=merge_events(allevents,committees, agents)
        res={u'meta': {'source': url,
                       'timestamp': datetime.datetime.utcnow() },
             u'procedure': procedure,
             u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
             u'committees': committees,
             u'activities': sorted(allevents, key=itemgetter('date')),
             u'other': other,
             }
        tmp=url.split('id=')
        if len(tmp)>1:
            res['meta']['id']=int(tmp[1])
        # check for "final act"
        finalas=tree.xpath('//div[@id="final_act"]//a')
        final={}
        for link in finalas:
            if link.get('class')=='sumbutton':
                try: summary=fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except: continue
                final['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not 'docs' in final: final['docs']=[]
                final['docs'].append({'title': link.xpath('text()')[0].strip(),
                                               'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final']=final.get('docs',[{}])[0]
            for item in res['activities']:
                if item.get('type')==u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text']=final['text']
                    if len(final.get('docs'))>1:
                        if not 'docs' in item:
                            item[u'docs']=final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url,traceback.format_exc()))
        return
Example #4
def crawl(term, update=False, test=[], **kwargs):
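    # Crawl the EP committee document search for amendments (documentTypeCode=AMCO) of
    # the given term, paging through the results per committee and queueing an
    # 'amendment' job (with the listed MEP authors) for every new document URL.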
    seen = set()
    url="https://www.europarl.europa.eu/committees/en/documents/search?committeeMnemoCode=%s&textualSearchMode=TITLE&textualSearch=&documentTypeCode=AMCO&reporterPersId=&procedureYear=&procedureNum=&procedureCodeType=&peNumber=&aNumber=&aNumberYear=&documentDateFrom=&documentDateTo=&meetingDateFrom=&meetingDateTo=&performSearch=true&term=%s&page=%s&pageSize={}".format(itemsPerPage)
    jobs = []
    for com in (k for k in test or COMMITTEE_MAP.keys() if len(k)==4):
        i=0
        log(3,'crawling %s, term: %s' % (com, term))
        try:
            root=fetch(url % (com, term, i))
        except requests.exceptions.HTTPError as e:
            #if e.response.status_code == 500:
            log(3, "failed to get list of amendments for %s in term %d, http error code: %s" % (com, term, e.response.status_code))
            continue
        prev=[]
        while True:
            log(3, "crawling amendments search page %s for %s term %s" % (i, com, term))
            tmp=[]
            for a in root.xpath('//a[@class="erpl_document-subtitle-pdf"]'):
                u=a.get('href','')
                if (len(u)<=13):
                    log(2,'url is too short, skipping: "%s"' % u)
                    continue
                if u in seen or u in skipurls or (not u.endswith('EN') and not u.endswith('_EN.pdf')):
                    log(3,"skipping url: %s" % repr(u))
                    continue
                seen.add(u)
                tmp.append(u)
                rs = a.xpath('../../following-sibling::div/span[@class="erpl_document-subtitle-author"]')
                r = [y for y in [junws(x) for x in rs] if y]
                try:
                    payload = dict(kwargs)
                    payload['url'] = u
                    payload['meps'] = r
                    if test:
                        print(payload)
                    else:
                        add_job('amendment', payload=payload)
                except:
                    print(u, r)

            if not tmp or prev==tmp or len(tmp) < itemsPerPage:
                break
            prev=tmp

            if update: break

            i+=1
            try:
                root=fetch(url % (com, term, i))
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 500:
                    log(3, "failed to fetch page %s of amendments for %s in term %d" % (i, com, term))
                break
Example #5
def crawler(query='current'):
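    # Yield MEP ids for the given query: a hard-coded list for 'unlisted', one fetch per
    # letter A-Z for 'all', otherwise the ids from the single list behind meplists[query].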
    if query=='unlisted':
        for mep in unlisted:
            yield mep
    elif query=='all':
        for letter in xrange(26):
            tmp=meplists[query]
            a=ord('A')
            root=fetch(tmp%chr(a+letter), ignore=[500])
            for meplm in root.xpath('//id/text()'):
                yield int(meplm)
    else:
        root=fetch(meplists[query], ignore=[500])
        for meplm in root.xpath('//id/text()'):
            yield int(meplm)
Example #6
def get_all_dossiers(**kwargs):
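    # For every year back to 1972, read the dossier count from the OEIL result widget,
    # fetch the full XML result list for that year and queue a 'dossier' scraping job per
    # item, warning if the number of items does not match the reported count.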
    for year in range(datetime.date.today().year, 1971, -1):
        tree = fetch(
            'https://oeil.secure.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/*\(*\)'
            % (year))
        tmp = tree.xpath(
            '//span[@class="ep_name" and (starts-with(normalize-space(),"Results found :") or starts-with(normalize-space(),"Result found :"))]/text()'
        )
        if not tmp:
            log(1, "no dossiers found for %d" % year)
            raise ValueError("failed to find number of dossiers for year %d" %
                             year)
        tmp = unws(tmp[0])
        count = int(tmp[tmp.index(":") + 1:])
        log(4, "year %d, count %d" % (year, count))
        #tree=fetch('https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/????\(*\)&lang=en&s1&all&limit=%s&lang=en'
        #           % (year, count), prune_xml=True)
        tree = fromstring(
            fetch_raw(
                'https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/*\(*\)&lang=en&s1&all&limit=%s&lang=en'
                % (year, count)).encode("utf8"))
        items = tree.xpath('//item')
        i = 0
        for item in items:
            url = html.unescape(
                urljoin(BASE_URL, str(item.xpath('./link/text()')[0])))
            ref = unws(item.xpath('./reference/text()')[0])
            if '*' in ref: ref = ref[:ref.index('*')]
            log(4, 'adding dossier scraping job %s' % url)
            payload = dict(kwargs)
            payload['url'] = url
            add_job('dossier', payload=payload)
            i += 1
        if i != count: log(1, "total %d, expected %d" % (i, count))
Example #7
def scrape_docs(tree):
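    # Collect the documents listed in the dossier's "doc_gateway" tables, mapping each tab
    # heading to an institution, and inline the summary text of documents that link to one.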
    res=[]
    docs=tree.xpath('//table[@id="doc_gateway"]')
    tabs=[x.xpath('preceding-sibling::h2')[0].xpath('text()')[0] for x in docs]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != 'Other institutions':
                    doc[u'body']=instmap[inst]
                else:
                    try:
                        doc[u'body']=otherinst[doc['type'].split(':')[0]]
                    except KeyError:
                        doc[u'body']=''
                if doc['body'] in ['EP','CSL'] and doc['type']=='Joint text approved by Conciliation Committee co-chairs':
                    # skip it both times and hope it's also listed in the all-documents tab, so it becomes EP/CSL :)
                    continue
                if doc.get('text'):
                    try: summary=fetch(doc['text']['url'])
                    except: continue
                    doc[u'text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
                res.append(doc)
        elif inst != 'All':
            logger.warn(u"[!] unrecognized tab in documents %s" % inst)
    return res
Example #8
def scrape(all=False, **kwargs):
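    # Queue 'mep' scraping jobs: with all=True the full MEP directory plus a hard-coded
    # set of unlisted ids (and any remaining known MEPs not found in either), otherwise
    # only the incoming/outgoing/current lists.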
    if all:
        sources = ['http://www.europarl.europa.eu/meps/en/directory/xml?letter=&leg=']
    else:
        sources = ['http://www.europarl.europa.eu/meps/en/incoming-outgoing/incoming/xml',
                   'http://www.europarl.europa.eu/meps/en/incoming-outgoing/outgoing/xml',
                   'http://www.europarl.europa.eu/meps/en/full-list/xml']
    payload={}
    if 'onfinished' in kwargs:
        payload['onfinished']=kwargs['onfinished']
    if all:
        actives = {e['UserID'] for e in db.meps_by_activity(True)}
        inactives = {e['UserID'] for e in db.meps_by_activity(False)}
        meps = actives | inactives
        for unlisted in [ 1018, 26833, 1040, 1002, 2046, 23286, 28384, 1866, 28386,
                          1275, 2187, 34004, 28309, 1490, 28169, 28289, 28841, 1566,
                          2174, 4281, 28147, 28302, ]:
            meps.discard(unlisted)
            payload['id']=unlisted
            add_job('mep', dict(payload))
    for src in sources:
        root = fetch(src, prune_xml=True)
        for id in root.xpath("//mep/id/text()"):
            if all: meps.discard(int(id))
            payload['id']=int(id)
            add_job('mep', dict(payload))
    if all:
        log(3,"mepids not in unlisted nor in directory {!r}".format(meps))
        for id in meps:
            payload['id']=id
            add_job('mep', dict(payload))
Example #9
def scrape(url):
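    # Extract roll-call vote results from a committee report: one entry per committee
    # asked for an opinion plus one for the committee responsible, each with its
    # procedure table, adoption date and vote tables.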
    log(4, "scraping %s" % url)
    root = fetch(url)
    res = {'responsible': [], 'opinions': []}
    for opinion in root.xpath(
            '//p/span[contains(text(),"FINAL VOTE BY ROLL CALL IN COMMITTEE ASKED FOR OPINION")]'
    ):
        procedure = opinion.xpath(
            '../../p/span[contains(text(),"PROCEDURE – COMMITTEE ASKED FOR OPINION")]'
        )
        if len(procedure) != 1:
            log(
                1, "found %s procedures for opinion in %s" %
                (len(procedure), url))
            raise ValueError
        proc_table = procedure[0].xpath('../following-sibling::p/table')
        proc = extract_proc(proc_table, url)
        date = datetime.strptime(proc['Date adopted'], "%d.%m.%Y")
        cmte = proc['Opinion by Date announced in plenary'].split()[0]
        res_op = {'proc': proc, 'date': date, 'committee': cmte, 'votes': {}}
        res['opinions'].append(res_op)
        for table in opinion.xpath('../following-sibling::p/table'):
            if table == proc_table[0]: continue
            vote = extract_table(table, url, date)
            res_op['votes'][vote['type']] = vote
            del (vote['type'])
    responsible = root.xpath(
        '//tr[@class="doc_title"]//span[contains(text(),"FINAL VOTE BY ROLL CALL IN COMMITTEE RESPONSIBLE")]'
    )
    if len(responsible) != 1:
        log(1, "number of responsible rc votes is not 1: %s" % url)
        raise ValueError
    responsible = responsible[0]
    proc = root.xpath(
        '//tr[@class="doc_title"]//span[contains(text(),"PROCEDURE – COMMITTEE RESPONSIBLE")]'
    )
    if len(proc) != 1:
        log(
            1,
            "could not find exactly one procedure for the responsible committee in %s"
            % url)
        raise ValueError
    proc = extract_proc(
        proc[0].xpath('../../following-sibling::tr/td/p/table'), url)
    cmte = proc['Committee responsible Date announced in plenary'].split()[0]
    date = datetime.strptime(proc['Date adopted'], "%d.%m.%Y")
    res_resp = {
        'proc': proc,
        'date': date,
        'committee': cmte,
        'votes': {},
    }
    res['responsible'].append(res_resp)
    for table in responsible.xpath('../../following-sibling::tr/td/p/table'):
        vote = extract_table(table, url, date)
        res_resp['votes'][vote['type']] = vote
        del (vote['type'])
    return res
Example #10
def scrape_events(tree):
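    # Extract the dossier's "key events" table into a list of event dicts, inlining the
    # summary text of events that link to one.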
    res=[]
    for item in lst2obj((tree.xpath('//table[@id="key_events"]') or [None])[0],eventFields):
        if item.get('text'):
            try: summary=fetch(item['text']['url'])
            except: continue
            item['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
        res.append(item)
    return res
Example #11
def getMEPDeclarations(id):
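    # Fetch an MEP's declarations page and return the PDF links of the DIF and DAT
    # sections as a (dif_links, dat_links) pair; returns [] if the page cannot be fetched.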
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception as e:
        logger.error("mepdeclaration %s" % e)
        return []
    dif_links = dom.xpath('//h3[@id="sectionDIF"]/following-sibling::div//ul[@class="link_collection_noborder"]//a[@class="link_pdf"]/@href')
    dat_links = dom.xpath('//h3[@id="sectionDAT"]/following-sibling::div//ul[@class="link_collection_noborder"]//a[@class="link_pdf"]/@href')
    if not dif_links:
        logger.warn('[!] no declaration data http://www.europarl.europa.eu/meps/en/%s/_declarations.html' % id)
    return dif_links, dat_links
Example #12
def crawl(year, term, **kwargs):
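    # Page through the plenary minutes search for roll-call votes of the given year and
    # term, queueing a 'pvote' job for every sitting date found.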
    listurl = 'http://www.europarl.europa.eu/plenary/en/minutes.html'
    PARAMS = '?clean=false&leg=%s&refSittingDateStart=01/01/%s&refSittingDateEnd=31/12/%s&miType=title&miText=Roll-call+votes&tabActif=tabResult'
    params = PARAMS % (term, year, year)
    root=fetch(listurl+params)
    prevdates=None
    dates=root.xpath('//span[@class="date"]/text()')
    i=0
    while dates and dates!=prevdates:
        for date in dates:
            if not date.strip(): continue
            #print(term, date.strip())
            date = datetime.strptime(date.strip(), "%d-%m-%Y").strftime("%Y-%m-%d")
            payload = dict(kwargs)
            payload['term'] = term
            payload['date'] = date
            add_job('pvote', payload=payload)
        i+=1
        root=fetch("%s%s&action=%s" % (listurl,params,i))
        prevdates=dates
        dates=root.xpath('//span[@class="date"]/text()')
Example #13
def checkUrl(url):
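    # Check whether url is fetchable and not the "Not available in English." placeholder
    # page; results (False on fetch errors) are cached in the seenurls dict.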
    if not url: return False
    if url in seenurls:
        return seenurls[url]
    try:
        res=fetch(url)
    except Exception as e:
        #print >>sys.stderr, "[!] checkurl failed in %s\n%s" % (url, e)
        seenurls[url]=False
    else:
        seenurls[url]=(res.xpath('//h1/text()') or [''])[0]!="Not available in English."
    return seenurls[url]
Example #14
def scrape(url, committee, **kwargs):
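    # Parse a committee draft agenda document into agenda items (title, schedule, actors,
    # committee dossier, document references, tabling deadlines), save() them and return
    # the resulting list.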
    comid = committee
    root = fetch(url)
    lines = [
        x for x in root.xpath('//td[@class="contents"]/div/*')
        if unws(' '.join(x.xpath('.//text()')))
    ]
    lines = [
        x for x in lines if unws(' '.join(x.xpath('.//text()'))) not in
        ['<EPHeader>', '</EPHeader>']
    ]
    if not len(lines): return
    if unws(' '.join(lines[2].xpath('.//text()'))) not in ['DRAFT AGENDA', '<TitreType> DRAFT AGENDA </TitreType>']:
        log(3, "not DRAFT AGENDA %s in %s" % (unws(' '.join(lines[2].xpath('.//text()'))), url))
    agenda = {
        u'committee': comid,
        u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
        u'src': url,
    }
    i = 1
    if unws(' '.join(lines[3].xpath('.//text()'))) == "INTERPARLIAMENTARY COMMITTEE MEETING":
        log(2, "skipping interparl com meet")
        return
    if len(lines) >= 7 and unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({
            u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
            u'type': unws(' '.join(lines[3].xpath('.//text()'))),
            u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
            u'city': unws(' '.join(lines[5].xpath('.//text()'))),
            u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
        })
        i = 7
    itemcnt = 0
    item = {}
    schedule = None
    res = []
    while i < len(lines):
        line = lines[i]
        i += 1
        txt = unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue  # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp = toTime(txt)
        if tmp:
            schedule = tmp
            if i < len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera'] = True
                i += 1
            continue

        if line.tag == 'div':
            item[u'actors'] = getactors(line)
            continue
        firsttoken = txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1] == '.' and firsttoken[:-1].isdigit() and itemcnt + 1 == int(firsttoken[:-1]):
            if item: res.append(item)
            itemcnt += 1
            item = copy.deepcopy(agenda)
            item.update({
                u'title': ' '.join(txt.split()[1:]),
                u'seq_no': itemcnt,
            })
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken == u"·":
            if not 'list' in item: item[u'list'] = []
            tmp = ' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline'] = datetime.strptime(
                        tmp.split(':')[1].strip(), "%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline'] = datetime.strptime(
                            tmp.split(':')[1].strip(), "%d.%m.%Y at %H.%M")
                    except:
                        log(
                            2, '[$] unknown tabling deadline format %s' %
                            unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt) == 12:
            item[u'comdossier'] = txt
            continue
        # ***I    2011/0373(COD)       COM(2011)0793 – C7-0454/2011
        tmp = getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        log(4, "(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    save(res)
    return res
Example #15
def parse_history(id, root, mep):
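    # Parse the "History of parliamentary service" pages of an MEP profile, term by term,
    # filling mep with Constituencies (national parties), committee/delegation/EP
    # memberships and political Groups, then sort those lists in ascending order so new
    # entries get appended and don't disturb the diffs.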
    for term in root.xpath(
            '//div[@id="sectionsNavPositionInitial"]//div[@class="erpl_side-navigation"]/div/ul/li//span[text()="History of parliamentary service"]/../following-sibling::div//ul/li//a/span[@class="t-x"]/text()'
    ):
        if not term.endswith("parliamentary term"):
            log(
                2,
                'history menu item does not end as expected with "parliamentary term": %s http://www.europarl.europa.eu/meps/en/%s/name/declarations'
                % (term, id))
            raise ValueError
            #continue
        term = int(term[0])
        if (id, term) in {(124870, 9), (129141, 9)}:
            continue  # Jeppe Kofod and Frans Timmermans never really got started.
        root = fetch(
            "http://www.europarl.europa.eu/meps/en/%s/name/history/%s" %
            (id, term))
        body = root.xpath('//div[@id="status"]')[0]
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key in [None, '']:
                log(
                    2,
                    "empty history section http://www.europarl.europa.eu/meps/en/%s/name/history/%s"
                    % (id, term))
                raise ValueError
                #continue
            #mep[key] = []
            for item in title.xpath('./following-sibling::ul/li'):
                interval = unws(''.join(item.xpath('./strong/text()')))
                post = item.xpath('./strong/following-sibling::text()')[0][3:]
                if key in ["National parties", "Constituencies"]:
                    key = 'Constituencies'
                    # parse date interval
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(
                            1,
                            "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s"
                            % (interval, id, term))
                        raise ValueError
                        #continue
                    # parse party and country
                    cstart = post.rfind(' (')
                    if post[cstart + 2:-1] in SEIRTNUOC:
                        country = post[cstart + 2:-1]
                        party = post[:cstart]
                    else:
                        log(
                            2, '%s unknown country: %s' %
                            (id, post[cstart + 2:-1]))
                        raise ValueError
                        party = 'unknown'
                        country = 'unknown'
                    if not key in mep: mep[key] = []
                    mep[key].append({
                        u'party': party,
                        u'country': country,
                        u'start': start,
                        u'end': end,
                        'term': term
                    })
                    if end == datetime.strptime("31.12.9999", u"%d.%m.%Y"):
                        mep['active'] = True
                elif key in [
                        'Member', 'Substitute', 'Chair', 'Vice-Chair',
                        'Co-President', 'President', 'Vice-President',
                        'Observer', 'Quaestor', 'Substitute observer'
                ]:
                    # memberships in various committees, delegations and EP mgt
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(
                            2,
                            "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s"
                            % (interval, id, term))
                        raise ValueError
                        #continue
                    item = {
                        u'role': key,
                        u'Organization': unws(post),
                        u'start': start,
                        u'end': end,
                        u'term': term,
                    }
                    for start, field in ORGMAPS:
                        if item['Organization'].startswith(start):
                            if field == 'Committees':
                                if item['Organization'] in COMMITTEE_MAP:
                                    item[u'abbr'] = COMMITTEE_MAP[
                                        item['Organization']]
                                else:
                                    log(
                                        5, "no abbr found for committee: %s" %
                                        item['Organization'])
                            if field == 'Delegations':
                                if item['Organization'] in DELEGATIONS:
                                    item[u'abbr'] = DELEGATIONS[
                                        item['Organization']]
                                else:
                                    log(
                                        5, "no abbr found for delegation: %s" %
                                        item['Organization'])
                            if not field in mep: mep[field] = []
                            mep[field].append(item)
                            break
                elif key == u'Political groups':
                    try:
                        start, end = parse_hist_date(interval)
                    except:
                        log(
                            1,
                            "illegal date interval: %s http://www.europarl.europa.eu/meps/en/%s/name/history/%s"
                            % (interval, id, term))
                        raise ValueError
                        #continue
                    tmp = post.split(u' - ')
                    if len(tmp) > 1:
                        org = ' - '.join(tmp[:-1])
                        role = tmp[-1]
                    elif post.endswith(' -'):
                        org = post[:-2]
                        role = ''
                    elif post in ['Non-attached Members', 'Non-attached']:
                        org = post
                        role = 'Member'
                    else:
                        log(
                            2,
                            '[!] political group line "%s", http://www.europarl.europa.eu/meps/en/%s/name/history/%s'
                            % (post, id, term))
                        raise ValueError
                        #continue
                    if not u'Groups' in mep: mep[u'Groups'] = []
                    if not org in GROUP_MAP:
                        log(5, "no groupid found for group: %s" % org)
                    mep[u'Groups'].append({
                        u'role': role,
                        u'Organization': org,
                        # u'country':      country, # this value is missing from the latest EP website
                        u'groupid': GROUP_MAP.get(org, org),
                        u'start': start,
                        u'end': end,
                    })
                else:
                    log(
                        2,
                        '[!] unknown field "%s" http://www.europarl.europa.eu/meps/en/%s/name/history/%s'
                        % (key, id, term))
                    raise ValueError

    # reorder historical lists in ascending order, so new entries are appended and don't mess up the diffs
    for k in ('Constituencies', 'Groups', 'Committees', 'Delegations',
              'Staff'):
        if not k in mep: continue
        mep[k] = [
            e
            for e in sorted(mep[k],
                            key=lambda x: (x['start'], x[
                                'end'], x.get('Organization', x.get('party'))))
        ]
Example #16
def parseMember(userid):
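    # Scrape the legacy _history.html MEP profile: name, birth/death data, links and
    # addresses, constituencies, memberships and political groups, plus the separate
    # CV and assistants pages.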
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])

    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }

    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape birth data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':     party,
                    u'country':   country,
                    u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor', 'Substitute observer']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']={}
    for sec in root.xpath('//h3[@class="collapsible"]'):
        section=unws(''.join(sec.xpath('.//text()')))
        data[u'CV'][section]=[]
        for line in sec.xpath('./following-sibling::div[1]//li'):
            data[u'CV'][section].append(unws(''.join(line.xpath('.//text()'))))


    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])

    return data
Example #17
def scrape(id, **kwargs):
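    # Build a full MEP record from the current EP site (cv, assistants, declarations and
    # history pages) and hand it to process() for storage.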
    # we ignore the /meps/en/<id>/<name>/home path, since we can get all info also from other pages
    url = "http://www.europarl.europa.eu/meps/en/%s/name/cv" % id
    xml = fetch_raw(url)  # we have to patch up the returned html...
    xml = xml.replace("</br>", "<br/>")  # ...it contains some bad tags..
    root = fromstring(xml)  # ...which make the lxml soup parser drop some branches in the DOM
    sidebar_check(root, url)

    mep = {
        'UserID':
        id,
        'Name':
        mangleName(
            unws(' '.join(
                root.xpath('//span[@class="sln-member-name"]/text()'))), id),
        'Photo':
        "https://www.europarl.europa.eu/mepphoto/%s.jpg" % id,
        'meta': {
            'url': url
        },
        'Twitter': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Twitter"]/@href'
            )
        ],
        'Homepage': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Website"]/@href'
            )
        ],
        'Facebook': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Facebook"]/@href'
            )
        ],
        'Instagram': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Instagram"]/@href'
            )
        ],
        'Mail': [
            deobfus_mail(x) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="E-mail"]/@href'
            )
        ],
        'Addresses':
        parse_addr(root),
        'active':
        False,
    }

    mep = addchangednames(mep)

    birthdate = root.xpath('//time[@id="birthDate"]/text()')
    if len(birthdate) > 0:
        mep['Birth'] = {
            'date': datetime.strptime(unws(birthdate[0]), u"%d-%m-%Y")
        }
        place = root.xpath('//time[@id="birthDate"]/following-sibling::text()')
        if len(place) > 0:
            tmp = unws(' '.join(place))
            if tmp.startswith(", "): tmp = tmp[2:]
            mep['Birth']['place'] = tmp

    death = root.xpath('//time[@id="deathDate"]/text()')
    if death:
        mep['Death'] = datetime.strptime(unws(death[0]), u"%d-%m-%Y")

    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]

    if body.xpath('.//h1[text()="Curriculum vitae "]'):
        if not body.xpath('.//h3[@id="no_cv_available"]'):
            mep['CV'] = {
                'updated':
                datetime.strptime(
                    unws(
                        body.xpath(
                            './/p[@class="small"]/strong[contains(text(),"Updated: ")]/text()'
                        )[0]), u"Updated: %d/%m/%Y")
            }
            mep['CV'].update({
                unws(''.join(title.xpath(".//text()"))): [
                    unws(''.join(item.xpath(".//text()"))).replace(
                        "-...", "- ...")
                    for item in title.xpath("following-sibling::ul/li")
                ]
                for title in body.xpath('.//h4')
                #if not unws(''.join(title.xpath(".//text()"))).startswith("Original version : ")
            })

    # assistants
    url = "http://www.europarl.europa.eu/meps/en/%s/name/assistants" % id
    root = fetch(url)
    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Assistants":
        for h4 in body.xpath('.//h4'):
            title = unws(''.join(h4.xpath(".//text()")))
            assistants = [
                unws(''.join(item.xpath(".//text()")))
                for item in h4.xpath("../div//span")
            ]
            if title in ['Accredited assistants', 'Local assistants']:
                if not 'assistants' in mep: mep['assistants'] = {}
                title = title.lower().split()[0]
                if assistants: mep['assistants'][title] = assistants
            elif title in [
                    'Accredited assistants (grouping)',
                    'Local assistants (grouping)', 'Service providers',
                    'Trainees', 'Paying agents (grouping)', 'Paying agents',
                    'Assistants to the Vice-Presidency/to the Quaestorate'
            ]:
                if not 'assistants' in mep: mep['assistants'] = {}
                title = title.lower()
                if assistants: mep['assistants'][title] = assistants
            else:
                log(2,
                    'unknown title for assistants "{}" {}'.format(title, url))
                raise ValueError

    # declarations
    root = fetch("http://www.europarl.europa.eu/meps/en/%s/name/declarations" %
                 id)
    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Declarations":
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key == 'Declaration of financial interests':
                key = 'Financial Declarations'
                mep[key] = []
                for pdf in title.xpath('./following-sibling::ul/li/a'):
                    url = pdf.xpath('./@href')[0]
                    try:
                        mep[key].append(findecl.scrape(url))
                    except:
                        log(1, "failed to extract findecl from %s" % url)
            elif key == 'Declarations of participation by Members in events organised by third parties':
                key = 'Declarations of Participation'
                mep[key] = []
                for pdf in title.xpath(
                        './following-sibling::ul/li/a'
                )[::
                  -1]:  # reversed order, otherwise newer ones get prepended and mess up the diff
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            elif key in [
                    'Declaration of good conduct',
                    'Voluntary confirmation on the use of the General Expenditure Allowance'
            ]:
                mep[key] = []
                for pdf in title.xpath(
                        './following-sibling::ul/li/a'
                )[::
                  -1]:  # reversed order, otherwise newer ones get prepended and mess up the diff
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            else:
                log(
                    2,
                    'unknown type of declaration: "%s" http://www.europarl.europa.eu/meps/en/%s/name/declarations'
                    % (key, id))
                key = None
                raise ValueError

    # history
    parse_history(id, root, mep)
    process(mep,
            id,
            db.mep,
            'ep_meps',
            mep['Name']['full'],
            nopreserve=(['Addresses'], ['assistants']),
            onchanged=onchanged)

    if __name__ == '__main__':
        return mep
    del mep
Example #18
def scrape(id, terms, mepname, **kwargs):
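    # Fetch an MEP's parliamentary activities (speeches, reports, opinions, motions,
    # questions, declarations, ...) per term from the loadmore-activities endpoint, page
    # by page, normalise each item, try to attach the related dossiers and store the
    # whole set via process().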
    activity_types = (
        ('plenary-speeches', 'CRE'),
        ('reports', "REPORT"),
        ('reports-shadow', "REPORT-SHADOW"),
        ('opinions', "COMPARL"),
        ('opinions-shadow', "COMPARL-SHADOW"),
        ('motions-instit', "MOTION"),
        ('oral-questions', "OQ"),
        # other activities
        ('written-explanations', 'WEXP'),
        ('major-interpellations', 'MINT'),
        ('written-questions', "WQ"),
        ('motions-indiv', "IMOTION"),
        ('written-declarations', "WDECL"),
    )
    activities = {}
    for type, TYPE in activity_types:
        for term in terms:
            page = 0
            cnt = 20
            url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (
                id, type, term, page, cnt)
            try:
                root = fetch(url)
            except:
                log(1, "failed to fetch {}".format(url))
                raise ValueError
                #continue
            #print(url, file=sys.stderr)
            while (len(root.xpath('//div[@class="erpl_document"]')) > 0):
                for node in root.xpath('//div[@class="erpl_document"]'):
                    if type == 'written-explanations':
                        item = {
                            'title':
                            unws(''.join(
                                node.xpath(
                                    './div/h3/span[@class="t-item"]//text()'))
                                 ),
                            'date':
                            datetime.strptime(
                                node.xpath('./div[1]/div[1]/span[1]/text()')
                                [0], u"%d-%m-%Y"),
                            'text':
                            unws(''.join(node.xpath('./div[2]/div//text()')))
                        }
                    elif type == 'written-declarations':
                        if len(node.xpath('./div[1]/div')) != 3:
                            log(
                                2,
                                "written decl item has not 3 divs but %d %s" %
                                (len(node.xpath('./div[1]/div')), url))
                            continue
                        if len(node.xpath('./div[1]/div[1]/span')) != 3:
                            log(
                                2,
                                "written decl item has not 3 but %d spans in the 1st div at %s"
                                %
                                (len(node.xpath('./div[1]/div[1]/span')), url))
                            continue

                        item = {
                            'title':
                            unws(''.join(
                                node.xpath(
                                    './div/h3/span[@class="t-item"]//text()'))
                                 ),
                            'date':
                            datetime.strptime(
                                node.xpath('./div[1]/div[1]/span[1]/text()')
                                [0], u"%d-%m-%Y"),
                            'id':
                            unws(''.join(
                                node.xpath('./div[1]/div[1]/span[2]/text()')
                                [0])),
                            'status':
                            unws(''.join(
                                node.xpath('./div[1]/div[1]/span[3]/text()')
                                [0])),
                            'formats': [{
                                'type':
                                unws(fnode.xpath('./span/text()')[0]),
                                'url':
                                str(fnode.xpath('./@href')[0]),
                                'size':
                                unws(fnode.xpath('./span/span/text()')[0])
                            } for fnode in node.xpath(
                                './div[1]/div[2]/div[@class="d-inline"]/a')],
                            'authors': [{
                                'name': name.strip(),
                                "mepid": db.mepid_by_name(name.strip())
                            } for name in node.xpath(
                                './div[1]/div[3]/span/text()')],
                        }
                        for info in node.xpath('./div[2]/div'):
                            label = unws(''.join(info.xpath('./text()')))[:-2]
                            value = unws(''.join(info.xpath('./span/text()')))
                            if 'date' in label.lower():
                                value = datetime.strptime(value, u"%d-%m-%Y")
                            if label == 'Number of signatories':
                                number, date = value.split(' - ')
                                value = int(number)
                                item["No of sigs date"] = datetime.strptime(
                                    date, u"%d-%m-%Y")
                            item[label] = value
                    else:
                        #from lxml.etree import tostring
                        #print('\n'.join(tostring(e).decode() for e in node.xpath('./div/div[1]')))
                        # all other activities share the following scraper
                        ref = unws(''.join(
                            node.xpath('./div[1]/div[1]/span[2]/text()')))

                        if ref.startswith('- '):
                            ref = ref[2:]
                        if ref.endswith(' -'):
                            ref = ref[:-2]

                        item = {
                            'date':
                            datetime.strptime(
                                node.xpath('./div[1]/div[1]/span[1]/text()')
                                [0], u"%d-%m-%Y"),
                            'reference':
                            ref,
                        }

                        if type not in ['written-questions', 'oral-questions']:
                            ref = unws(''.join(
                                node.xpath('./div[1]/div[1]/span[3]/text()')))
                            if ref:
                                if not pere.match(ref):
                                    log(
                                        2,
                                        "pe, has not expected format: '%s'" %
                                        ref)
                                else:
                                    item['pe'] = ref

                        # opinions don't have title urls... why would they?
                        refurl = node.xpath('./div[1]/h3/a/@href')
                        if refurl: item['url'] = str(refurl[0])

                        item['title'] = unws(''.join(
                            node.xpath(
                                './div/h3//span[@class="t-item"]//text()')))

                        abbr = node.xpath(
                            './div[1]/div[1]/span/span[contains(concat(" ",normalize-space(@class)," ")," erpl_badge-committee ")]/text()'
                        )
                        if len(abbr):
                            item['committee'] = [
                                a for a in [unws(c) for c in abbr] if a
                            ]

                        formats = []
                        for fnode in node.xpath(
                                './div[1]/div[2]/div[@class="d-inline"]/a'):
                            elem = {
                                'type': unws(fnode.xpath('./span/text()')[0]),
                                'url': str(fnode.xpath('./@href')[0])
                            }
                            tmp = fnode.xpath('./span/span/text()')
                            if len(tmp) > 0:
                                elem['size'] = unws(tmp[0])
                            formats.append(elem)
                        if formats:
                            item['formats'] = formats

                        authors = [{
                            'name': name.strip(),
                            "mepid": db.mepid_by_name(name.strip())
                        } for name in node.xpath('./div[1]/div[3]/span/text()')
                                   ]
                        if authors: item['authors'] = authors

                        if type in ['opinions-shadow', 'opinions']:
                            for f in item['formats']:
                                if f['type'] == 'PDF':
                                    ref = pdf2ref(f['url'])
                                    if ref is not None:
                                        item['dossiers'] = [ref]
                                    break
                        else:
                            # try to deduce dossier from document reference
                            dossiers = db.get('dossiers_by_doc',
                                              item['reference']) or []
                            if len(dossiers) > 0:
                                item['dossiers'] = [
                                    d['procedure']['reference']
                                    for d in dossiers
                                ]
                            elif not '+DOC+PDF+' in item['url']:
                                # try to figure out the associated dossier by making an (expensive) http request to the ep
                                log(
                                    4, "fetching primary activity page %s" %
                                    item['url'])
                                try:
                                    refroot = fetch(item['url'])
                                except:
                                    refroot = None
                                if refroot is not None:
                                    if '/doceo/' in item[
                                            'url']:  # stupid new EP site removed the span with the procedure, bastards.
                                        fulla = refroot.xpath(
                                            '//table[@class="buttondocwin"]//a/img[@src="/doceo/data/img/navi_moredetails.gif"]/..'
                                        )
                                        if fulla:
                                            fullurl = fulla[0].get('href')
                                            if fullurl.endswith('.html'):
                                                if fullurl[-7:-5] != 'EN':
                                                    fullurl = fullurl[:-7] + 'EN.html'
                                                log(
                                                    4,
                                                    'loading activity full text page %s'
                                                    % fullurl)
                                                if fullurl.startswith(
                                                        '/doceo'):
                                                    fullurl = 'https://www.europarl.europa.eu' + fullurl
                                                if fullurl != item['url']:
                                                    refroot = fetch(fullurl)
                                        else:
                                            log(
                                                4, 'no fulla for %s' %
                                                item['url'])
                                    anchor = refroot.xpath(
                                        '//span[@class="contents" and text()="Procedure : " and not(ancestor::div[@style="display:none"])]'
                                    )
                                    if len(anchor) == 1:
                                        dossier = anchor[0].xpath(
                                            "./following-sibling::a/text()")
                                        if len(dossier) == 1:
                                            item['dossiers'] = [
                                                unws(dossier[0])
                                            ]
                                        elif len(dossier) > 1:
                                            log(
                                                2,
                                                "more than one dossier in ep info page: %d %s"
                                                % (len(dossier), item['url']))
                                    elif len(anchor) > 1:
                                        log(
                                            2,
                                            "more than one anchor in ep info page: %d %s"
                                            % (len(anchor), item['url']))

                    item['term'] = term
                    if TYPE not in activities:
                        activities[TYPE] = []
                    activities[TYPE].append(item)
                if len(root.xpath('//div[@class="erpl_document"]')) < cnt:
                    break
                page += 1
                url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (
                    id, type, term, page, cnt)
                try:
                    root = fetch(url)
                except:
                    log(1, "failed to fetch {}".format(url))
                    #raise ValueError
                    break
                #print(url, file=sys.stderr)
        if TYPE in activities:
            activities[TYPE] = sorted(activities[TYPE],
                                      key=lambda x: x['date'])
    activities['mep_id'] = id
    if len(activities.keys()) > 1:
        process(activities,
                id,
                db.activities,
                'ep_mep_activities',
                mepname,
                nodiff=True)
        return activities
    return {}