Example #1
def scrape_basic(tree):
    res=form2obj((tree.xpath('//table[@id="technicalInformations"]') or [None])[0],detailsheaders) or {}
    if 'dossier_of_the_committee' in res:
        res['dossier_of_the_committee']=';'.join(sorted((unws(x) for x in res['dossier_of_the_committee'].split(';'))))
    table=(tree.xpath('//table[@id="basic_information"]') or [None])[0]
    if table is None: return res
    res.update({'stage_reached': (table.xpath('.//p[@class="pf_stage"]/text()') or [''])[0].strip(),
                'reference': (table.xpath('.//span[@class="basic_reference"]/text()') or [''])[0].strip(),
                'type': (table.xpath('.//p[@class="basic_procedurefile"]/text()') or [''])[0].strip(),
                'title': (table.xpath('.//p[@class="basic_title"]/text()') or [''])[0].strip(),
                })
    if '' in res:
        del res['']
    if 'legal_basis' in res:
        res[u'legal_basis']=sorted((unws(x) for x in res['legal_basis'].split(';')))
    fields=table.xpath('.//p[@class="basic_content"]/*')
    firstline=u' '.join((table.xpath('.//p[@class="basic_content"]/text()') or [''])[0].split())
    attrib=u'summary'
    if len(firstline):
        res[attrib]=[firstline]
    for elem in fields:
        if elem.tag=='br' and elem.tail and elem.tail.strip():
            if not attrib in res: res[attrib]=[]
            res[attrib].append(u' '.join(elem.tail.split()))
        elif elem.tag=='strong':
            if attrib in res and res[attrib]:
                res[attrib].sort()
            attrib=u' '.join(elem.xpath('text()')[0].split())
            attrib=detailsheaders.get(attrib,attrib).lower().replace(u" ",u"_")
            if attrib:
                res[attrib]=[]
    return res
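
A minimal standalone sketch of the key normalisation applied to the strong-tag captions above; the detailsheaders mapping below is a made-up stand-in for the module-level table, and the whitespace collapsing mimics unws():

# hypothetical illustration only: detailsheaders here is a stand-in, not the real table
detailsheaders = {u'Legal basis': u'Legal basis'}

def normalise_caption(caption):
    caption = u' '.join(caption.split())                 # collapse whitespace, like unws()
    return detailsheaders.get(caption, caption).lower().replace(u' ', u'_')

print(normalise_caption(u'Legal  basis'))                # legal_basis
print(normalise_caption(u'Stage reached in procedure'))  # stage_reached_in_procedure
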
Example #2
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL,celexid))
    path.reverse()
    (code,lang)=celexid.split(":")[1:3]
    st=6
    if len(code)>6:
        if code[6].isalpha(): st=7
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:st],
                       u'refno': code[st:],
                       u'lang': lang,
                       u'chapter': path,
                       }}
    else:
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:6],
                       u'lang': lang,
                       u'chapter': path,
                       }}

    try:
        eurlex['id'][u'typeDesc']= CELEXCODES[code[0]]['Document Types'][code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except:
        eurlex['id'][u'typeDesc']= u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta']={u'src': "%s%s:NOT" % (EURLEXURL,celexid)}

    root = fetch("%s%s:NOT" % (EURLEXURL,celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]'))>0:
        logger.warn('[!] nothing to scrape here: %s', "%s%s:NOT" % (EURLEXURL,celexid))
        return
    eurlex[u'title'] = root.xpath('//h2[text()="Title and reference"]/following-sibling::p/text()')[0]
    # dates
    dates=root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest=unws(y).split(": ",1)
        item={u'type': title}
        date=rest[:10]
        tail=rest[10:]
        if tail.startswith('; '):
            tail=tail[2:]
        if date=='99/99/9999': item[u'date']= datetime(9999,12,31)
        elif date=='00/00/0000': item[u'date']= datetime(1,1,1)
        elif date=='//': continue
        else:
            try: item[u'date']= datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try: item[u'date']= datetime.strptime(date, u"%m/%d/%Y")
                except: pass
        if len(tail):
            item['note']=tail
        try:
            eurlex['dates'].append(item)
        except:
            eurlex['dates']=[item]
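
The id block above is just fixed-position slicing of the CELEX number; a self-contained sketch of that decomposition (split_celex is an illustrative name, not part of the scraper):

def split_celex(celexid):
    # "CELEX:32012R1215:EN" -> sector 3, year 2012, doctype R, refno 1215
    code, lang = celexid.split(":")[1:3]
    st = 7 if len(code) > 6 and code[6].isalpha() else 6
    return {'sector': code[0], 'year': code[1:5],
            'doctype': code[5:st], 'refno': code[st:], 'lang': lang}

print(split_celex("CELEX:32012R1215:EN"))
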
Example #3
def getAddress(root):
    res={}
    for div in root.xpath('//div[@id="contextzone"]//div[@class="ep_title"]'):
        # getAddress(map(strip, div.xpath("../..//div[@class='ep_elementcontact']/ul")))
        key=unws(''.join(div.xpath('.//text()')))
        if key not in ['Bruxelles', 'Strasbourg', 'Postal address', 'Luxembourg']:
            continue
        res[key]={}
        if key in ['Bruxelles', 'Strasbourg', 'Luxembourg']:
            tmp=div.xpath('../..//li[@class="ep_phone"]/div/text()')
            if tmp:
                res[key]['Phone'] = unws(tmp[0]).replace('(0)','')
            tmp=div.xpath('../..//li[@class="ep_fax"]/div/text()')
            if tmp:
                res[key]['Fax'] = unws(tmp[0]).replace('(0)','')
        tmp=[unws(x) for x in div.xpath('../..//li[@class="ep_address"]/div/text()') if len(unws(x))]
        if key=='Strasbourg':
            res[key].update(dict(zip(['Organization','Building', 'Office', 'Street','Zip1', 'Zip2'],tmp)))
            res[key]['City']=res[key]['Zip2'].split()[1]
            res[key]['Zip2']=res[key]['Zip2'].split()[0]
            res[key]['building_code']=buildings[res[key]['Building']]
        elif key=='Bruxelles':
            res[key].update(dict(zip(['Organization','Building', 'Office', 'Street','Zip'],tmp)))
            res[key]['City']=res[key]['Zip'].split()[1]
            res[key]['Zip']=res[key]['Zip'].split()[0]
            res[key]['building_code']=buildings[res[key]['Building']]
        elif key=='Luxembourg':
            res[key]['Address']=tmp
        elif key=='Postal address':
            res[key]=tmp
        else:
            logger.error("wtf %s" % key)
    return res
Example #4
def getactors(node):
    res={}
    ax=[None,[]]
    for row in node.xpath('.//tr'):
        cells=row.xpath('./td/p')
        if not cells: continue

        # get role Rapporteur|Responsible|Rapporteur for the opinion|Opinions
        role=cells[0].xpath('text()')
        if role and unws(role[0]):
            if ax[0] and ax[1]: res[ax[0]]=sorted(ax[1])
            tmp=unws(role[0])[:-1]
            if tmp=="Rapporteur for the opinion":
                tmp="Rapporteur"
            ax=[tmp,[]]

        tmp=unws((cells[1].xpath('text()') or [None])[0])
        if ax[0] in ["Rapporteur", "Rapporteur for the opinion"] and tmp:
            name=' '.join(tmp.split()[:-1])
            item={u'group': tmp.split()[-1][1:-1],
                  u'name': name,
                  u'mepref': getMEPRef(name) }
            if len(cells)>2:
                item[u'docs']=getdoclist(cells[2])
            ax[1].append(item)
            continue
        if ax[0] in ["Opinions", "Responsible"] and tmp:
            tmp1=tmp.split(u' –',1)
            if len(tmp1)==2:
                (comid, rest)=tmp1
            elif len(tmp1)==1:
                skip=False
                for com in tmp.split(' ,'):
                    if com in COMMITTEE_MAP and len(com)==4:
                        ax[1].append({u'comid': com})
                        skip=True
                if skip:
                    continue
                # no recognisable committee id: keep the raw cell as comid so the code below has something to work with
                (comid, rest)=(tmp, '')
            else:
                logger.warn("[!] unknown committee: %s" % tmp)
                raise ValueError("unknown committee: %s" % tmp)
            item={u'comid': comid}
            if rest==' Decision: no opinion':
                item[u'response']=u'Decision: no opinion'
            if not rest and len(comid)>4:
                for com in comid.split(', '):
                    ax[1].append({u'comid': com})
                continue
            if len(cells)>2:
                tmp=unws((cells[2].xpath('text()') or [None])[0])
                if tmp:
                    name=' '.join(tmp.split()[:-1])
                    item.update({u'group': tmp.split()[-1][1:-1],
                                 u'name': name,
                                 u'mepref': getMEPRef(name)})
                    if len(cells)>3:
                        item[u'docs']=getdoclist(cells[3])
            ax[1].append(item)
    if ax[0] and ax[1]: res[ax[0]]=sorted(ax[1])
    return res
Example #5
def getAddress(root):
    res={}
    for div in root.xpath('../following-sibling::div[@class="boxcontent " or @class="boxcontent nobordertop"]/ul[@class="contact"]'):
        key=unws(''.join(div.xpath('./preceding-sibling::h4/text()')))
        if key not in ['Bruxelles', 'Strasbourg', 'Postal address', 'Luxembourg']:
            continue
        if key=='Bruxelles': key=u'Brussels'
        elif key=='Postal address': key=u'Postal'
        res[key]={}
        if key in ['Brussels', 'Strasbourg', 'Luxembourg']:
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="phone"]/text()')
            if tmp:
                res[key][u'Phone'] = unws(tmp[0]).replace('(0)','')
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="fax"]/text()')
            if tmp:
                res[key][u'Fax'] = unws(tmp[0]).replace('(0)','')
        tmp=[unws(x) for x in div.xpath('./li[@class="address"]//text()') if len(unws(x))]
        if key=='Strasbourg':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip1', u'Zip2'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip2'].split()[1]
            res[key][u'Address']['Zip2']=res[key]['Address']['Zip2'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Brussels':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip'].split()[1]
            res[key][u'Address']['Zip']=res[key]['Address']['Zip'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Luxembourg':
            res[key][u'Address']=tmp
        elif key=='Postal':
            res[key]=tmp
        else:
            logger.error("wtf %s" % key)
    return res
Example #6
def getInOut(term=current_term, dir="in", res=None):
    # returns dict of new incoming meps. this is being checked when
    # crawling, to set more accurate groups and constituency info
    if res is None:  # avoid sharing a mutable default dict between calls
        res = {}
    i = 0
    page = fetch("http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=%s" % dir, ignore=[500])
    last = None
    while True:
        meps = []
        for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]'):
            mepid = int(urljoin(BASE_URL, x.get("href")).split("/")[-2])
            const = {u"country": unws((x.xpath('..//span[@class="ep_country"]/text()') or [""])[0])}
            if dir == "out":
                const["start"], const["end"] = [
                    datetime.strptime(d, "%B %d, %Y")
                    for d in unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [""])[0]).split(" - ")
                ]
            else:
                const["start"] = datetime.strptime(
                    unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [""])[0]), "%B %d, %Y"
                )
            if not mepid in res:
                res[mepid] = [const]
            else:
                res[mepid].append(const)
            meps.append((mepid, const))
        if meps == last:
            break
        last = meps
        i += 1
        page = fetch(
            "http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=%s&filter="
            % (i, term, dir),
            ignore=[500],
        )
    return res
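
The while loop above keeps requesting result pages and stops once a page repeats the previous one; the same stop-when-unchanged pattern in isolation (fetch_page is a stand-in for the fetch/xpath pair used above):

def iter_pages(fetch_page):
    # fetch_page(i) returns the items of result page i; the site serves the
    # last page over and over, so we stop as soon as a page repeats
    i, last = 0, None
    while True:
        items = fetch_page(i)
        if not items or items == last:
            return
        for item in items:
            yield item
        last = items
        i += 1

pages = [[1, 2], [3, 4], [5], [5]]
print(list(iter_pages(lambda i: pages[min(i, len(pages) - 1)])))  # [1, 2, 3, 4, 5]
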
Example #7
def toLinks(node):
    if node is None: return
    for br in node.xpath("br"):
        br.text="\n"
    ret=[]
    for line in node.xpath(".//text()"):
        if len(unws(line))<1:
            continue
        if line.getparent().tag=='a':
            ret.append({u'title': unws(line), 'url': unicode(urljoin(BASE_URL,line.getparent().get('href')),'utf8')})
        else:
            ret.append({u'title': unws(line)})
    return ret
Example #8
def toLinks(node):
    if node is None:
        return
    for br in node.xpath("br"):
        br.text = "\n"
    ret = []
    for line in node.xpath(".//text()"):
        if len(unws(line)) < 1:
            continue
        if line.getparent().tag == "a":
            ret.append({u"title": unws(line), "url": unicode(urljoin(BASE_URL, line.getparent().get("href")), "utf8")})
        else:
            ret.append({u"title": unws(line)})
    return ret
Example #9
def getComAgendas():
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    postdata="docType=AGEN&leg=8&miType=text&tabActif=tabResult#sidesForm"
    #nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        root=fetch(url, params=postdata)
        prev=[]
        while True:
            logger.info("%s %s" % (datetime.now().isoformat(), url))
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp or prev==tmp: break
            prev=tmp
            for u,title in tmp:
                if title.startswith('DRAFT AGENDA'):
                    yield (u,com)
            i+=1
            url=nexttpl % (com,i)
            root=fetch(url)
Example #10
def istype(text):
    # get type
    found = False
    for t in types:
        if unws(text).lower().startswith(t.lower()):
            found = True
            break
    return found
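
istype() only does a case-insensitive prefix match against the module-level types list; a self-contained approximation with a made-up list of values:

types = ['Proposal for a regulation', 'Proposal for a directive', 'Draft opinion']  # assumed values

def is_doctype(text):
    text = ' '.join(text.split()).lower()   # collapse whitespace like unws(), then lowercase
    return any(text.startswith(t.lower()) for t in types)

print(is_doctype('  Proposal for a Regulation of the European Parliament'))  # True
print(is_doctype('Amendment 12'))                                            # False
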
Example #11
def getIncomming(term=7):
    # returns dict of new incoming meps. this is being checked when
    # crawling, to set more accurate groups and constituency info
    i=0
    page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=in')
    last=None
    res={}
    while True:
        meps=[((u'name', unws(x.xpath('text()')[0])),
               (u'meta', {u'url': urljoin(urljoin(BASE_URL,x.get('href')),'get.html')}),
               (u'Constituencies', {u'start': datetime.strptime(unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0]), "%B %d, %Y"),
                                    u'country': unws((x.xpath('..//span[@class="ep_country"]/text()') or [''])[0])}),
               (u'Groups', {u'start': datetime.strptime(unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0]), "%B %d, %Y"),
                            u'group': unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0]),
                            u'groupid': group_map[unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0])],
                            u'role': unws((x.xpath('..//span[@class="ep_group"]/span[@class="ep_title"]/text()') or [''])[0])}),
               )
              for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        last=meps
        for mep in meps:
            res[int(mep[1][1]['url'].split('/')[-2])]=dict(mep[1:])
        i+=1
        page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=in&filter=' % (i, term))
    return res
Example #12
def getdoclist(node):
    txt=[x for x in node.xpath('.//text()') if unws(x)]
    i=0
    res=[]
    while i+1 < len(txt):
        if unws(txt[i])[-1]==u"\u2013":
            res.append({u'type': unws(txt[i])[:-2],
                        u'title': unws(txt[i+1]),
                        u'url': urljoin(BASE_URL, txt[i+1].getparent().get('href'))})
            i+=2
        elif len(unws(txt[i]).split(u" \u2013 "))>1:
            res.append({u'type': unws(txt[i].split(u" \u2013 ")[0]),
                        u'title': unws(txt[i].split(u" \u2013 ")[1] if len(txt[i].split(u" \u2013 "))>1 else u'')})
            i+=1
        else:
            i+=1
    if i < len(txt) and len(unws(txt[i]).split(u" \u2013 "))>1:
        res.append({u'type': unws(txt[i]).split(u" \u2013 ")[0],
                    u'title': unws(txt[i]).split(u" \u2013 ")[1]})
    return res
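
All three branches above hinge on splitting a text node on the " – " (en dash) separator between document type and title; a tiny standalone sketch of that split (split_doc is an illustrative name):

def split_doc(label):
    parts = u' '.join(label.split()).split(u' \u2013 ', 1)   # collapse whitespace, split on " – "
    if len(parts) == 2:
        return {u'type': parts[0], u'title': parts[1]}
    return {u'type': parts[0]}

print(split_doc(u'Draft agenda \u2013 PE 123.456'))   # type: Draft agenda, title: PE 123.456
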
Example #13
def get_meps(term='7'):
    i=0
    page=fetch("http://www.europarl.europa.eu/meps/en/performsearch.html?webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=ALL&bodyValue=&type=&filter=&search=Show+result" % (term))
    last=None
    while True:
        meps=[(x.get('href'), unws(x.xpath('text()')[0])) for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        for url,name in meps:
            yield (urljoin(urljoin(BASE_URL,url),'get.html'), name)
        last=meps
        i+=1
        page=fetch("http://www.europarl.europa.eu/meps/en/performsearch.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=ALL&bodyValue=&type=&filter=" % (i, term))
Example #14
def getAddress(root):
    res = {}
    for div in root.xpath('//div[@id="contextzone"]//div[@class="ep_title"]'):
        # getAddress(map(strip, div.xpath("../..//div[@class='ep_elementcontact']/ul")))
        key = unws("".join(div.xpath(".//text()")))
        if key not in ["Bruxelles", "Strasbourg", "Postal address", "Luxembourg"]:
            continue
        if key == "Bruxelles":
            key = u"Brussels"
        elif key == "Postal address":
            key = u"Postal"
        res[key] = {}
        if key in ["Brussels", "Strasbourg", "Luxembourg"]:
            tmp = div.xpath('../..//li[@class="ep_phone"]/div/text()')
            if tmp:
                res[key][u"Phone"] = unws(tmp[0]).replace("(0)", "")
            tmp = div.xpath('../..//li[@class="ep_fax"]/div/text()')
            if tmp:
                res[key][u"Fax"] = unws(tmp[0]).replace("(0)", "")
        tmp = [unws(x) for x in div.xpath('../..//li[@class="ep_address"]/div/text()') if len(unws(x))]
        if key == "Strasbourg":
            res[key][u"Address"] = dict(
                zip([u"Organization", u"Building", u"Office", u"Street", u"Zip1", u"Zip2"], tmp)
            )
            res[key][u"Address"]["City"] = res[key]["Address"]["Zip2"].split()[1]
            res[key][u"Address"]["Zip2"] = res[key]["Address"]["Zip2"].split()[0]
            res[key][u"Address"]["building_code"] = buildings[res[key]["Address"]["Building"]]
        elif key == "Brussels":
            res[key][u"Address"] = dict(zip([u"Organization", u"Building", u"Office", u"Street", u"Zip"], tmp))
            res[key][u"Address"]["City"] = res[key]["Address"]["Zip"].split()[1]
            res[key][u"Address"]["Zip"] = res[key]["Address"]["Zip"].split()[0]
            res[key][u"Address"]["building_code"] = buildings[res[key]["Address"]["Building"]]
        elif key == "Luxembourg":
            res[key][u"Address"] = tmp
        elif key == "Postal":
            res[key] = tmp
        else:
            logger.error("wtf %s" % key)
    return res
Example #15
def getOutgoing(term=7):
    # returns an iterator over ex-MEPs from the current term; these are
    # missing from the get_meps result
    i=0
    page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=out')
    last=None
    while True:
        meps=[((u'url', urljoin(BASE_URL,x.get('href'))),
               (u'name', unws(x.xpath('text()')[0])),
               ('dates', unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0])),
               ('country', unws((x.xpath('../span[@class="ep_country"]/text()') or [''])[0])),
               ('group', unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0])),
               ('role', unws((x.xpath('..//span[@class="ep_group"]/span[@class="ep_title"]/text()') or [''])[0])),
               )
              for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        last=meps
        for mep in meps:
            mep=dict(mep)
            tmp=mep['dates'].split(' - ')
            if len(tmp)==2:
                mep[u'Constituencies']={u'start': datetime.strptime(tmp[0], "%B %d, %Y"),
                                       u'end': datetime.strptime(tmp[1], "%B %d, %Y"),
                                       u'country': mep['country']}
                mep[u'Groups']={u'start': datetime.strptime(tmp[0], "%B %d, %Y"),
                               u'end': datetime.strptime(tmp[1], "%B %d, %Y"),
                               u'group': mep['group'],
                               u'role': mep['role']}
                del mep['dates']
                del mep['country']
                del mep['group']
                del mep['role']
                yield (urljoin(urljoin(BASE_URL,mep['url']),'get.html'), mep)
        i+=1
        page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=out&filter=' % (i, term))
Example #16
def getComAgendas():
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys() if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root=fetch(url)
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp: break
            for u,_ in tmp:
                yield (u,com)
            i+=10
            url=nexttpl % (com,i)
Example #17
def splitNames(text):
    text = text.split(' on behalf ',1)[0]
    res=[]
    for delim in (', ', ' and ', ' & ', '; ', ','):
        if not res:
            res=filter(None,[item[:-1] if item[-1] in [',', "'", ';'] else item
                              for item in unws(text).split(delim)
                              if item])
            continue
        res=filter(None,[item[:-1] if item[-1] in [',', "'", ';'] else item
                         for elem in res
                         for item in elem.split(delim)
                         if item])
    # only for devel.
    # for mep in res:
    #     if mep.startswith('on behalf of'): continue
    #     if mep.endswith('Shadow)'):
    #         logger.info('shadow: %s' % mep)
    res=[mep if not mep.endswith('Shadow)') else mep[:mep.rfind(' (')]
         for mep in res
         if not mep.startswith('on behalf of')]
    res=[y for x in res for y in mansplits.get(x,[x])]
    return [mepmaps.get(x,x) for x in res]
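
splitNames() depends on unws, mansplits and mepmaps from the surrounding module; a reduced, self-contained sketch of the successive-delimiter splitting it performs (the name fix-up dicts are left out):

def split_names(text):
    text = text.split(' on behalf ', 1)[0]            # drop the group suffix
    parts = [' '.join(text.split())]
    for delim in (', ', ' and ', ' & ', '; ', ','):   # split on each delimiter in turn
        parts = [p.strip(",;'") for chunk in parts for p in chunk.split(delim) if p.strip()]
    return [p for p in parts if not p.startswith('on behalf of')]

print(split_names("Jan Albrecht, Cornelia Ernst and Dimitrios Droutsas on behalf of the S&D Group"))
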
Example #18
def parse_block(block, url, reference, date, committee, rapporteur):
    am = {
        u'src': url,
        u'reference': reference,
        u'date': date,
        u'committee': committee
    }

    #logger.info(block)
    # get title
    try:
        am[u'seq'] = int(unws(block[0]).split()[1])
    except ValueError:
        am[u'seq'] = unws(block[0]).split()[1]
    except IndexError:
        logger.warn("%s wrong seq %s" % (datetime.now().isoformat(), block[0]))
        am[u'seq'] = unws(block[0])
    del block[0]

    strip(block)

    # find and strip justification
    i = len(block) - 1
    while i > 2 and not (unws(block[i]) == "Justification"
                         and block[i].startswith(' ' * 6)):
        i -= 1
    if i > 2:
        if i < len(block) - 1 and (not unws(block[i + 1])
                                   or not block[i + 1].startswith(' ')):
            am['justification'] = '\n'.join(block[i + 2:])
            del block[i:]
            strip(block)
        else:
            logger.warn("%s wrong justification\n%s" %
                        (datetime.now().isoformat(), '\n'.join(block[i:])))

    # get original language
    if 4 < len(unws(block[-1])) <= 6 and unws(block[-1]).startswith('Or.'):
        am['orig_lang'] = unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    i = len(block) - 1
    while (i > 2 and not (
        (block[i].endswith("     Amendment")
         or block[i].endswith("     PARTICULARS")
         or block[i].endswith("     Remedy")
         or block[i].endswith("     Amended text")
         or block[i].endswith("     Amendement")
         or block[i].endswith("           Amendments by Parliament")
         or block[i].endswith("           Proposal for rejection")
         or block[i].endswith("           Proposal for a rejection")
         or block[i].endswith("           Does not affect English version")
         or block[i].endswith("           (Does not affect English version)")
         or block[i].endswith("      Amendment by Parliament"))
            and len(block[i]) > 33)
           and not (unws(block[i]) == 'Text proposed by the Commission'
                    or unws(block[i]) in types)):
        i -= 1
    if i > 2:
        #if block[i].endswith("               Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq = False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceding original text
            j = i
            while (j > 2 and not (unws(block[j]) in types or unws(block[j])
                                  == 'Text proposed by the Commission')):
                j -= 1
            if j > 2: i = j
            seq = True
            key = 'old'
        elif unws(block[i]) == 'Text proposed by the Commission' or block[i].strip() in types:
            seq = True
            key = 'old'
        # throw headers
        del block[i]
        while i < len(block) and not unws(block[i]):
            del block[i]  # skip blank lines
        mid = max([len(x) for x in block]) / 2
        while i < len(block):
            if seq:
                if unws(block[i]) in [
                        "Amendment", "Amendment by Parliament", "Text Amended"
                ]:
                    key = 'new'
                    del block[i]
                    continue
                try:
                    am[key].append(block[i])
                except KeyError:
                    am[key] = [block[i]]
                del block[i]
                continue
            # only new, old is empty
            if block[i].startswith('         '):
                try:
                    am['new'].append(unws(block[i]))
                except KeyError:
                    am['new'] = [unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind('  ')
            # only old, new is empty
            if newstart < 6:
                try:
                    am['old'].append(unws(block[i]))
                except KeyError:
                    am['old'] = [unws(block[i])]
                del block[i]
                continue
            #mid=len(block[i])/2
            #mid=40
            lsep = block[i].rfind('  ', 0, mid)
            # todo calculate both, and use the one closer to the center
            rsep = block[i].find('  ', mid)
            sep = None
            if abs(lsep - mid) < abs(rsep - mid):
                if abs(lsep - mid) < 15:
                    sep = lsep
            else:
                if abs(rsep - mid) < 15:
                    sep = rsep
            if sep:
                try:
                    am['old'].append(unws(block[i][:sep]))
                except KeyError:
                    am['old'] = [unws(block[i][:sep])]
                try:
                    am['new'].append(unws(block[i][sep:]))
                except KeyError:
                    am['new'] = [unws(block[i][sep:])]
            else:
                # no sane split found
                #logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try:
                    am['old'].append(unws(block[i][:newstart]))
                except KeyError:
                    am['old'] = [unws(block[i][:newstart])]
                try:
                    am['new'].append(unws(block[i][newstart:]))
                except KeyError:
                    am['new'] = [unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        logger.warn("%s no table\n%s" %
                    (datetime.now().isoformat(), '\n'.join(block[i:])))
        am['content'] = block[i:]
        return am

    i = 0
    # find end of authors
    while (i < len(block) and unws(block[i])
           and not unws(block[i]).lower().startswith('compromise')
           and not istype(block[i])
           and not unws(block[i]).split()[0] in locstarts):
        i += 1
    if i < len(block):
        if i > 0:
            names = ' '.join(block[:i])
            am['authors'] = names
            #logger.info("names \n%s" % names)

            # convert to pt mep _ids
            for text in filter(None, splitNames(names)):
                mep = getMep(text, None, False)
                if mep:
                    try:
                        am['meps'].append(mep['UserID'])
                    except KeyError:
                        am['meps'] = [mep['UserID']]
                else:
                    logger.info("fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am['authors'] = rapporteur
            for text in filter(None, splitNames(rapporteur)):
                mep = getMep(text, None, False)
                if mep:
                    try:
                        am['meps'].append(mep['UserID'])
                    except KeyError:
                        am['meps'] = [mep['UserID']]
                else:
                    logger.info("fix %s" % text)
        else:
            logger.info("%s no authors in Amendment %s" %
                        (datetime.now().isoformat(), am['seq']))
    else:
        logger.warn("%s no boundaries in Amendment %s\n%s" %
                    (datetime.now().isoformat(), am['seq'], '\n'.join(block)))
        am['rest'] = block
        return am

    # handle compromise info
    i = 0
    while (i < len(block) and unws(block[i]) and not istype(block[i])
           and not unws(block[i]).split()[0] in locstarts):
        i += 1
    if i < len(block) and i > 0:
        am['compromise'] = block[:i]
        del block[:i]
        strip(block)

    i = 0
    while (i < len(block) and unws(block[i])):
        if unws(block[i]).split()[0] in locstarts:
            try:
                am['location'].append((' '.join(block[:i]), unws(block[i])))
            except KeyError:
                am['location'] = [(' '.join(block[:i]), unws(block[i]))]
            del block[:i + 1]
            i = 0
        else:
            i += 1
    if len(block) > 0 and ((len(block) == 1 or not unws(block[1]))
                           and unws(block[0]) != '1' and 'location' in am):
        am['location'][-1] = (am['location'][-1][0],
                              "%s %s" % (am['location'][-1][1], block[0]))
        del block[0]
        strip(block)

    if block:
        if not ((len(block) == 3 and unws(block[0]) == '1'
                 and not unws(block[1]) and block[2].startswith("  ")) or
                (len(block) == 2 and unws(block[0]) == '1'
                 and block[1].startswith("  "))):
            # ignore obvious footnotes
            logger.info("rest in Amendment %s\n%s" %
                        (am['seq'], '\n'.join(block)))
    return am
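
The column handling above searches for the two-space gap nearest the midpoint of the widest line to separate the original text (left column) from the amendment (right column); a condensed sketch of that heuristic:

def split_columns(line, mid):
    # prefer whichever two-space gap lies closest to mid; give up if it is more than 15 chars away
    lsep = line.rfind('  ', 0, mid)
    rsep = line.find('  ', mid)
    sep = lsep if abs(lsep - mid) < abs(rsep - mid) else rsep
    if abs(sep - mid) >= 15:
        return None
    return ' '.join(line[:sep].split()), ' '.join(line[sep:].split())

row = 'The Commission shall adopt          The Commission may adopt'
print(split_columns(row, len(row) // 2))  # ('The Commission shall adopt', 'The Commission may adopt')
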
Example #19
def scrape_epagents(table):
    heading = ''.join(
        table.xpath('.//td[@class="players_committee"]')[0].xpath(
            ".//text()")).strip()
    responsible = None
    if heading in ["Committee responsible", "Former committee responsible"]:
        responsible = True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible = False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems = table.xpath(
        '//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a'
    )
    tips = [
        t.xpath('text()')[0] if len(t.xpath('text()')) > 0 else
        groupurlmap[t.xpath("a")[0].get('href')]
        if len(t.xpath("a")) > 0 else groupurlmap[t.xpath("img")[0].get('src')]
        for t in table.xpath(
            '//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]'
        )
    ]
    shadows = {}
    for shadow, group in izip_longest(shadowelems, tips):
        committee = shadow.xpath(
            './ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee] = []
        if group == 'NI': group = u'NI'
        mep = {
            u'name': unicode(shadow.xpath('text()')[0]),
            u'group': unicode(group)
        }
        tmp = getMEPRef(shadow.xpath('text()')[0])
        if tmp:
            mep[u'mepref'] = tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the unnecessary shadow elements - so the following regular lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent = todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents = []
    for agent in lst2obj(table, epagents, 1):
        agent[u'responsible'] = responsible
        agent[u'body'] = u'EP'
        if agent.get('rapporteur'):
            meps = []
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith(
                        "The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion'] = None
                    continue
                tmp = getMEPRef(mep['name'])
                if tmp:
                    meps.append({
                        u'mepref': tmp,
                        u'group': mep['group'],
                        u'name': mep['name']
                    })
                else:
                    meps.append({u'group': mep['group'], u'name': mep['name']})
            agent[u'rapporteur'] = meps

        abbr = agent['committee'][:4]
        if abbr == 'BUDE': abbr = 'BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] unknown committee abbrev %s" % abbr)
            agent[u'committee_full'] = agent['committee']
            if agent['committee'][4] == ' ' and abbr.isalpha():
                agent[u'committee'] = abbr
        else:
            agent[u'committee_full'] = agent['committee'][5:]
            agent[u'committee'] = abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows'] = shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents
Example #20
def scrape(url, rapporteur=None):
    if (url in [
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-483.680%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.387%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-456.679%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-494.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.705%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.767%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.385%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-465.012%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-496.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.724%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.721%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
            'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.723%2b03%2bDOC%2bPDF%2bV0%2f%2fEN'
    ] or not url.endswith('EN')):
        logger.info("skipping unparsable url")
        return []
    prolog = True
    res = []
    block = None
    reference = None
    date = None
    committee = []
    text = getraw(url).split('\n')
    for line in text:
        if prolog:
            if amstart.match(line):
                if reference is None:
                    logger.warn("%s [!] couldn't find ref: %s" %
                                (datetime.now().isoformat(),
                                 unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    db.ep_ams.save({'src': url,
                                    'error': "couldn't find reference in source pdf"})
                    return []
                if date is None or committee == []:
                    return []
                    #raise ValueError
                block = [line]
                prolog = False
                continue

            line = unws(line)

            if not line: continue

            if line in COMMITTEE_MAP:
                committee.append(COMMITTEE_MAP[line])
                continue

            if (committee and not reference and re.match(refre, line)):
                reference = line
                if url == 'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN':
                    logger.info("adjusting reference to eudatap")
                    reference = "2012/0011(COD)"
                continue

            if (reference and not date):
                try:
                    date = parse(unws(line), dayfirst=True)
                except ValueError:
                    pass
            continue

        if amstart.match(line):
            # parse block
            res.append(
                parse_block(block, url, reference, date, committee,
                            rapporteur))
            block = [line]
            continue
        block.append(line)
    if block and filter(None, block):
        res.append(
            parse_block(block, url, reference, date, committee, rapporteur))
    return res
Example #21
def scrape(url, comid):
    root=fetch(url)
    lines=[x for x in root.xpath('//td[@class="contents"]/div/*') if unws(' '.join(x.xpath('.//text()')))]
    if not len(lines): return
    if not unws(' '.join(lines[2].xpath('.//text()')))=='DRAFT AGENDA':
        logger.error("NOT DRAFT AGENDA %s" % unws(' '.join(lines[2].xpath('.//text()'))))
    agenda={u'committee': comid,
            u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
            u'src': url,
        }
    i=1
    if unws(' '.join(lines[3].xpath('.//text()')))=="INTERPARLIAMENTARY COMMITTEE MEETING":
        logger.warn("skipping interparl com meet")
        return
    if unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
                       u'type': unws(' '.join(lines[3].xpath('.//text()'))),
                       u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
                       u'city': unws(' '.join(lines[5].xpath('.//text()'))),
                       u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
                       })
        i=7
    itemcnt=0
    item={}
    schedule=None
    res=[]
    while i < len(lines):
        line=lines[i]
        i+=1
        txt=unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp=toTime(txt)
        if tmp:
            schedule=tmp
            if i<len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera']=True
                i+=1
            continue

        if line.tag=='div':
            item[u'actors']=getactors(line)
            continue
        firsttoken=txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1]=='.' and firsttoken[:-1].isdigit() and itemcnt+1==int(firsttoken[:-1]):
            if item: res.append(item)
            itemcnt+=1
            item=copy.deepcopy(agenda)
            item.update({u'title': ' '.join(txt.split()[1:]),
                         u'seq_no': itemcnt,})
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken==u"·":
            if not 'list' in item: item[u'list']=[]
            tmp=' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d.%m.%Y at %H.%M")
                    except:
                        logger.warn('[$] unknown tabling deadline format %s' % unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt)==12:
            item[u'comdossier']=txt
            continue
        # ***I    2011/0373(COD)       COM(2011)0793 – C7-0454/2011
        tmp=getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        logger.debug("(fallthrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    return res
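
The tabling-deadline handling above tries two datetime formats in turn; the same fallback in a self-contained form (parse_deadline is an illustrative name, not part of the scraper):

from datetime import datetime

def parse_deadline(text):
    raw = text.split(':', 1)[1].strip()   # e.g. "Deadline for tabling amendments: 5 September 2012, 12.00"
    for fmt in ("%d %B %Y, %H.%M", "%d.%m.%Y at %H.%M"):
        try:
            return datetime.strptime(raw, fmt)
        except ValueError:
            pass
    return None

print(parse_deadline("Deadline for tabling amendments: 5 September 2012, 12.00"))  # 2012-09-05 12:00:00
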
Example #22
def parseMember(userid):
    url = "http://www.europarl.europa.eu/meps/en/%s/get.html" % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {u"active": False, "meta": {u"url": url}}  # return {'active': False}
    mepdiv = root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u"Name"] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u"Photo"] = unicode(urljoin(BASE_URL, mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get("src")), "utf8")
    borntxt = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')
    if len(borntxt) > 0:
        (d, p) = borntxt[0].split(",", 1)
        try:
            data[u"Birth"] = {u"date": datetime.strptime(unws(d), u"Born on %d %B %Y"), u"place": unws(p)}
        except ValueError:
            logger.warn("[!] failed to scrape birth data %s" % url)
            logger.warn(traceback.format_exc())
    else:
        logger.warn("[!] no birth data %s" % url)
    const = {u"country": unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0]), u"start": datetime(2009, 7, 14)}
    data[u"Constituencies"] = [const]
    try:
        data[u"party"] = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        pass
    else:
        group = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        try:
            role = unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1])
        except IndexError:
            role = u"Member"
        data[u"Groups"] = [{u"role": role, u"Organization": group, u"groupid": group_map[group]}]
    cdiv = root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(
            data,
            u"RSS",
            [unicode(urljoin(BASE_URL, x.get("href")), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')],
        )
        addif(
            data, u"Homepage", [unicode(x.get("href"), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')]
        )
        addif(
            data,
            u"Mail",
            [decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))],
        )
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title = unws("".join(span.xpath(".//text()")))
        if title in ["Accredited assistants", "Local assistants"]:
            if not "assistants" in data:
                data["assistants"] = {}
            addif(
                data["assistants"], title.lower().split()[0], [unws(x) for x in span.xpath("../../..//li/div/text()")]
            )
    addif(data, u"Addresses", getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key = unws(u"".join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower() == "curriculum vitae":
            data[u"CV"] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ["Member", "Substitute", "Chair", "Vice-Chair", "Co-President", "President", "Vice-President"]:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item = {u"role": key, u"abbr": unws("".join(span.xpath(".//text()"))), u"Organization": unws(span.tail)}
                for start, field in orgmaps:
                    if item["abbr"] in COMMITTEE_MAP or item["Organization"].startswith(start):
                        if not field in data:
                            data[field] = []
                        if field == "Committees" and item["Organization"] in COMMITTEE_MAP:
                            item[u"committee_id"] = COMMITTEE_MAP[item["Organization"]]
                        data[field].append(item)
                        break
        else:
            logger.error("[!] unknown field %s" % key)
    return data
Example #23
                    pass
        if len(tail):
            item['note'] = tail
        try:
            eurlex['dates'].append(item)
        except:
            eurlex['dates'] = [item]

    for t, l in GENERIC_FIELDS:
        try:
            s = root.xpath('//h2[text()="%s"]/following-sibling::ul' % t)[0]
        except:
            continue
        if not len(s): continue
        tmp = dict([(field, [
            unws(x) if x.getparent().tag != 'a' else {
                u'text': unws(x),
                u'url': x.getparent().get('href')
            } for x in s.xpath('./li/strong[text()="%s"]/..//text()' % field)
            if unws(x) and unws(x) != '/'
        ][1:]) for field in l])

        # merge multi-text items into one dict
        for k in ['Amended by:', "Legal basis:", 'Amendment to:']:
            tmp1 = {}
            for v in tmp.get(k, []):
                if isinstance(v, dict):
                    if not v['url'] in tmp1:
                        tmp1[v['url']] = {
                            u'url': v['url'],
                            u'text': [v['text']]
Example #24
            res[key][u'Address']=tmp
        elif key=='Postal':
            res[key]=tmp
        else:
            logger.error("wtf %s" % key)
    return res

def getMEPGender(id):
    try:
        mepraw=fetch("http://www.europarl.europa.eu/meps/fr/%s/_home.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
    borntxt=mepraw.xpath('//div[@class="zone_info_mep_transparent_mep_details"]//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith(u'décédé'):
            hint=borntxt[-2].replace(u"\u00A0",' ').split()[0]
        else:
            hint=borntxt[-1].replace(u"\u00A0",' ').split()[0]
        if hint==u"Née":
            return "F"
        elif hint==u"Né":
            return "M"
    logger.warn('[!] no birth/gender data http://www.europarl.europa.eu/meps/fr/%s/get.html' % id)
    return 'n/a'

def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)
Example #25
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url)
    data = {u'active': True, 'meta': {u'url': url}} # return {'active': False}
    mepdiv=root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL,mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = { u'date': datetime.strptime(unws(d), "Born on %d %B %Y"),
                           u'place': unws(p) }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const={u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0])}
    data[u'Constituencies']=[const]
    try:
        const[u'party']=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active']=False
    else:
        group=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        data[u'Groups'] = [{ u'role': unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]),
                             u'group': group,
                             u'groupid': group_map[group]}]
    cdiv=root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data,u'Homepage',[unicode(x.get('href'),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data,u'Mail',[decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title=unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            addif(data,title,[unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data,u'Addresses',getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key=unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower()=='curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item={u'role': key,
                      u'abbr': unws(''.join(span.xpath('text()'))),
                      u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data
Example #26
def parseMember(userid):
    url = 'http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])

    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL, "/mepphoto/%s.jpg" % userid)),
        u'meta': {
            u'url': url
        }
    }

    mepdiv = root.xpath(
        '//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(
        unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt = mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt) > 0:
        if unws(borntxt[-1]).startswith('died on '):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]),
                                                   u"died on %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape birth data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp) == 2:
            (d, p) = tmp
        else:
            d, p = tmp[0], None
        try:
            data[u'Birth'] = {
                u'date': datetime.strptime(unws(d), u"Born on %d %B %Y")
            }
        except ValueError:
            logger.warn('[!] failed to scrape birth data %s' % url)
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data, u'RSS', [
        unicode(urljoin(BASE_URL, x.get('href')), 'utf8') for x in root.xpath(
            '//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')
    ])
    addif(data, u'Homepage', [
        x.get('href') for x in root.xpath(
            '//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]'
        )
    ])
    addif(data, u'Twitter', [
        x.get('href') for x in root.xpath(
            '//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]'
        )
    ])
    addif(data, u'Facebook', [
        x.get('href') for x in root.xpath(
            '//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')
    ])
    addif(data, u'Mail', [
        x.get('href')[7:].replace('[dot]', '.').replace('[at]', '@')[::-1]
        for x in root.xpath(
            '//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]'
        )
    ])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title = unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants'] = {}
            addif(
                data['assistants'],
                title.lower().split()[0], [
                    unws(x) for x in span.xpath(
                        '../following-sibling::div[@class="boxcontent"][1]//li/text()'
                    )
                ])
        elif title == "Contacts":
            addif(data, u'Addresses', getAddress(span))

    # scrape main content
    for section in root.xpath(
            '//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'
    ):
        key = unws(''.join(section.xpath('.//text()')))
        if key == "National parties":
            # constituencies
            key = 'Constituencies'
            for constlm in section.xpath(
                    './following-sibling::ul[@class="events_collection bullets"][1]/li'
            ):
                line = unws(u' '.join(
                    [unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ', 1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key] = []
                if len(tmp) == 2:
                    (start, end) = tmp
                else:
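                    # only one date given: the membership is still open, so use
                    # a far-future sentinel as the end date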
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart + 2:-1] in SEIRTNUOC:
                    country = party[cstart + 2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart + 2:-1])
                    country = 'unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party': party,
                    u'country': country,
                    u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                })
        elif key in [
                'Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President',
                'President', 'Vice-President', 'Observer', 'Quaestor'
        ]:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath(
                    './following-sibling::ul[@class="events_collection bullets"][1]/li'
            ):
                line = unws(u' '.join(
                    [unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ', 1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp) == 2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item = {
                    u'role': key,
                    u'abbr': COMMITTEE_MAP.get(org),
                    u'Organization': org,
                    u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                }
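                # orgmaps pairs an organization-name prefix with the target field
                # (e.g. Committees, Delegations); known committees additionally
                # get a committee_id from COMMITTEE_MAP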
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field] = []
                        if field == 'Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id'] = COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath(
                    './following-sibling::ul[@class="events_collection bullets"][1]/li'
            ):
                line = unws(u' '.join(
                    [unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ', 1)
                tmp = org.split(u' - ')
                if len(tmp) > 1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org = org[:-2]
                    role = ''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp) == 2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups'] = []
                data[u'Groups'].append({
                    u'role': role,
                    u'Organization': org,
                    u'country': COUNTRIES.get(
                        unws(constlm.get('class')).upper(),
                        'unknown country: %s' % unws(constlm.get('class'))),
                    u'groupid': group_map[org],
                    u'start': datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end': datetime.strptime(unws(end), u"%d.%m.%Y"),
                })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in [
            'Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff'
    ]:
        if not fld in data: continue
        data[fld] = sorted(data[fld],
                           key=lambda x: x.get('end', x['start']),
                           reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl = 'http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV'] = [
        unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')
    ]

    return data
Example #27
0
def parseMember(userid):
    url = 'http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {u'active': True, 'meta': {u'url': url}}  # return {'active': False}
    mepdiv = root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(
        unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(
        urljoin(BASE_URL,
                mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),
        'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(
        ',', 1)
    try:
        data[u'Birth'] = {
            u'date': datetime.strptime(unws(d), u"Born on %d %B %Y"),
            u'place': unws(p)
        }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const = {
        u'country':
        unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0]),
        u'start': datetime(2009, 7, 14)
    }
    data[u'Constituencies'] = [const]
    try:
        const[u'party'] = unws(
            mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active'] = False
    else:
        group = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        try:
            role = unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1])
        except IndexError:
            role = u"Member"
        data[u'Groups'] = [{
            u'role': role,
            u'Organization': group,
            u'groupid': group_map[group]
        }]
    cdiv = root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data, u'RSS', [
            unicode(urljoin(BASE_URL, x.get('href')), 'utf8')
            for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')
        ])
        addif(data, u'Homepage', [
            unicode(x.get('href'), 'utf8')
            for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')
        ])
        addif(data, u'Mail', [
            decodemail(unws(x))
            for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()')
            if len(unws(x))
        ])
    for span in root.xpath(
            '//div[@id="contextzone"]//span[@class="ep_title"]'):
        title = unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants'] = {}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data, u'Addresses', getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key = unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower() == 'curriculum vitae':
            data[u'CV'] = [
                unws(x) for x in div.xpath(
                    './/div[@class="ep_elementtext"]//li/div/text()')
            ]
        elif key in [
                'Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President',
                'President', 'Vice-President'
        ]:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item = {
                    u'role': key,
                    u'abbr': unws(''.join(span.xpath('.//text()'))),
                    u'Organization': unws(span.tail)
                }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field] = []
                        if field == 'Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id'] = COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data
Example #28
0
def splitNames(text):
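    # split a free-text author list into individual MEP names: drop any
    # "on behalf of ..." suffix, then cascade over the delimiters, trimming
    # trailing punctuation on every pass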
    text = text.split(" on behalf ", 1)[0]
    res = []
    for delim in (", ", " and ", " & ", "; ", ","):
        if not res:
            res = filter(
                None, [item[:-1] if item[-1] in [",", "'", ";"] else item for item in unws(text).split(delim) if item]
            )
            continue
        res = filter(
            None,
            [item[:-1] if item[-1] in [",", "'", ";"] else item for elem in res for item in elem.split(delim) if item],
        )
    # only for devel.
    # for mep in res:
    #     if mep.startswith('on behalf of'): continue
    #     if mep.endswith('Shadow)'):
    #         logger.info('shadow: %s' % mep)
    res = [
        mep if not mep.endswith("Shadow)") else mep[: mep.rfind(" (")]
        for mep in res
        if not mep.startswith("on behalf of")
    ]
    res = [y for x in res for y in mansplits.get(x, [x])]
    return [mepmaps.get(x, x) for x in res]
Example #29
0
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL, celexid))
    path.reverse()
    (code, lang) = celexid.split(":")[1:3]
    st = 6
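    # CELEX number layout: 1-char sector, 4-digit year, 1- or 2-char document
    # type, then the reference number; st marks where the document type ends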
    if len(code) > 6:
        if code[6].isalpha(): st = 7
        eurlex = {
            'id': {
                u'celexid': celexid,
                u'sector': code[0],
                u'year': code[1:5],
                u'doctype': code[5:st],
                u'refno': code[st:],
                u'lang': lang,
                u'chapter': path,
            }
        }
    else:
        eurlex = {
            'id': {
                u'celexid': celexid,
                u'sector': code[0],
                u'year': code[1:5],
                u'doctype': code[5:6],
                u'lang': lang,
                u'chapter': path,
            }
        }

    try:
        eurlex['id'][u'typeDesc'] = CELEXCODES[code[0]]['Document Types'][
            code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except:
        eurlex['id'][u'typeDesc'] = u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta'] = {u'src': "%s%s:NOT" % (EURLEXURL, celexid)}

    root = fetch("%s%s:NOT" % (EURLEXURL, celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]')) > 0:
        logger.warn('[!] nothing to scrape here: %s',
                    "%s%s:NOT" % (EURLEXURL, celexid))
        return
    eurlex[u'title'] = root.xpath(
        '//h2[text()="Title and reference"]/following-sibling::p/text()')[0]
    # dates
    dates = root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest = unws(y).split(": ", 1)
        item = {u'type': title}
        date = rest[:10]
        tail = rest[10:]
        if tail.startswith('; '):
            tail = tail[2:]
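        # EUR-Lex uses placeholder dates: map them to sentinel datetimes or skip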
        if date == '99/99/9999': item[u'date'] = datetime(9999, 12, 31)
        elif date == '00/00/0000': item[u'date'] = datetime(0001, 01, 01)
        elif date == '//': continue
        else:
            try:
                item[u'date'] = datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try:
                    item[u'date'] = datetime.strptime(date, u"%m/%d/%Y")
                except:
                    pass
        if len(tail):
            item['note'] = tail
        try:
            eurlex['dates'].append(item)
        except:
            eurlex['dates'] = [item]
Example #30
0
                try: item[u'date']= datetime.strptime(date, u"%m/%d/%Y")
                except: pass
        if len(tail):
            item['note']=tail
        try:
            eurlex['dates'].append(item)
        except:
            eurlex['dates']=[item]

    for t,l in GENERIC_FIELDS:
        try:
            s=root.xpath('//h2[text()="%s"]/following-sibling::ul' % t)[0]
        except:
            continue
        if not len(s): continue
        tmp=dict([(field, [unws(x) if x.getparent().tag!='a' else {u'text': unws(x),
                                                                   u'url': x.getparent().get('href')}
                           for x in s.xpath('./li/strong[text()="%s"]/..//text()' % field)
                           if unws(x) and unws(x)!='/'][1:])
                  for field in l])

        # merge multi-text items into one dict
        for k in ['Amended by:', "Legal basis:", 'Amendment to:']:
            tmp1={}
            for v in tmp.get(k,[]):
                if type(v)==type(dict()):
                    if not v['url'] in tmp1: tmp1[v['url']]={u'url': v['url'],
                                                             u'text': [v['text']]}
                    elif not v['text'] in tmp1[v['url']]['text']:
                        tmp1[v['url']]['text'].append(v['text'])
            if tmp1:
Example #31
0
def parse_block(block, url, reference, date, committee, rapporteur):
    am = {u"src": url, u"reference": reference, u"date": date, u"committee": committee}

    # logger.info(block)
    # get title
    try:
        am[u"seq"] = int(unws(block[0]).split()[1])
    except ValueError:
        am[u"seq"] = unws(block[0]).split()[1]
    except IndexError:
        logger.warn("%s wrong seq %s" % (datetime.now().isoformat(), block[0]))
        am[u"seq"] = unws(block[0])
    del block[0]

    strip(block)

    # find and strip justification
    i = len(block) - 1
    while i > 2 and not (unws(block[i]) == "Justification" and block[i].startswith(" " * 6)):
        i -= 1
    if i > 2:
        if i < len(block) - 1 and (not unws(block[i + 1]) or not block[i + 1].startswith(" ")):
            am["justification"] = "\n".join(block[i + 2 :])
            del block[i:]
            strip(block)
        else:
            logger.warn("%s wrong justification\n%s" % (datetime.now().isoformat(), "\n".join(block[i:])))

    # get original language
    if 4 < len(unws(block[-1])) <= 6 and unws(block[-1]).startswith("Or."):
        am["orig_lang"] = unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    i = len(block) - 1
    while (
        i > 2
        and not (
            (
                block[i].endswith("     Amendment")
                or block[i].endswith("     PARTICULARS")
                or block[i].endswith("     Remedy")
                or block[i].endswith("     Amended text")
                or block[i].endswith("     Amendement")
                or block[i].endswith("           Amendments by Parliament")
                or block[i].endswith("           Proposal for rejection")
                or block[i].endswith("           Proposal for a rejection")
                or block[i].endswith("           Does not affect English version")
                or block[i].endswith("           (Does not affect English version)")
                or block[i].endswith("      Amendment by Parliament")
            )
            and len(block[i]) > 33
        )
        and not (unws(block[i]) == "Text proposed by the Commission" or unws(block[i]) in types)
    ):
        i -= 1
    if i > 2:
        # if block[i].endswith("               Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq = False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceding original text
            j = i
            while j > 2 and not (unws(block[j]) in types or unws(block[j]) == "Text proposed by the Commission"):
                j -= 1
            if j > 2:
                i = j
            seq = True
            key = "old"
        elif unws(block[i]) == "Text proposed by the Commission" or block[i].strip() in types:
            seq = True
            key = "old"
        # throw headers
        del block[i]
        while i < len(block) and not unws(block[i]):
            del block[i]  # skip blank lines
        mid = max([len(x) for x in block]) / 2
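        # two-column rows: estimate the column boundary as half of the widest
        # line, then split each row at the run of spaces closest to that midpoint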
        while i < len(block):
            if seq:
                if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key = "new"
                    del block[i]
                    continue
                try:
                    am[key].append(block[i])
                except KeyError:
                    am[key] = [block[i]]
                del block[i]
                continue
            # only new, old is empty
            if block[i].startswith("         "):
                try:
                    am["new"].append(unws(block[i]))
                except KeyError:
                    am["new"] = [unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind("  ")
            # only old, new is empty
            if newstart < 6:
                try:
                    am["old"].append(unws(block[i]))
                except KeyError:
                    am["old"] = [unws(block[i])]
                del block[i]
                continue
            # mid=len(block[i])/2
            # mid=40
            lsep = block[i].rfind("  ", 0, mid)
            # todo calculate both, and use the one closer to the center
            rsep = block[i].find("  ", mid)
            sep = None
            if abs(lsep - mid) < abs(rsep - mid):
                if abs(lsep - mid) < 15:
                    sep = lsep
            else:
                if abs(rsep - mid) < 15:
                    sep = rsep
            if sep:
                try:
                    am["old"].append(unws(block[i][:sep]))
                except KeyError:
                    am["old"] = [unws(block[i][:sep])]
                try:
                    am["new"].append(unws(block[i][sep:]))
                except KeyError:
                    am["new"] = [unws(block[i][sep:])]
            else:
                # no sane split found
                # logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try:
                    am["old"].append(unws(block[i][:newstart]))
                except KeyError:
                    am["old"] = [unws(block[i][:newstart])]
                try:
                    am["new"].append(unws(block[i][newstart:]))
                except KeyError:
                    am["new"] = [unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        logger.warn("%s no table\n%s" % (datetime.now().isoformat(), "\n".join(block[i:])))
        am["content"] = block[i:]
        return am

    i = 0
    # find end of authors
    while (
        i < len(block)
        and unws(block[i])
        and not unws(block[i]).lower().startswith("compromise")
        and not istype(block[i])
        and not unws(block[i]).split()[0] in locstarts
    ):
        i += 1
    if i < len(block):
        if i > 0:
            names = " ".join(block[:i])
            am["authors"] = names
            # logger.info("names \n%s" % names)

            # convert to pt mep _ids
            for text in filter(None, splitNames(names)):
                mep = getMep(text, None, False)
                if mep:
                    try:
                        am["meps"].append(mep["UserID"])
                    except KeyError:
                        am["meps"] = [mep["UserID"]]
                else:
                    logger.info("fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am["authors"] = rapporteur
            for text in filter(None, splitNames(rapporteur)):
                mep = getMep(text, None, False)
                if mep:
                    try:
                        am["meps"].append(mep["UserID"])
                    except KeyError:
                        am["meps"] = [mep["UserID"]]
                else:
                    logger.info("fix %s" % text)
        else:
            logger.info("%s no authors in Amendment %s" % (datetime.now().isoformat(), am["seq"]))
    else:
        logger.warn("%s no boundaries in Amendment %s\n%s" % (datetime.now().isoformat(), am["seq"], "\n".join(block)))
        am["rest"] = block
        return am

    # handle compromise info
    i = 0
    while i < len(block) and unws(block[i]) and not istype(block[i]) and not unws(block[i]).split()[0] in locstarts:
        i += 1
    if i < len(block) and i > 0:
        am["compromise"] = block[:i]
        del block[:i]
        strip(block)

    i = 0
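    # lines whose first word is a location keyword (locstarts) mark where the
    # amendment applies; any lines collected before such a marker are kept as a
    # prefix alongside it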
    while i < len(block) and unws(block[i]):
        if unws(block[i]).split()[0] in locstarts:
            try:
                am["location"].append((" ".join(block[:i]), unws(block[i])))
            except KeyError:
                am["location"] = [(" ".join(block[:i]), unws(block[i]))]
            del block[: i + 1]
            i = 0
        else:
            i += 1
    if len(block) > 0 and ((len(block) == 1 or not unws(block[1])) and unws(block[0]) != "1" and "location" in am):
        am["location"][-1] = (am["location"][-1][0], "%s %s" % (am["location"][-1][1], block[0]))
        del block[0]
        strip(block)

    if block:
        if not (
            (len(block) == 3 and unws(block[0]) == "1" and not unws(block[1]) and block[2].startswith("  "))
            or (len(block) == 2 and unws(block[0]) == "1" and block[1].startswith("  "))
        ):
            # ignore obvious footnotes
            logger.info("rest in Amendment %s\n%s" % (am["seq"], "\n".join(block)))
    return am
Example #32
0
                try: item[u'date']= datetime.strptime(date, u"%m/%d/%Y")
                except: pass
        if len(tail):
            item['note']=tail
        try:
            eurlex['dates'][title]=item
        except:
            eurlex['dates']={title: item}

    for t,l in GENERIC_FIELDS:
        try:
            s=root.xpath('//h2[text()="%s"]/following-sibling::ul' % t)[0]
        except:
            continue
        if not len(s): continue
        tmp=dict([(field, [{u'text': unws(x), u'url': x.getparent().get('href')}
                           for x in s.xpath('./li/strong[text()="%s"]/..//text()' % field)[2:]
                           if unws(x) and unws(x)!='/'])
                  for field in l
                  if field != "Directory code:"])

        # merge multi-text items into one dict
        for k in ['Amended by:', "Legal basis:", 'Amendment to:']:
            tmp1={}
            for v in tmp.get(k,[]):
                if type(v)==type(dict()):
                    if not v['url'] in tmp1: tmp1[v['url']]={u'url': v['url'],
                                                             u'text': [v['text']]}
                    elif not v['text'] in tmp1[v['url']]['text']:
                        tmp1[v['url']]['text'].append(v['text'])
            if tmp1:
Example #33
0
def strip(block):
    while len(block) and not unws(block[0]):
        del block[0]
    while len(block) and not unws(block[-1]):
        del block[-1]
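# minimal usage sketch (hypothetical data), assuming unws() collapses a
# whitespace-only line to '': strip() trims blank lines in place at both ends
# and leaves interior blanks untouched, e.g.
#   block = ["", "  ", "Amendment 1", "", "Article 2"]
#   strip(block)   # -> ["Amendment 1", "", "Article 2"]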
Example #34
0
def scrape(url, rapporteur=None):
    if url in [
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-483.680%2b02%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.387%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-456.679%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-494.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.705%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.767%2b02%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.385%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-465.012%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-496.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.724%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.721%2b02%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.723%2b03%2bDOC%2bPDF%2bV0%2f%2fEN",
    ] or not url.endswith("EN"):
        logger.info("skipping unparsable url")
        return []
    prolog = True
    res = []
    block = None
    reference = None
    date = None
    committee = []
    text = getraw(url).split("\n")
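    # everything before the first "Amendment N" header is the prolog; it yields
    # the committee name(s), the procedure reference and the document date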
    for line in text:
        if prolog:
            if amstart.match(line):
                if reference == None:
                    logger.warn(
                        "%s [!] couldn't find ref: %s"
                        % (datetime.now().isoformat(), unws([x for x in text[:20] if unws(x)][2]))
                    )
                    # marking as scraped though
                    db.ep_ams.save({"src": url, "error": "couldn't find reference in source pdf"})
                    return []
                if date == None or committee == []:
                    return []
                    # raise ValueError
                block = [line]
                prolog = False
                continue

            line = unws(line)

            if not line:
                continue

            if line in COMMITTEE_MAP:
                committee.append(COMMITTEE_MAP[line])
                continue

            if committee and not reference and re.match(refre, line):
                reference = line
                if (
                    url
                    == "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN"
                ):
                    logger.info("adjusting reference to eudatap")
                    reference = "2012/0011(COD)"
                continue

            if reference and not date:
                try:
                    date = parse(unws(line), dayfirst=True)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue

        if amstart.match(line):
            # parse block
            res.append(parse_block(block, url, reference, date, committee, rapporteur))
            block = [line]
            continue
        block.append(line)
    if block and filter(None, block):
        res.append(parse_block(block, url, reference, date, committee, rapporteur))
    return res
Example #35
0
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])

    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }

    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape death data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':     party,
                    u'country':   country,
                    u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']=[unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])

    return data
Example #36
0
def getactors(node):
    res = {}
    ax = [None, []]
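    # ax accumulates [current role, list of actors] and is flushed into res
    # whenever a new role label starts a row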
    for row in node.xpath('.//tr'):
        cells = row.xpath('./td/p')
        if not cells: continue

        # get role Rapporteur|Responsible|Rapporteur for the opinion|Opinions
        role = cells[0].xpath('text()')
        if role and unws(role[0]):
            if ax[0] and ax[1]: res[ax[0]] = sorted(ax[1])
            tmp = unws(role[0])[:-1]
            if tmp == "Rapporteur for the opinion":
                tmp = "Rapporteur"
            ax = [tmp, []]

        tmp = unws((cells[1].xpath('text()') or [None])[0])
        if ax[0] in ["Rapporteur", "Rapporteur for the opinion"] and tmp:
            name = ' '.join(tmp.split()[:-1])
            item = {
                u'group': tmp.split()[-1][1:-1],
                u'name': name,
                u'mepref': getMEPRef(name)
            }
            if len(cells) > 2:
                item[u'docs'] = getdoclist(cells[2])
            ax[1].append(item)
            continue
        if ax[0] in ["Opinions", "Responsible"] and tmp:
            tmp1 = tmp.split(u' –', 1)
            if len(tmp1) == 2:
                (comid, rest) = tmp1
            elif len(tmp1) == 1:
                skip = False
                for com in tmp.split(' ,'):
                    if com in COMMITTEE_MAP and len(com) == 4:
                        ax[1].append({u'comid': com})
                        skip = True
                if skip:
                    continue
            else:
                logger.warn("[!] unknown committee: %s" % tmp)
                raise
            item = {u'comid': comid}
            if rest == ' Decision: no opinion':
                item[u'response'] = u'Decision: no opinion'
            if not rest and len(comid) > 4:
                for com in comid.split(', '):
                    ax[1].append({u'comid': com})
                continue
            if len(cells) > 2:
                tmp = unws((cells[2].xpath('text()') or [None])[0])
                if tmp:
                    name = ' '.join(tmp.split()[:-1])
                    item.update({
                        u'group': tmp.split()[-1][1:-1],
                        u'name': name,
                        u'mepref': getMEPRef(name)
                    })
                    if len(cells) > 3:
                        item[u'docs'] = getdoclist(cells[3])
            ax[1].append(item)
    if ax[0] and ax[1]: res[ax[0]] = sorted(ax[1])
    return res
Example #37
0
def strip(block):
    while len(block) and not unws(block[0]):
        del block[0]
    while len(block) and not unws(block[-1]):
        del block[-1]
Example #38
0
def scrape(url, comid):
    root = fetch(url)
    lines = [
        x for x in root.xpath('//td[@class="contents"]/div/*')
        if unws(' '.join(x.xpath('.//text()')))
    ]
    if not len(lines): return
    if not unws(' '.join(lines[2].xpath('.//text()'))) == 'DRAFT AGENDA':
        logger.error("NOT DRAFT AGENDA %s" %
                     unws(' '.join(lines[2].xpath('.//text()'))))
    agenda = {
        u'committee': comid,
        u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
        u'src': url,
    }
    i = 1
    if unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({
            u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
            u'type': unws(' '.join(lines[3].xpath('.//text()'))),
            u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
            u'city': unws(' '.join(lines[5].xpath('.//text()'))),
            u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
        })
        i = 7
    itemcnt = 0
    item = {}
    schedule = None
    res = []
    while i < len(lines):
        line = lines[i]
        i += 1
        txt = unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue  # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp = toTime(txt)
        if tmp:
            schedule = tmp
            if i < len(lines) and unws(' '.join(
                    lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera'] = True
                i += 1
            continue

        if line.tag == 'div':
            item[u'actors'] = getactors(line)
            continue
        firsttoken = txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1] == '.' and firsttoken[:-1].isdigit() and itemcnt + 1 == int(firsttoken[:-1]):
            if item: res.append(item)
            itemcnt += 1
            item = copy.deepcopy(agenda)
            item.update({
                u'title': ' '.join(txt.split()[1:]),
                u'seq_no': itemcnt,
            })
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken == u"·":
            if not 'list' in item: item[u'list'] = []
            tmp = ' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline'] = datetime.strptime(
                        tmp.split(':')[1].strip(), "%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline'] = datetime.strptime(
                            tmp.split(':')[1].strip(), "%d.%m.%Y at %H.%M")
                    except:
                        logger.warn('[$] unknown tabling deadline format %s',
                                    tmp.split(':')[1].strip())
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt) == 12:
            item[u'comdossier'] = txt
            continue
        # ***I    2011/0373(COD)       COM(2011)0793 – C7-0454/2011
        tmp = getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        logger.debug("(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    return res
Example #39
0
def scrape_epagents(table):
    heading=''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible=None
    if heading in [ "Committee responsible", "Former committee responsible"]:
        responsible=True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible=False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems=table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips=[t.xpath('text()')[0] if len(t.xpath('text()'))>0 else groupurlmap[t.xpath("a")[0].get('href')]
          for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
    shadows={}
    for shadow, group in izip_longest(shadowelems, tips):
        committee=shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee]=[]
        if group=='NI': group=u'NI'
        mep={u'name': unicode(shadow.xpath('text()')[0]),
             u'group': unicode(group)}
        tmp=getMEPRef(shadow.xpath('text()')[0])
        if tmp:
           mep[u'mepref']=tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the unnecessary shadow elements - so the following regular lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent=todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents=[]
    for agent in lst2obj(table,epagents,1):
        agent[u'responsible']=responsible
        agent[u'body']=u'EP'
        if agent.get('rapporteur'):
            meps=[]
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion']=None
                    continue
                tmp=getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp,
                                 u'group': mep['group'],
                                 u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'],
                                 u'name': mep['name']})
            agent[u'rapporteur']=meps

        abbr=agent['committee'][:4]
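        # the committee cell reads "<ABBR> <full name>"; normalise the
        # abbreviation (BUDE is mapped to BUDG) and keep the full name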
        if abbr=='BUDE': abbr='BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full']=agent['committee']
            if agent['committee'][4]==' ' and abbr.isalpha():
                agent[u'committee']=abbr
        else:
            agent[u'committee_full']=agent['committee'][5:]
            agent[u'committee']=abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows']=shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents
Example #40
0
    return res


def getMEPGender(id):
    try:
        mepraw = fetch("http://www.europarl.europa.eu/meps/fr/%s/_home.html" %
                       (id),
                       ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
    borntxt = mepraw.xpath(
        '//div[@class="zone_info_mep_transparent_mep_details"]//span[@class="more_info"]/text()'
    )
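    # infer gender from the French-language birth line: "Née ..." means female,
    # "Né ..." means male; skip a trailing death line ("décédé ...") if present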
    if len(borntxt) > 0:
        if unws(borntxt[-1]).startswith(u'décédé'):
            hint = borntxt[-2].replace(u"\u00A0", ' ').split()[0]
        else:
            hint = borntxt[-1].replace(u"\u00A0", ' ').split()[0]
        if hint == u"Née":
            return "F"
        elif hint == u"Né":
            return "M"
    logger.warn(
        '[!] no birth/gender data http://www.europarl.europa.eu/meps/fr/%s/get.html'
        % id)
    return 'n/a'


def getMEPDeclarations(id):
    try: