Example #1
def scrape_docs(tree):
    res=[]
    docs=tree.xpath('//table[@id="doc_gateway"]')
    tabs=[x.xpath('preceding-sibling::h2')[0].xpath('text()')[0] for x in docs]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != 'Other institutions':
                    doc[u'body']=instmap[inst]
                else:
                    try:
                        doc[u'body']=otherinst[doc['type'].split(':')[0]]
                    except KeyError:
                        doc[u'body']=''
                if doc['body'] in ['EP','CSL'] and doc['type']=='Joint text approved by Conciliation Committee co-chairs':
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get('text'):
                    try: summary=fetch(doc['text']['url'])
                    except: continue
                    doc[u'text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
                res.append(doc)
        elif inst != 'All':
            logger.warn(u"[!] unrecognized tab in documents %s" % inst)
    return res
Example #2
def scrape_docs(tree):
    res = []
    docs = tree.xpath('//table[@id="doc_gateway"]')
    tabs = [
        x.xpath('preceding-sibling::h2')[0].xpath('text()')[0] for x in docs
    ]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != 'Other institutions':
                    doc[u'body'] = instmap[inst]
                else:
                    try:
                        doc[u'body'] = otherinst[doc['type'].split(':')[0]]
                    except KeyError:
                        doc[u'body'] = ''
                if doc['body'] in ['EP', 'CSL'] and doc[
                        'type'] == 'Joint text approved by Conciliation Committee co-chairs':
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get('text'):
                    try:
                        summary = fetch(doc['text']['url'])
                    except:
                        continue
                    doc[u'text'] = [
                        unicode(tostring(x))
                        for x in summary.xpath('//div[@id="summary"]')
                    ]
                res.append(doc)
        elif inst != 'All documents':
            logger.warn(u"[!] unrecognized tab in documents %s" % inst)
    return res
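A hedged usage sketch of scrape_docs: fetch and the instmap/otherinst lookup tables are assumed to come from the surrounding scraper module, and the dossier URL is invented for illustration.

# hypothetical call: fetch a dossier page and collect its documents
tree = fetch('http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?reference=2012/0011(COD)')
for doc in scrape_docs(tree):
    print doc['body'], doc['type']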
Example #3
def save(data, stats):
    if not data: return stats
    res = db.eurlex.find_one({'id.celexid': data['id']['celexid']}) or {}
    d = diff(
        dict([(k, v) for k, v in res.items()
              if not k in ['_id', 'meta', 'changes']]),
        dict([(k, v) for k, v in data.items() if not k in [
            '_id',
            'meta',
            'changes',
        ]]))
    if d:
        now = unicode(datetime.utcnow().replace(microsecond=0).isoformat())
        if not res:
            logger.info(('adding %s' % (data['id']['celexid'])).encode('utf8'))
            data['meta']['created'] = now
            if stats: stats[0] += 1
        else:
            logger.info(
                ('updating %s' % (data['id']['celexid'])).encode('utf8'))
            logger.warn(d)
            data['meta']['updated'] = now
            if stats: stats[1] += 1
            data['_id'] = res['_id']
        data['changes'] = res.get('changes', {})
        data['changes'][now] = d
        db.eurlex.save(data)
    if stats: return stats
    else: return data
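The save() function above depends on a diff() helper that is not shown; a minimal sketch of the assumed contract (the project's real implementation is likely recursive over nested documents):

def diff(old, new):
    # assumed contract: return a truthy change-set when the two dicts
    # differ, suitable for storing under data['changes'][now]
    changes = {}
    for key in set(old) | set(new):
        if old.get(key) != new.get(key):
            changes[key] = {'old': old.get(key), 'new': new.get(key)}
    return changes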
Example #4
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL,celexid))
    path.reverse()
    (code,lang)=celexid.split(":")[1:3]
    st=6
    if len(code)>6:
        if code[6].isalpha(): st=7
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:st],
                       u'refno': code[st:],
                       u'lang': lang,
                       u'chapter': path,
                       }}
    else:
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:6],
                       u'lang': lang,
                       u'chapter': path,
                       }}

    try:
        eurlex['id'][u'typeDesc']= CELEXCODES[code[0]]['Document Types'][code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except:
        eurlex['id'][u'typeDesc']= u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta']={u'src': "%s%s:NOT" % (EURLEXURL,celexid)}

    root = fetch("%s%s:NOT" % (EURLEXURL,celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]'))>0:
        logger.warn('[!] nothing to scrape here: %s', "%s%s:NOT" % (EURLEXURL,celexid))
        return
    eurlex[u'title'] = root.xpath('//h2[text()="Title and reference"]/following-sibling::p/text()')[0]
    # dates
    dates=root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest=unws(y).split(": ",1)
        item={u'type': title}
        date=rest[:10]
        tail=rest[10:]
        if tail.startswith('; '):
            tail=tail[2:]
        if date=='99/99/9999': item[u'date']= datetime(9999,12,31)
        elif date=='00/00/0000': item[u'date']= datetime(1,1,1)
        elif date=='//': continue
        else:
            try: item[u'date']= datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try: item[u'date']= datetime.strptime(date, u"%m/%d/%Y")
                except: pass
        if len(tail):
            item['note']=tail
        try:
            eurlex['dates'].append(item)
        except:
            eurlex['dates']=[item]
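For orientation, a worked sketch of the id slicing at the top of scrape() (the CELEX number here is hypothetical):

# "CELEX:32012R0011:EN" would decompose as:
code, lang = "CELEX:32012R0011:EN".split(":")[1:3]
# code[0]   -> '3'     sector
# code[1:5] -> '2012'  year
# code[5:6] -> 'R'     doctype (st stays 6 because code[6] is a digit)
# code[6:]  -> '0011'  refno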
Example #5
def scrape_actors(tree):
    insts=tree.xpath('//td[@class="players_institution" or @class="players_institution inst_separator"]')
    agents=[]
    meps=[]
    for inst in insts:
        inst_name=''.join([x.strip() for x in inst.xpath('.//text()')])
        for table in inst.xpath('following-sibling::td/table'):
            if inst_name == 'European Parliament':
                meps.extend([x for x in scrape_epagents(table) if x not in meps])
            # Handle council
            elif inst_name == 'Council of the European Union':
                for agent in lst2obj(table, cslagents, 1):
                    agent[u'body']=u'CSL'
                    agent[u'type']=u'Council Meeting'
                    agents.append(agent)
            # and commission
            elif inst_name == 'European Commission':
                for p in table.xpath('.//p[@class="players_head"]'):
                    p.getparent().remove(p)
                for agent in lst2obj(table, ecagents, 0):
                    if len(agent['dg'])==len(agent['commissioner']):
                        for dg,cmnr in izip(agent['dg'], agent['commissioner']):
                            agent[u'body']=u'EC'
                            agents.append({u'body': u'EC',
                                           u'dg': dg,
                                           u'commissioner': cmnr})
                    else:
                        logger.warn("commission data wrong: %s" % (agent))
            else:
                logger.warn(u"[!] wrong institution name %s" % inst_name)
    return (agents, sorted(meps,key=itemgetter('committee')))
Example #6
def save(data, stats):
    res=db.ep_meps2.find_one({ 'UserID' : data['UserID'] }) or {}
    if 'Gender' not in data and 'Gender' in res: data['Gender']=res['Gender']
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes', 'activities',]]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes', 'activities',]]))
    if d:
        now=datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info(('adding %s' % (data['Name']['full'])).encode('utf8'))
            data['meta']['created']=now
            if stats: stats[0]+=1
        else:
            logger.info(('updating %s' % (data['Name']['full'])).encode('utf8'))
            logger.warn(jdump(d))
            data['meta']['updated']=now
            if stats: stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now.isoformat()]=d
        db.ep_meps2.save(data)
    del res
    if stats: 
        del data
        return stats
    else: return data
Example #7
def save(data, stats):
    res = db.ep_meps2.find_one({"UserID": data["UserID"]}) or {}
    d = diff(
        dict([(k, v) for k, v in res.items() if not k in ["_id", "meta", "changes"]]),
        dict([(k, v) for k, v in data.items() if not k in ["_id", "meta", "changes"]]),
    )
    if d:
        now = datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info(("adding %s" % (data["Name"]["full"])).encode("utf8"))
            data["meta"]["created"] = now
            if stats:
                stats[0] += 1
        else:
            logger.info(("updating %s" % (data["Name"]["full"])).encode("utf8"))
            logger.warn(jdump(d))
            data["meta"]["updated"] = now
            if stats:
                stats[1] += 1
            data["_id"] = res["_id"]
        data["changes"] = res.get("changes", {})
        data["changes"][now.isoformat()] = d
        db.ep_meps2.save(data)
    if stats:
        return stats
    else:
        return data
Example #8
def getMEPRef(name, retfields=['_id']):
    if not name: return
    mep = db.ep_meps2.find_one({'Name.aliases': ''.join(name.split()).lower()},
                               retfields)
    if not mep and u'ß' in name:
        mep = db.ep_meps2.find_one(
            {
                'Name.aliases': ''.join(name.replace(u'ß',
                                                     'ss').split()).lower()
            }, retfields)
    if not mep and unicodedata.normalize('NFKD', unicode(name)).encode(
            'ascii', 'ignore') != name:
        mep = db.ep_meps2.find_one(
            {
                'Name.aliases':
                ''.join(
                    unicodedata.normalize('NFKD', unicode(name)).encode(
                        'ascii', 'ignore').split()).lower()
            }, retfields)
    if not mep and len([x for x in name if ord(x) > 128]):
        mep = db.ep_meps2.find_one(
            {
                'Name.aliases':
                re.compile(''.join([x if ord(x) < 128 else '.'
                                    for x in name]), re.I)
            }, retfields)
    if mep:
        return mep['_id']
    else:
        logger.warn('[!] lookup oops %s' % name.encode('utf8'))
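A hedged usage sketch of the fallback chain (names invented; each lookup only fires when the previous one found nothing):

getMEPRef(u'Jane Doe')     # 1. exact alias match on 'janedoe'
getMEPRef(u'Hans Weiß')    # 2. retried with ß folded to ss: 'hansweiss'
getMEPRef(u'José García')  # 3. retried accent-stripped ('josegarcia'),
                           # 4. then as the regex 'Jos. Garc.a', case-insensitive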
Example #9
def getactors(node):
    res={}
    ax=[None,[]]
    for row in node.xpath('.//tr'):
        cells=row.xpath('./td/p')
        if not cells: continue

        # get role Rapporteur|Responsible|Rapporteur for the opinion|Opinions
        role=cells[0].xpath('text()')
        if role and unws(role[0]):
            if ax[0] and ax[1]: res[ax[0]]=sorted(ax[1])
            tmp=unws(role[0])[:-1]
            if tmp=="Rapporteur for the opinion":
                tmp="Rapporteur"
            ax=[tmp,[]]

        tmp=unws((cells[1].xpath('text()') or [''])[0])
        if ax[0] in ["Rapporteur", "Rapporteur for the opinion"] and tmp:
            name=' '.join(tmp.split()[:-1])
            item={u'group': tmp.split()[-1][1:-1],
                  u'name': name,
                  u'mepref': getMEPRef(name) }
            if len(cells)>2:
                item[u'docs']=getdoclist(cells[2])
            ax[1].append(item)
            continue
        if ax[0] in ["Opinions", "Responsible"] and tmp:
            tmp1=tmp.split(u' –',1)
            if len(tmp1)==2:
                (comid, rest)=tmp1
            elif len(tmp1)==1:
                skip=False
                for com in tmp.split(' ,'):
                    if com in COMMITTEE_MAP and len(com)==4:
                        ax[1].append({u'comid': com})
                        skip=True
                if skip:
                    continue
                # nothing recognizable in this cell: falling through would
                # hit comid/rest before assignment, so warn and skip instead
                logger.warn("[!] unknown committee: %s" % tmp)
                continue
            item={u'comid': comid}
            if rest==' Decision: no opinion':
                item[u'response']=u'Decision: no opinion'
            if not rest and len(comid)>4:
                for com in comid.split(', '):
                    ax[1].append({u'comid': com})
                continue
            if len(cells)>2:
                tmp=unws((cells[2].xpath('text()') or [None])[0])
                if tmp:
                    name=' '.join(tmp.split()[:-1])
                    item.update({u'group': tmp.split()[-1][1:-1],
                                 u'name': name,
                                 u'mepref': getMEPRef(name)})
                    if len(cells)>3:
                        item[u'docs']=getdoclist(cells[3])
            ax[1].append(item)
    if ax[0] and ax[1]: res[ax[0]]=sorted(ax[1])
    return res
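The result maps roles to sorted actor lists; a hedged sketch of the shape (values invented):

# {u'Responsible': [{u'comid': u'LIBE', u'name': u'Jane Doe',
#                    u'group': u'XYZ', u'mepref': ObjectId('...')}],
#  u'Opinions':    [{u'comid': u'IMCO'}, {u'comid': u'ITRE'}]}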
Example #10
def crawl(saver=jdump, null=False):
    for celexid, data in sources("%s/index.htm" % crawlroot, []):
        if (null and db.eurlex.find_one({'id.celexid': celexid},['_id'])==None) or not null:
            try:
                tmp = saver(scrape(celexid, data),[0,0])
            except:
                logger.warn("[!] failed to scrape %s" % celexid)
                continue
            yield tmp
Example #11
def crawl(saver=jdump, null=False):
    for celexid, data in sources("%s/index.htm" % crawlroot, []):
        if (null and db.eurlex.find_one({'id.celexid': celexid}, ['_id'])
                == None) or not null:
            try:
                tmp = saver(scrape(celexid, data), [0, 0])
            except:
                logger.warn("[!] failed to scrape %s" % celexid)
                continue
            yield tmp
Example #12
def seqcrawler(saver=jdump):
    stats=[0,0]
    for u, com in getComAgendas():
        try:
            saver(scrape(u,com), stats)
        except:
            # ignore failed scrapes
            logger.warn("[!] failed to scrape: %s" % u)
            logger.warn(traceback.format_exc())
    logger.info("[o] added/updated: %s/%s" % (stats[0],stats[1]))
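The saver argument defaults to jdump, which presumably just serialises the scrape result; passing the save() shown earlier persists records and counts adds/updates instead:

seqcrawler(saver=save)  # scrape every committee agenda and upsert it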
Example #13
def getMEPRef(name, retfields=['_id']):
    if not name: return
    mep=db.ep_meps2.find_one({'Name.aliases': ''.join(name.split()).lower()},retfields)
    if not mep and u'ß' in name:
        mep=db.ep_meps2.find_one({'Name.aliases': ''.join(name.replace(u'ß','ss').split()).lower()},retfields)
    if not mep and unicodedata.normalize('NFKD', unicode(name)).encode('ascii','ignore')!=name:
        mep=db.ep_meps2.find_one({'Name.aliases': ''.join(unicodedata.normalize('NFKD', unicode(name)).encode('ascii','ignore').split()).lower()},retfields)
    if not mep and len([x for x in name if ord(x)>128]):
        mep=db.ep_meps2.find_one({'Name.aliases': re.compile(''.join([x if ord(x)<128 else '.' for x in name]),re.I)},retfields)
    if mep:
        return mep['_id']
    else:
        logger.warn('[!] lookup oops %s' % name.encode('utf8'))
Example #14
def crawler(saver=jdump, update=False):
    stats=[0,0]
    for pdf, rapporteur in getComAms(update=update):
        logger.info(datetime.now().isoformat()+" "+pdf)
        ctr=[0,0]
        try:
            saver(scrape(pdf, rapporteur), ctr)
        except:
            # log the failed scrape, then re-raise
            logger.warn("[!] %s failed to scrape: %s" % (datetime.now().isoformat(), pdf))
            #logger.warn(traceback.format_exc())
            raise
        logger.info("%s [i] added/updated: %s/%s" % (datetime.now().isoformat(), ctr[0],ctr[1]))
        stats[0]+=ctr[0]
        stats[1]+=ctr[1]
    logger.info("%s [o] total added/updated: %s/%s" % (datetime.now().isoformat(),stats[0],stats[1]))
Example #15
def crawler(saver=jdump, update=False):
    stats = [0, 0]
    for pdf, rapporteur in getComAms(update=update):
        logger.info(datetime.now().isoformat() + " " + pdf)
        ctr = [0, 0]
        try:
            saver(scrape(pdf, rapporteur), ctr)
        except:
            # log the failed scrape, then re-raise
            logger.warn("[!] %s failed to scrape: %s" % (datetime.now().isoformat(), pdf))
            # logger.warn(traceback.format_exc())
            raise
        logger.info("%s [i] added/updated: %s/%s" % (datetime.now().isoformat(), ctr[0], ctr[1]))
        stats[0] += ctr[0]
        stats[1] += ctr[1]
    logger.info("%s [o] total added/updated: %s/%s" % (datetime.now().isoformat(), stats[0], stats[1]))
Example #16
def save(data, stats):
    res=db.ep_meps2.find_one({ 'UserID' : data['UserID'] }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes',]]))
    if d:
        now=unicode(datetime.utcnow().replace(microsecond=0).isoformat())
        if not res:
            logger.info(('adding %s' % (data['Name']['full'])).encode('utf8'))
            data['meta']['created']=now
            stats[0]+=1
        else:
            logger.info(('updating %s' % (data['Name']['full'])).encode('utf8'))
            logger.warn(d)
            data['meta']['updated']=now
            stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.ep_meps2.save(data)
    return stats
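A hedged call sketch: the stats list is mutated in place and returned, so a caller can accumulate add/update counts across records (scraped_meps is a hypothetical iterable of parsed MEP dicts):

stats = [0, 0]
for mep in scraped_meps:
    stats = save(mep, stats)
logger.info("added %s, updated %s" % tuple(stats))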
Example #17
def save(data, stats):
    if not data: return stats
    res=db.eurlex.find_one({ 'id.celexid' : data['id']['celexid'] }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes',]]))
    if d:
        now=unicode(datetime.utcnow().replace(microsecond=0).isoformat())
        if not res:
            logger.info(('adding %s' % (data['id']['celexid'])).encode('utf8'))
            data['meta']['created']=now
            if stats: stats[0]+=1
        else:
            logger.info(('updating %s' % (data['id']['celexid'])).encode('utf8'))
            logger.warn(d)
            data['meta']['updated']=now
            if stats: stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.eurlex.save(data)
    if stats: return stats
    else: return data
Example #18
def scrape_actors(tree):
    insts = tree.xpath(
        '//td[@class="players_institution" or @class="players_institution inst_separator"]'
    )
    agents = []
    meps = []
    for inst in insts:
        inst_name = ''.join([x.strip() for x in inst.xpath('.//text()')])
        for table in inst.xpath('following-sibling::td/table'):
            if inst_name == 'European Parliament':
                meps.extend(
                    [x for x in scrape_epagents(table) if x not in meps])
            # Handle council
            elif inst_name == 'Council of the European Union':
                for agent in lst2obj(table, cslagents, 1):
                    agent[u'body'] = u'CSL'
                    agent[u'type'] = u'Council Meeting'
                    agents.append(agent)
            # and commission
            elif inst_name == 'European Commission':
                for p in table.xpath('.//p[@class="players_head"]'):
                    p.getparent().remove(p)
                for agent in lst2obj(table, ecagents, 0):
                    if len(agent['dg']) == len(agent['commissioner']):
                        for dg, cmnr in izip(agent['dg'],
                                             agent['commissioner']):
                            agent[u'body'] = u'EC'
                            agents.append({
                                u'body': u'EC',
                                u'dg': dg,
                                u'commissioner': cmnr
                            })
                    else:
                        logger.warn("commission data wrong: %s" % (agent))
            else:
                logger.warn(u"[!] wrong institution name %s" % inst_name)
    return (agents, sorted(meps, key=itemgetter('committee')))
Example #19
def parseMember(userid):
    url = "http://www.europarl.europa.eu/meps/en/%s/get.html" % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {u"active": False, "meta": {u"url": url}}  # return {'active': False}
    mepdiv = root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u"Name"] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u"Photo"] = unicode(urljoin(BASE_URL, mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get("src")), "utf8")
    borntxt = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')
    if len(borntxt) > 0:
        (d, p) = borntxt[0].split(",", 1)
        try:
            data[u"Birth"] = {u"date": datetime.strptime(unws(d), u"Born on %d %B %Y"), u"place": unws(p)}
        except ValueError:
            logger.warn("[!] failed to scrape birth data %s" % url)
            logger.warn(traceback.format_exc())
    else:
        logger.warn("[!] no birth data %s" % url)
    const = {u"country": unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0]), u"start": datetime(2009, 7, 14)}
    data[u"Constituencies"] = [const]
    try:
        data[u"party"] = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        pass
    else:
        group = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        try:
            role = unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1])
        except IndexError:
            role = u"Member"
        data[u"Groups"] = [{u"role": role, u"Organization": group, u"groupid": group_map[group]}]
    cdiv = root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(
            data,
            u"RSS",
            [unicode(urljoin(BASE_URL, x.get("href")), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')],
        )
        addif(
            data, u"Homepage", [unicode(x.get("href"), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')]
        )
        addif(
            data,
            u"Mail",
            [decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))],
        )
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title = unws("".join(span.xpath(".//text()")))
        if title in ["Accredited assistants", "Local assistants"]:
            if not "assistants" in data:
                data["assistants"] = {}
            addif(
                data["assistants"], title.lower().split()[0], [unws(x) for x in span.xpath("../../..//li/div/text()")]
            )
    addif(data, u"Addresses", getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key = unws(u"".join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower() == "curriculum vitae":
            data[u"CV"] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ["Member", "Substitute", "Chair", "Vice-Chair", "Co-President", "President", "Vice-President"]:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item = {u"role": key, u"abbr": unws("".join(span.xpath(".//text()"))), u"Organization": unws(span.tail)}
                for start, field in orgmaps:
                    if item["abbr"] in COMMITTEE_MAP or item["Organization"].startswith(start):
                        if not field in data:
                            data[field] = []
                        if field == "Committees" and item["Organization"] in COMMITTEE_MAP:
                            item[u"committee_id"] = COMMITTEE_MAP[item["Organization"]]
                        data[field].append(item)
                        break
        else:
            logger.error("[!] unknown field %s" % key)
    return data
Example #20
def getMEPGender(id):
    try:
        mepraw=fetch("http://www.europarl.europa.eu/meps/fr/%s/_home.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
    borntxt=mepraw.xpath('//div[@class="zone_info_mep_transparent_mep_details"]//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith(u'décédé'):
            hint=borntxt[-2].replace(u"\u00A0",' ').split()[0]
        else:
            hint=borntxt[-1].replace(u"\u00A0",' ').split()[0]
        if hint==u"Née":
            return "F"
        elif hint==u"Né":
            return "M"
    logger.warn('[!] no birth/gender data http://www.europarl.europa.eu/meps/fr/%s/get.html' % id)
    return 'n/a'

def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)
        return []
    dif_links = dom.xpath('//h3[@id="sectionDIF"]/following-sibling::div//ul[@class="link_collection_noborder"]//a[@class="link_pdf"]/@href')
    dat_links = dom.xpath('//h3[@id="sectionDAT"]/following-sibling::div//ul[@class="link_collection_noborder"]//a[@class="link_pdf"]/@href')
    if not dif_links:
        logger.warn('[!] no declaration data http://www.europarl.europa.eu/meps/en/%s/_declarations.html' % id)
    return dif_links, dat_links

activitymap={"CRE" : "Speeches",
Example #21
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])

    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }

    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape birth data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':     party,
                    u'country':   country,
                    u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']=[unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])

    return data
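A hedged sketch of the record parseMember() returns (keys as built above, values invented):

# {u'active': False,
#  u'Name': {...},
#  u'Photo': u'http://www.europarl.europa.eu/mepphoto/1234.jpg',
#  u'Birth': {u'date': datetime(1960, 1, 1), u'place': u'Somewhere'},
#  u'Constituencies': [...], u'Groups': [...], u'Committees': [...],
#  u'CV': [...],
#  u'assistants': {u'accredited': [...], u'local': [...]},
#  u'meta': {u'url': u'http://www.europarl.europa.eu/meps/en/1234/_history.html'}}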
Example #22
def scrape(url, rapporteur=None):
    if (url in ['http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-483.680%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.387%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-456.679%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-494.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.705%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.767%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.385%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-465.012%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-496.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.724%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.721%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.723%2b03%2bDOC%2bPDF%2bV0%2f%2fEN']
        or not url.endswith('EN')):
        logger.info("skipping unparsable url")
        return []
    prolog=True
    res=[]
    block=None
    reference=None
    date=None
    committee=[]
    text=getraw(url).split('\n')
    for line in text:
        if prolog:
            if amstart.match(line):
                if reference==None:
                    logger.warn("%s [!] couldn't find ref: %s" %
                                (datetime.now().isoformat(),
                                 unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    db.ep_ams.save({'src': url, 'error': "couldn't find reference in source pdf"})
                    return []
                if date==None or committee==[]:
                    return []
                    #raise ValueError
                block=[line]
                prolog=False
                continue

            line=unws(line)

            if not line: continue

            if line in COMMITTEE_MAP:
                committee.append(COMMITTEE_MAP[line])
                continue

            if (committee and
                  not reference and
                  re.match(refre, line)):
                reference=line
                if url == 'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN':
                    logger.info("adjusting reference to eudatap")
                    reference="2012/0011(COD)"
                continue

            if (reference and
                not date):
                try:
                    date = parse(unws(line), dayfirst=True)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue

        if amstart.match(line):
            # parse block
            res.append(parse_block(block, url, reference, date, committee, rapporteur))
            block=[line]
            continue
        block.append(line)
    if block and filter(None,block):
        res.append(parse_block(block, url, reference, date, committee, rapporteur))
    return res
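amstart is defined elsewhere in the module; from the way blocks are delimited here and the seq number is read in parse_block() further down, it plausibly matches amendment headings, along these lines:

import re
# assumed pattern, the real one may differ: a line opening an
# "Amendment <number>" block in the extracted PDF text
amstart = re.compile(r'^\s*Amendments?\s+\d+')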
Example #23
def scrape(url, rapporteur=None):
    if url in [
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-483.680%2b02%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.387%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-456.679%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-494.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.705%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.767%2b02%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.385%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-465.012%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-496.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.724%2b01%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.721%2b02%2bDOC%2bPDF%2bV0%2f%2fEN",
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.723%2b03%2bDOC%2bPDF%2bV0%2f%2fEN",
    ] or not url.endswith("EN"):
        logger.info("skipping unparsable url")
        return []
    prolog = True
    res = []
    block = None
    reference = None
    date = None
    committee = []
    text = getraw(url).split("\n")
    for line in text:
        if prolog:
            if amstart.match(line):
                if reference == None:
                    logger.warn(
                        "%s [!] couldn't find ref: %s"
                        % (datetime.now().isoformat(), unws([x for x in text[:20] if unws(x)][2]))
                    )
                    # marking as scraped though
                    db.ep_ams.save({"src": url, "error": "couldn't find reference in source pdf"})
                    return []
                if date == None or committee == []:
                    return []
                    # raise ValueError
                block = [line]
                prolog = False
                continue

            line = unws(line)

            if not line:
                continue

            if line in COMMITTEE_MAP:
                committee.append(COMMITTEE_MAP[line])
                continue

            if committee and not reference and re.match(refre, line):
                reference = line
                if (
                    url
                    == "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN"
                ):
                    logger.info("adjusting reference to eudatap")
                    reference = "2012/0011(COD)"
                continue

            if reference and not date:
                try:
                    date = parse(unws(line), dayfirst=True)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue

        if amstart.match(line):
            # parse block
            res.append(parse_block(block, url, reference, date, committee, rapporteur))
            block = [line]
            continue
        block.append(line)
    if block and filter(None, block):
        res.append(parse_block(block, url, reference, date, committee, rapporteur))
    return res
Example #24

def getMEPGender(id):
    try:
        mepraw = fetch("http://www.europarl.europa.eu/meps/fr/%s/get.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return "n/a"
    borntxt = mepraw.xpath('//div[@class="ep_elementpeople2"]//div[@class="ep_elementtext"]/p/text()')
    if len(borntxt) > 0:
        hint = borntxt[0].replace(u"\u00A0", " ").split()[0]
        if hint == u"Née":
            return "F"
        elif hint == u"Né":
            return "M"
    logger.warn("[!] no birth/gender data http://www.europarl.europa.eu/meps/fr/%s/get.html" % id)
    return "n/a"


def parseMember(userid):
    url = "http://www.europarl.europa.eu/meps/en/%s/get.html" % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {u"active": False, "meta": {u"url": url}}  # return {'active': False}
    mepdiv = root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u"Name"] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u"Photo"] = unicode(urljoin(BASE_URL, mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get("src")), "utf8")
Example #25
def parse_block(block, url, reference, date, committee, rapporteur):
    am={u'src': url,
        u'reference': reference,
        u'date': date,
        u'committee': committee}

    #logger.info(block)
    # get title
    try:
        am[u'seq']=int(unws(block[0]).split()[1])
    except ValueError:
        am[u'seq']=unws(block[0]).split()[1]
    except IndexError:
        logger.warn("%s wrong seq %s" % (datetime.now().isoformat(), block[0]))
        am[u'seq']=unws(block[0])
    del block[0]

    strip(block)

    # find and strip justification
    i=len(block)-1
    while i>2 and not (unws(block[i])=="Justification" and block[i].startswith(' ' * 6)):
        i-=1
    if i>2:
        if i<len(block)-1 and (not unws(block[i+1]) or not block[i+1].startswith(' ') ):
            am['justification']='\n'.join(block[i+2:])
            del block[i:]
            strip(block)
        else:
            logger.warn("%s wrong justification\n%s" % (datetime.now().isoformat(), '\n'.join(block[i:])))

    # get original language
    if 4<len(unws(block[-1]))<=6 and unws(block[-1]).startswith('Or.'):
        am['orig_lang']=unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    i=len(block)-1
    while (i>2 and
           not ((block[i].endswith("     Amendment") or
                 block[i].endswith("     PARTICULARS") or
                 block[i].endswith("     Remedy") or
                 block[i].endswith("     Amended text") or
                 block[i].endswith("     Amendement") or
                 block[i].endswith("           Amendments by Parliament") or
                 block[i].endswith("           Proposal for rejection") or
                 block[i].endswith("           Proposal for a rejection") or
                 block[i].endswith("           Does not affect English version") or
                 block[i].endswith("           (Does not affect English version)") or
                 block[i].endswith("      Amendment by Parliament")) and
                len(block[i])>33) and
           not (unws(block[i])=='Text proposed by the Commission' or
                unws(block[i]) in types)):
        i-=1
    if i>2:
        #if block[i].endswith("               Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq=False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceding original text
            j=i
            while (j>2 and not (unws(block[j]) in types or unws(block[j])=='Text proposed by the Commission')):
                j-=1
            if j>2: i=j
            seq=True; key='old'
        elif unws(block[i])=='Text proposed by the Commission' or block[i].strip() in types:
            seq=True; key='old'
        # throw headers
        del block[i]
        while i<len(block) and not unws(block[i]): del block[i]        # skip blank lines
        mid=max([len(x) for x in block])/2
        while i<len(block):
            if seq:
                if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key='new'
                    del block[i]
                    continue
                try: am[key].append(block[i])
                except KeyError: am[key]=[block[i]]
                del block[i]
                continue
            # only new, old is empty
            if block[i].startswith('         '):
                try: am['new'].append(unws(block[i]))
                except KeyError: am['new']=[unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind('  ')
            # only old, new is empty
            if newstart < 6:
                try: am['old'].append(unws(block[i]))
                except KeyError: am['old']=[unws(block[i])]
                del block[i]
                continue
            #mid=len(block[i])/2
            #mid=40
            lsep=block[i].rfind('  ', 0, mid)
            # todo calculate both, and use the one closer to the center
            rsep=block[i].find('  ', mid)
            sep=None
            if abs(lsep-mid)<abs(rsep-mid):
                if abs(lsep-mid)<15:
                    sep=lsep
            else:
                if abs(rsep-mid)<15:
                    sep=rsep
            if sep:
                try: am['old'].append(unws(block[i][:sep]))
                except KeyError: am['old']=[unws(block[i][:sep])]
                try: am['new'].append(unws(block[i][sep:]))
                except KeyError: am['new']=[unws(block[i][sep:])]
            else:
                # no sane split found
                #logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try: am['old'].append(unws(block[i][:newstart]))
                except KeyError: am['old']=[unws(block[i][:newstart])]
                try: am['new'].append(unws(block[i][newstart:]))
                except KeyError: am['new']=[unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        logger.warn("%s no table\n%s" % (datetime.now().isoformat(), '\n'.join(block[i:])))
        am['content']=block[i:]
        return am

    i=0
    # find end of authors
    while (i<len(block) and
           unws(block[i]) and
           not unws(block[i]).lower().startswith('compromise') and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts): i+=1
    if i<len(block):
        if i>0:
            names=' '.join(block[:i])
            am['authors']=names
            #logger.info("names \n%s" % names)

            # convert to pt mep _ids
            for text in filter(None,splitNames(names)):
                mep=getMep(text,None,False)
                if mep:
                    try: am['meps'].append(mep['UserID'])
                    except KeyError: am['meps']=[mep['UserID']]
                else:
                    logger.info("fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am['authors']=rapporteur
            for text in filter(None,splitNames(rapporteur)):
                mep=getMep(text,None,False)
                if mep:
                    try: am['meps'].append(mep['UserID'])
                    except KeyError: am['meps']=[mep['UserID']]
                else:
                    logger.info("fix %s" % text)
        else:
            logger.info("%s no authors in Amendment %s" % (datetime.now().isoformat(), am['seq']))
    else:
        logger.warn("%s no boundaries in Amendment %s\n%s" % (datetime.now().isoformat(),
                                                              am['seq'],
                                                              '\n'.join(block)))
        am['rest']=block
        return am

    # handle compromise info
    i=0
    while (i<len(block) and
           unws(block[i]) and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts): i+=1
    if i<len(block) and i>0:
        am['compromise']=block[:i]
        del block[:i]
        strip(block)

    i=0
    while (i<len(block) and unws(block[i])):
        if unws(block[i]).split()[0] in locstarts:
            try: am['location'].append((' '.join(block[:i]),unws(block[i])))
            except KeyError: am['location']=[(' '.join(block[:i]),unws(block[i]))]
            del block[:i+1]
            i=0
        else:
            i+=1
    if len(block)>0 and ((len(block)==1 or
                          not unws(block[1])) and
                         unws(block[0])!='1' and
                         'location' in am):
        am['location'][-1]=(am['location'][-1][0],"%s %s" % (am['location'][-1][1],block[0]))
        del block[0]
        strip(block)

    if block:
        if not ((len(block)==3 and
                unws(block[0])=='1' and
                not unws(block[1]) and
                block[2].startswith("  ")) or
                (len(block)==2 and
                unws(block[0])=='1' and
                block[1].startswith("  "))):
            # ignore obvious footnotes
            logger.info("rest in Amendment %s\n%s" % (am['seq'],'\n'.join(block)))
    return am
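The split-column heuristic above hunts for a run of spaces nearest the line's midpoint; a standalone sketch of the same idea (omitting the not-found cases, which the original guards with its <15 off-centre check):

def split_columns(line, mid):
    # find double-space separators left and right of the midpoint and
    # keep whichever is closer, mirroring the lsep/rsep logic above
    lsep = line.rfind('  ', 0, mid)
    rsep = line.find('  ', mid)
    sep = lsep if abs(lsep - mid) < abs(rsep - mid) else rsep
    return line[:sep].strip(), line[sep:].strip()

split_columns("old text of the rule          new amended text", 25)
# -> ('old text of the rule', 'new amended text')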
Example #26
def merge_events(events, committees, agents):
    bydate = {}
    for event in events:
        if not event['date'] in bydate:
            bydate[event['date']] = [event]
        else:
            bydate[event['date']].append(event)
    #pprint.pprint(sorted([(k,[dict([(k1,v1) for k1,v1 in i.items() if k1!='text']) for i in v]) for k,v in bydate.items()]))
    res = []
    # merge items to events.
    for date, items in bydate.items():
        actors = {}  # collects items/actor for a given date
        for item in items:
            if not item.get('body'):
                # find out body, or drop
                body = stage2inst.get(item.get('type'))
                if body:
                    item[u'body'] = body
                elif item.get(
                        'type') == 'Final act published in Official Journal':
                    # this really has no body or all
                    res.append(item)
                    continue
                else:
                    logger.warn('unknown body: %s' % item.get('type'))
                    item[u'body'] = 'unknown'
            # new institution for this date
            if not item['body'] in actors:
                # new body for this date
                actors[item['body']] = item
                if 'doc' in actors[item['body']]:
                    docs = merge_new_docs(actors[item['body']]['doc'], item)
                    del actors[item['body']]['doc']
                    actors[item['body']][u'docs'] = docs
                cmts = getCommittee(item, committees)
                if cmts:
                    actors[item['body']][u'committees'] = sorted(
                        cmts, key=itemgetter('committee'))
                if item['body'] == 'EC':
                    actors[u'EC'][u'commission'] = sorted(
                        [{
                            u'DG': x['dg'],
                            u'Commissioner': x['commissioner']
                        } if x.get('commissioner') else {
                            u'DG': x['dg']
                        } for x in agents if x['body'] == 'EC'])
                continue
            # merge any docs
            if 'doc' in item:
                docs = merge_new_docs(item['doc'], item)
                for doc in docs:
                    skip = False
                    # update docs that are already in there but with a different 'type'
                    for cdoc in actors[item['body']].get('docs', []):
                        if cdoc.get('url') == doc.get('url') or cdoc.get(
                                'title') == doc.get('title'):
                            cdoc.update(doc)
                            skip = True
                            break
                    if skip: continue
                    try:
                        actors[item['body']][u'docs'].append(doc)
                    except KeyError:
                        actors[item['body']][u'docs'] = [doc]
                del item['doc']
            # merge any fields not yet in the actor
            actors[item['body']].update([(k, v) for k, v in item.items()
                                         if k not in actors[item['body']]])
        res.extend([x for x in actors.values() if x])
    #pprint.pprint(sorted(res, key=itemgetter('date')))
    #pprint.pprint(sorted([dict([(k1,v1) for k1,v1 in v.items() if k1!='text']) for v in res], key=itemgetter('date')))
    return res
Example #27
def merge_events(events,committees,agents):
    bydate={}
    for event in events:
        if not event['date'] in bydate:
            bydate[event['date']]=[event]
        else:
            bydate[event['date']].append(event)
    #pprint.pprint(sorted([(k,[dict([(k1,v1) for k1,v1 in i.items() if k1!='text']) for i in v]) for k,v in bydate.items()]))
    res=[]
    # merge items to events.
    for date, items in bydate.items():
        actors={} # collects items/actor for a given date
        for item in items:
            if not item.get('body'):
                # find out body, or drop
                body=stage2inst.get(item.get('type'))
                if body:
                    item[u'body']=body
                elif item.get('type')=='Final act published in Official Journal':
                    # this event has no single body (or rather, all of them)
                    res.append(item)
                    continue
                else:
                    logger.warn('unknown body: %s' % item.get('type'))
                    item[u'body']='unknown'
            # first item seen for this body on this date
            if item['body'] not in actors:
                actors[item['body']]=item
                if 'doc' in actors[item['body']]:
                    docs=merge_new_docs(actors[item['body']]['doc'], item)
                    del actors[item['body']]['doc']
                    actors[item['body']][u'docs']=docs
                cmts=getCommittee(item,committees)
                if cmts:
                    actors[item['body']][u'committees']=sorted(cmts, key=itemgetter('committee'))
                if item['body']=='EC':
                    actors[u'EC'][u'commission']=sorted([{u'DG': x['dg'],
                                                        u'Commissioner': x['commissioner']} if x.get('commissioner') else {u'DG': x['dg']}
                                                       for x in agents if x['body']=='EC'])
                continue
            # merge any docs
            if 'doc' in item:
                docs=merge_new_docs(item['doc'], item)
                for doc in docs:
                    skip=False
                    # update docs that are already in there but have a different 'type'
                    for cdoc in actors[item['body']].get('docs',[]):
                        if cdoc.get('url')==doc.get('url') or cdoc.get('title')==doc.get('title'):
                            cdoc.update(doc)
                            skip=True
                            break
                    if skip: continue
                    try:
                        actors[item['body']][u'docs'].append(doc)
                    except KeyError:
                        actors[item['body']][u'docs']=[doc]
                del item['doc']
            # merge any fields not yet in the actor
            actors[item['body']].update([(k,v) for k,v in item.items() if k not in actors[item['body']]])
        res.extend([x for x in actors.values() if x])
    #pprint.pprint(sorted(res, key=itemgetter('date')))
    #pprint.pprint(sorted([dict([(k1,v1) for k1,v1 in v.items() if k1!='text']) for v in res], key=itemgetter('date')))
    return res
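
The bydate grouping at the top of this variant can be expressed with collections.defaultdict; a small equivalent sketch (not the scraper's own code):

from collections import defaultdict

def group_by_date(events):
    # build {date: [event, ...]}, the same shape as 'bydate' above
    bydate = defaultdict(list)
    for event in events:
        bydate[event['date']].append(event)
    return dict(bydate)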
Exemplo n.º 28
0
def getactors(node):
    res={}
    ax=[None,[]]
    for row in node.xpath('.//tr'):
        cells=row.xpath('./td/p')
        if not cells: continue

        # get role Rapporteur|Responsible|Rapporteur for the opinion|Opinions
        role=cells[0].xpath('text()')
        if role and unws(role[0]):
            if ax[0] and ax[1]: res[ax[0]]=sorted(ax[1])
            tmp=unws(role[0])[:-1]
            if tmp=="Rapporteur for the opinion":
                tmp="Rapporteur"
            ax=[tmp,[]]

        tmp=unws((cells[1].xpath('text()') or [''])[0])
        if ax[0] in ["Rapporteur", "Rapporteur for the opinion"] and tmp:
            name=' '.join(tmp.split()[:-1])
            item={u'group': tmp.split()[-1][1:-1],
                  u'name': name,
                  u'mepref': getMEPRef(name) }
            if len(cells)>2:
                item[u'docs']=getdoclist(cells[2])
            ax[1].append(item)
            continue
        if ax[0] in ["Opinions", "Responsible"] and tmp:
            tmp1=tmp.split(u' –',1)
            if len(tmp1)==2:
                (comid, rest)=tmp1
            elif len(tmp1)==1:
                if len(tmp1[0])==4 and tmp1[0].isupper():
                    (comid, rest)=(tmp1,'')
                elif len(tmp1[0])>4 and tmp1[0][4] in ['-', u'–', u':', u'*'] and tmp1[0][:4].isupper():
                    (comid, rest)=(tmp1[:4],tmp1[5:])
                else:
                    skip=False
                    for com in tmp.split(', '):
                        if com in COMMITTEE_MAP and len(com)==4:
                            ax[1].append({u'comid': com})
                            skip=True
                    if skip:
                        continue
            else:
                logger.warn("[!] unknown committee: %s" % tmp)
                raise
            if not comid:
                logger.warn("[!] unknown committee: %s" % tmp)
            item={u'comid': comid}
            if rest==' Decision: no opinion':
                item[u'response']=u'Decision: no opinion'
            if not rest and len(comid)>4:
                for com in comid.split(', '):
                    ax[1].append({u'comid': com})
                continue
            if len(cells)>2:
                tmp=unws((cells[2].xpath('text()') or [None])[0])
                if tmp:
                    name=' '.join(tmp.split()[:-1])
                    item.update({u'group': tmp.split()[-1][1:-1],
                                 u'name': name,
                                 u'mepref': getMEPRef(name)})
                    if len(cells)>3:
                        item[u'docs']=getdoclist(cells[3])
            ax[1].append(item)
    if ax[0] and ax[1]: res[ax[0]]=sorted(ax[1])
    return res
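
The Opinions/Responsible branch above has to cope with three shapes of cell text: "IMCO – note", a bare "IMCO", and a comma list such as "IMCO, ENVI". A standalone sketch of that dispatch (split_committee_cell and committee_ids are illustrative names; COMMITTEE_MAP plays the role of committee_ids in the real code):

def split_committee_cell(text, committee_ids):
    # return (comid, rest) pairs for the recognized shapes
    parts = text.split(u' –', 1)
    if len(parts) == 2:                      # "IMCO – note"
        return [(parts[0].strip(), parts[1].strip())]
    token = parts[0]
    if len(token) == 4 and token.isupper():  # bare "IMCO"
        return [(token, '')]
    # comma list: keep only known four-letter acronyms
    return [(c, '') for c in token.split(', ') if c in committee_ids and len(c) == 4]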
Exemplo n.º 29
0
def parse_block(block, url, reference, date, committee, rapporteur):
    am = {u"src": url, u"reference": reference, u"date": date, u"committee": committee}

    # logger.info(block)
    # get title
    try:
        am[u"seq"] = int(unws(block[0]).split()[1])
    except ValueError:
        am[u"seq"] = unws(block[0]).split()[1]
    except IndexError:
        logger.warn("%s wrong seq %s" % (datetime.now().isoformat(), block[0]))
        am[u"seq"] = unws(block[0])
    del block[0]

    strip(block)

    # find and strip justification
    i = len(block) - 1
    while i > 2 and not (unws(block[i]) == "Justification" and block[i].startswith(" " * 6)):
        i -= 1
    if i > 2:
        if i < len(block) - 1 and (not unws(block[i + 1]) or not block[i + 1].startswith(" ")):
            am["justification"] = "\n".join(block[i + 2 :])
            del block[i:]
            strip(block)
        else:
            logger.warn("%s wrong justification\n%s" % (datetime.now().isoformat(), "\n".join(block[i:])))

    # get original language
    if 4 < len(unws(block[-1])) <= 6 and unws(block[-1]).startswith("Or."):
        am["orig_lang"] = unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    i = len(block) - 1
    while (
        i > 2
        and not (
            (
                block[i].endswith("     Amendment")
                or block[i].endswith("     PARTICULARS")
                or block[i].endswith("     Remedy")
                or block[i].endswith("     Amended text")
                or block[i].endswith("     Amendement")
                or block[i].endswith("           Amendments by Parliament")
                or block[i].endswith("           Proposal for rejection")
                or block[i].endswith("           Proposal for a rejection")
                or block[i].endswith("           Does not affect English version")
                or block[i].endswith("           (Does not affect English version)")
                or block[i].endswith("      Amendment by Parliament")
            )
            and len(block[i]) > 33
        )
        and not (unws(block[i]) == "Text proposed by the Commission" or unws(block[i]) in types)
    ):
        i -= 1
    if i > 2:
        # if block[i].endswith("               Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq = False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for the preceding original text
            j = i
            while j > 2 and not (unws(block[j]) in types or unws(block[j]) == "Text proposed by the Commission"):
                j -= 1
            if j > 2:
                i = j
            seq = True
            key = "old"
        elif unws(block[i]) == "Text proposed by the Commission" or block[i].strip() in types:
            seq = True
            key = "old"
        # drop the column header line
        del block[i]
        while i < len(block) and not unws(block[i]):
            del block[i]  # skip blank lines
        mid = max([len(x) for x in block]) / 2
        while i < len(block):
            if seq:
                if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key = "new"
                    del block[i]
                    continue
                try:
                    am[key].append(block[i])
                except KeyError:
                    am[key] = [block[i]]
                del block[i]
                continue
            # only new, old is empty
            if block[i].startswith("         "):
                try:
                    am["new"].append(unws(block[i]))
                except KeyError:
                    am["new"] = [unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind("  ")
            # only old, new is empty
            if newstart < 6:
                try:
                    am["old"].append(unws(block[i]))
                except KeyError:
                    am["old"] = [unws(block[i])]
                del block[i]
                continue
            # mid=len(block[i])/2
            # mid=40
            lsep = block[i].rfind("  ", 0, mid)
            # todo calculate both, and use the one closer to the center
            rsep = block[i].find("  ", mid)
            sep = None
            if abs(lsep - mid) < abs(rsep - mid):
                if abs(lsep - mid) < 15:
                    sep = lsep
            else:
                if abs(rsep - mid) < 15:
                    sep = rsep
            if sep:
                try:
                    am["old"].append(unws(block[i][:sep]))
                except KeyError:
                    am["old"] = [unws(block[i][:sep])]
                try:
                    am["new"].append(unws(block[i][sep:]))
                except KeyError:
                    am["new"] = [unws(block[i][sep:])]
            else:
                # no sane split found
                # logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try:
                    am["old"].append(unws(block[i][:newstart]))
                except KeyError:
                    am["old"] = [unws(block[i][:newstart])]
                try:
                    am["new"].append(unws(block[i][newstart:]))
                except KeyError:
                    am["new"] = [unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        logger.warn("%s no table\n%s" % (datetime.now().isoformat(), "\n".join(block[i:])))
        am["content"] = block[i:]
        return am

    i = 0
    # find end of authors
    while (
        i < len(block)
        and unws(block[i])
        and not unws(block[i]).lower().startswith("compromise")
        and not istype(block[i])
        and not unws(block[i]).split()[0] in locstarts
    ):
        i += 1
    if i < len(block):
        if i > 0:
            names = " ".join(block[:i])
            am["authors"] = names
            # logger.info("names \n%s" % names)

            # convert to pt mep _ids
            for text in filter(None, splitNames(names)):
                mep = getMep(text, None, False)
                if mep:
                    try:
                        am["meps"].append(mep["UserID"])
                    except KeyError:
                        am["meps"] = [mep["UserID"]]
                else:
                    logger.info("fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am["authors"] = rapporteur
            for text in filter(None, splitNames(rapporteur)):
                mep = getMep(text, None, False)
                if mep:
                    try:
                        am["meps"].append(mep["UserID"])
                    except KeyError:
                        am["meps"] = [mep["UserID"]]
                else:
                    logger.info("fix %s" % text)
        else:
            logger.info("%s no authors in Amendment %s" % (datetime.now().isoformat(), am["seq"]))
    else:
        logger.warn("%s no boundaries in Amendment %s\n%s" % (datetime.now().isoformat(), am["seq"], "\n".join(block)))
        am["rest"] = block
        return am

    # handle compromise info
    i = 0
    while i < len(block) and unws(block[i]) and not istype(block[i]) and not unws(block[i]).split()[0] in locstarts:
        i += 1
    if i < len(block) and i > 0:
        am["compromise"] = block[:i]
        del block[:i]
        strip(block)

    i = 0
    while i < len(block) and unws(block[i]):
        if unws(block[i]).split()[0] in locstarts:
            try:
                am["location"].append((" ".join(block[:i]), unws(block[i])))
            except KeyError:
                am["location"] = [(" ".join(block[:i]), unws(block[i]))]
            del block[: i + 1]
            i = 0
        else:
            i += 1
    if len(block) > 0 and ((len(block) == 1 or not unws(block[1])) and unws(block[0]) != "1" and "location" in am):
        am["location"][-1] = (am["location"][-1][0], "%s %s" % (am["location"][-1][1], block[0]))
        del block[0]
        strip(block)

    if block:
        if not (
            (len(block) == 3 and unws(block[0]) == "1" and not unws(block[1]) and block[2].startswith("  "))
            or (len(block) == 2 and unws(block[0]) == "1" and block[1].startswith("  "))
        ):
            # ignore obvious footnotes
            logger.info("rest in Amendment %s\n%s" % (am["seq"], "\n".join(block)))
    return am
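
The column split in parse_block picks whichever run of two spaces lies closest to the middle of the widest line, within a tolerance; the same heuristic as a standalone helper (the name and tolerance default are illustrative):

def split_columns(line, mid, tolerance=15):
    # last separator left of mid, first separator right of mid
    lsep = line.rfind("  ", 0, mid)
    rsep = line.find("  ", mid)
    sep = lsep if abs(lsep - mid) < abs(rsep - mid) else rsep
    if sep >= 0 and abs(sep - mid) < tolerance:
        return line[:sep].strip(), line[sep:].strip()
    return None  # no sane split found; the caller falls back to naive splitting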
Exemplo n.º 30
0
def scrape(url, comid):
    root=fetch(url)
    lines=[x for x in root.xpath('//td[@class="contents"]/div/*') if unws(' '.join(x.xpath('.//text()')))]
    if not len(lines): return
    if not unws(' '.join(lines[2].xpath('.//text()')))=='DRAFT AGENDA':
        logger.error("NOT DRAFT AGENDA %s" % unws(' '.join(lines[2].xpath('.//text()'))))
    agenda={u'committee': comid,
            u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
            u'src': url,
        }
    i=1
    if unws(' '.join(lines[3].xpath('.//text()')))=="INTERPARLIAMENTARY COMMITTEE MEETING":
        logger.warn("skipping interparl com meet")
        return
    if unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
        agenda.update({u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
                       u'type': unws(' '.join(lines[3].xpath('.//text()'))),
                       u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
                       u'city': unws(' '.join(lines[5].xpath('.//text()'))),
                       u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
                       })
        i=7
    itemcnt=0
    item={}
    schedule=None
    res=[]
    while i < len(lines):
        line=lines[i]
        i+=1
        txt=unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp=toTime(txt)
        if tmp:
            schedule=tmp
            if i<len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera']=True
                i+=1
            continue

        if line.tag=='div':
            item[u'actors']=getactors(line)
            continue
        firsttoken=txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1]=='.' and firsttoken[:-1].isdigit() and itemcnt+1==int(firsttoken[:-1]):
            if item: res.append(item)
            itemcnt+=1
            item=copy.deepcopy(agenda)
            item.update({u'title': ' '.join(txt.split()[1:]),
                         u'seq_no': itemcnt,})
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken==u"·":
            if not 'list' in item: item[u'list']=[]
            tmp=' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d.%m.%Y at %H.%M")
                    except ValueError:
                        logger.warn('[$] unknown tabling deadline format %s' % unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt)==12:
            item[u'comdossier']=txt
            continue
        # ***I    2011/0373(COD)       COM(2011)0793 – C7-0454/2011
        tmp=getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        logger.debug("(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    return res
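
The tabling-deadline parser above tries two date layouts in turn before giving up; the same fallback chain as a standalone helper (a sketch, the function name is illustrative):

from datetime import datetime

def parse_tabling_deadline(text):
    # e.g. "10 July 2012, 12.00" or "10.07.2012 at 12.00"
    for fmt in ("%d %B %Y, %H.%M", "%d.%m.%Y at %H.%M"):
        try:
            return datetime.strptime(text.strip(), fmt)
        except ValueError:
            continue
    return None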
Exemplo n.º 31
0
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url)
    data = {u'active': True, 'meta': {u'url': url}} # return {'active': False}
    mepdiv=root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL,mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = { u'date': datetime.strptime(unws(d), "Born on %d %B %Y"),
                           u'place': unws(p) }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const={u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0])}
    data[u'Constituencies']=[const]
    try:
        const[u'party']=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active']=False
    else:
        group=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        data[u'Groups'] = [{ u'role': unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]),
                             u'group': group,
                             u'groupid': group_map[group]}]
    cdiv=root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data,u'Homepage',[unicode(x.get('href'),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data,u'Mail',[decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title=unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            addif(data,title,[unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data,u'Addresses',getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key=unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower()=='curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item={u'role': key,
                      u'abbr': unws(''.join(span.xpath('text()'))),
                      u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data
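
parseMember leans on an addif helper that is not shown in this example; a plausible minimal definition, assuming it merely skips empty values (an assumption, not the project's actual implementation):

def addif(data, key, value):
    # assumption: only store non-empty values
    if value:
        data[key] = value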
Exemplo n.º 32
0
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL, celexid))
    path.reverse()
    (code, lang) = celexid.split(":")[1:3]
    st = 6
    if len(code) > 6:
        if code[6].isalpha(): st = 7
        eurlex = {
            'id': {
                u'celexid': celexid,
                u'sector': code[0],
                u'year': code[1:5],
                u'doctype': code[5:st],
                u'refno': code[st:],
                u'lang': lang,
            }
        }
    else:
        eurlex = {
            'id': {
                u'celexid': celexid,
                u'sector': code[0],
                u'year': code[1:5],
                u'doctype': code[5:6],
                u'lang': lang,
            }
        }

    try:
        eurlex['id'][u'typeDesc'] = CELEXCODES[code[0]]['Document Types'][
            code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except KeyError:
        eurlex['id'][u'typeDesc'] = u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta'] = {u'src': "%s%s:NOT" % (EURLEXURL, celexid)}

    root = fetch("%s%s:NOT" % (EURLEXURL, celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]')) > 0:
        logger.warn('[!] nothing to scrape here: %s',
                    "%s%s:NOT" % (EURLEXURL, celexid))
        return
    eurlex[u'title'] = unws(
        root.xpath(
            '//h2[text()="Title and reference"]/following-sibling::p/text()')
        [0])
    # dates
    dates = root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest = unws(y).split(": ", 1)
        item = {}
        date = rest[:10]
        tail = rest[10:]
        if tail.startswith('; '):
            tail = tail[2:]
        if date == '99/99/9999': item[u'date'] = datetime(9999, 12, 31)
        elif date == '00/00/0000': item[u'date'] = datetime(1, 1, 1)
        elif date == '//': continue
        else:
            try:
                item[u'date'] = datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try:
                    item[u'date'] = datetime.strptime(date, u"%m/%d/%Y")
                except ValueError:
                    pass
        if len(tail):
            item['note'] = tail
        try:
            eurlex['dates'][title] = item
        except KeyError:
            eurlex['dates'] = {title: item}
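
The id block above slices a CELEX number into its parts; a worked example of that decomposition (the values are what the code would compute for this input):

code = "32011L0092"            # from a celexid like "CELEX:32011L0092:EN"
st = 7 if len(code) > 6 and code[6].isalpha() else 6
parts = {
    'sector': code[0],         # '3'    (sector 3: legislation)
    'year': code[1:5],         # '2011'
    'doctype': code[5:st],     # 'L'    (directive)
    'refno': code[st:],        # '0092'
}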
Exemplo n.º 33
0
def scrape_epagents(table):
    heading=''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible=None
    if heading in [ "Committee responsible", "Former committee responsible"]:
        responsible=True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible=False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems=table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips=[t.xpath('text()')[0] if len(t.xpath('text()'))>0 else groupurlmap[t.xpath("a")[0].get('href')]
          for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
    shadows={}
    for shadow, group in izip_longest(shadowelems, tips):
        committee=shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee]=[]
        if group=='NI': group=u'NI' # ensure unicode
        mep={u'name': unicode(shadow.xpath('text()')[0]),
             u'group': unicode(group)}
        tmp=getMEPRef(shadow.xpath('text()')[0])
        if tmp:
           mep[u'mepref']=tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the unnecessary shadow elements - so the following regular lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent=todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents=[]
    for agent in lst2obj(table,epagents,1):
        agent[u'responsible']=responsible
        agent[u'body']=u'EP'
        if agent.get('rapporteur'):
            meps=[]
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion']=None
                    continue
                tmp=getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp,
                                 u'group': mep['group'],
                                 u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'],
                                 u'name': mep['name']})
            agent[u'rapporteur']=meps

        abbr=agent['committee'][:4]
        if abbr=='BUDE': abbr='BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full']=agent['committee']
            if agent['committee'][4]==' ' and abbr.isalpha():
                agent[u'committee']=abbr
        else:
            agent[u'committee_full']=agent['committee'][5:]
            agent[u'committee']=abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows']=shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents
Exemplo n.º 34
0
def scrape_epagents(table):
    heading = ''.join(
        table.xpath('.//td[@class="players_committee"]')[0].xpath(
            ".//text()")).strip()
    responsible = None
    if heading in ["Committee responsible", "Former committee responsible"]:
        responsible = True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible = False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems = table.xpath(
        '//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a'
    )
    tips = [
        t.xpath('text()')[0] if len(t.xpath('text()')) > 0 else
        groupurlmap[t.xpath("a")[0].get('href')] for t in table.xpath(
            '//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]'
        )
    ]
    shadows = {}
    for shadow, group in izip_longest(shadowelems, tips):
        committee = shadow.xpath(
            './ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee] = []
        if group == 'NI': group = u'NI'  # ensure unicode
        mep = {u'name': unicode(shadow.xpath('text()')[0]), u'group': group}
        tmp = getMEPRef(shadow.xpath('text()')[0])
        if tmp:
            mep[u'mepref'] = tmp
        else:
            raise IndexError
        shadows[committee].append(mep)
    # delete the unnecessary shadow elements - so the following regular lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent = todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents = []
    for agent in lst2obj(table, epagents, 1):
        agent[u'responsible'] = responsible
        agent[u'body'] = u'EP'
        if agent.get('rapporteur'):
            meps = []
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith(
                        "The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion'] = None
                    continue
                tmp = getMEPRef(mep['name'])
                if tmp:
                    meps.append({
                        u'mepref': tmp,
                        u'group': mep['group'],
                        u'name': mep['name']
                    })
                else:
                    raise IndexError
            agent[u'rapporteur'] = meps

        abbr = agent['committee'][:4]
        if abbr == 'BUDE': abbr = 'BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full'] = agent['committee']
            if agent['committee'][4] == ' ' and abbr.isalpha():
                agent[u'committee'] = abbr
        else:
            agent[u'committee_full'] = agent['committee'][5:]
            agent[u'committee'] = abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows'] = shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents
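
The committee-name handling in both variants peels a four-letter acronym off strings like "IMCO Internal Market and Consumer Protection"; the same normalization as a standalone sketch (committee_map stands in for COMMITTEE_MAP):

def split_committee(name, committee_map):
    abbr = name[:4]
    if abbr == 'BUDE':            # alias observed in the data
        abbr = 'BUDG'
    if abbr in committee_map:
        return abbr, name[5:]     # acronym plus full committee name
    return None, name             # unknown acronym: keep the full string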