Example #1
import json

# assumes module-level helpers from the surrounding scraper: fetch_raw(),
# logger, and the activitymap dict of activity types
def getactivities(mepid, terms=[8]):
    urltpl = 'http://www.europarl.europa.eu/meps/en/%s/see_more.html?type=%s&leg=%s&index=%s'
    actions={}
    for act_type in activitymap:
        actions[act_type]={}
        for term in terms:
            term=str(term)
            actions[act_type][term]=[]
            idx=0
            # page through the paginated json endpoint until it signals the end
            while True:
                _url = urltpl % (mepid,act_type,term,idx)
                try:
                    res=fetch_raw(_url, ignore=[500])
                except Exception:
                    logger.warning("failed to fetch %s" % _url)
                    break
                if res is None:
                    break
                if '<h2>Error while collecting data</h2>' in res: break
                ret=json.loads(res)
                actions[act_type][term].extend(ret['documentList'])
                idx=ret['nextIndex']
                if idx in [-1,0]:
                    break
            # prune empty terms/types so only real activity lists remain
            if not actions[act_type][term]:
                del actions[act_type][term]
        if not actions[act_type]:
            del actions[act_type]

    return actions
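
Every example in this section leans on a project-level fetch_raw() helper that is not shown. Below is a minimal sketch of its assumed interface, reconstructed purely from the call sites; the real helper presumably adds retries, throttling and logging.

import requests

# Hypothetical stand-in for fetch_raw(), inferred from how the examples
# call it; not the project's real implementation.
def fetch_raw(url, binary=False, ignore=None, asjson=None, res=False):
    if asjson is not None:
        r = requests.post(url, json=asjson)   # JSON POST, cf. example #6
    else:
        r = requests.get(url)
    if ignore and r.status_code in ignore:
        return None                           # callers treat None as "stop"
    r.raise_for_status()
    if res:
        return r                              # full Response, cf. .json()
    return r.content if binary else r.text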
Example #2
import datetime
import html
from urllib.parse import urljoin
from lxml.etree import fromstring  # lxml assumed, as elsewhere in this module

def get_all_dossiers(**kwargs):
    # walk backwards from the current year; OEIL records reach back to 1972
    for year in range(datetime.date.today().year, 1971, -1):
        tree = fetch(
            r'https://oeil.secure.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/*\(*\)'
            % (year))
        tmp = tree.xpath(
            '//span[@class="ep_name" and (starts-with(normalize-space(),"Results found :") or starts-with(normalize-space(),"Result found :"))]/text()'
        )
        if not tmp:
            log(1, "no dossiers found for %d" % year)
            raise ValueError("failed to find number of dossiers for year %d" %
                             year)
        tmp = unws(tmp[0])
        count = int(tmp[tmp.index(":") + 1:])
        log(4, "year %d, count %d" % (year, count))
        # fetch the full result list as xml, using the count found above as limit
        tree = fromstring(
            fetch_raw(
                r'https://oeil.secure.europarl.europa.eu/oeil/popups/printresultlist.xml?q=objectReferenceN:N-%s/*\(*\)&lang=en&s1&all&limit=%s&lang=en'
                % (year, count)).encode("utf8"))
        items = tree.xpath('//item')
        i = 0
        for item in items:
            url = html.unescape(
                urljoin(BASE_URL, str(item.xpath('./link/text()')[0])))
            ref = unws(item.xpath('./reference/text()')[0])
            if '*' in ref: ref = ref[:ref.index('*')]
            log(4, 'adding dossier scraping job %s' % url)
            payload = dict(kwargs)
            payload['url'] = url
            add_job('dossier', payload=payload)
            i += 1
        if i != count: log(1, "total %d, expected %d" % (i, count))
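
Besides fetch()/fetch_raw(), this example uses log(), add_job() and unws(). The first two are project plumbing (logging and job-queueing); unws() is plausibly just a whitespace normalizer along these lines:

# Assumed behaviour of unws(): collapse whitespace runs and strip the ends.
def unws(txt):
    return ' '.join(txt.split())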
Example #3
import os
from tempfile import mkstemp

def getraw(pdf):
    (fd, fname)=mkstemp()
    fd=os.fdopen(fd, 'wb')
    # fetch the pdf as raw bytes; round-tripping it through a text decode
    # and .encode('utf-8') would corrupt the binary stream
    fd.write(fetch_raw(pdf, binary=True))
    fd.close()
    text=pdftotext('-nopgbrk',
                   '-layout',
                   fname,
                   '-')
    os.unlink(fname)
    return text
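
pdftotext is called above like a Python function, which suggests a wrapper around poppler's pdftotext binary (the sh library provides exactly this style). A standard-library equivalent, assuming poppler-utils is on PATH:

import subprocess

# Minimal stand-in for the pdftotext() wrapper; the final '-' argument makes
# the binary write the extracted text to stdout, which we capture and return.
def pdftotext(*args):
    return subprocess.run(['pdftotext', *args],
                          check=True, capture_output=True, text=True).stdout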
Example #4
import requests
from lxml.etree import fromstring  # lxml assumed, as elsewhere in this module

def getXML(url):
    try:
        raw = fetch_raw(url, binary=True)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404: return None  # missing document, not an error
        log(1, "failed to fetch xml from url: %s" % url)
        raise
    try:
        return fromstring(raw)
    except Exception:
        log(1, "failed to parse xml from url: %s" % url)
        raise
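
A note on why getXML() fetches bytes: lxml rejects str input that carries an XML encoding declaration, so parsing the raw bytes avoids a spurious ValueError:

from lxml.etree import fromstring

doc = '<?xml version="1.0" encoding="UTF-8"?><root/>'
try:
    fromstring(doc)            # str with an encoding declaration...
except ValueError:
    pass                       # ...is rejected by lxml
root = fromstring(doc.encode('utf-8'))  # bytes parse fine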
Example #5
def getraw(pdf):
    log(5, "fetching url: %s" % pdf)
    (fd, fname)=mkstemp()
    fd=os.fdopen(fd, 'wb')
    fd.write(fetch_raw(pdf, binary=True))
    fd.close()
    text=pdftotext('-nopgbrk',
                   '-layout',
                   fname,
                   '-')
    os.unlink(fname)
    return text
Example #6
seen = set()  # de-duplicates document URLs across crawl() invocations

def crawl(year, term, **kwargs):
    url = 'https://www.europarl.europa.eu/RegistreWeb/services/search'
    params = {
        "references": [],
        "authors": [],
        "typesDoc": ["PPVD"],
        "eurovoc": None,
        "codeAuthor": None,
        "fulltext": None,
        "searchLanguages": ["EN"],
        "relations": [],
        "allAuthorities": [],
        "dateCriteria": {
            "field": "DATE_DOCU",
            "startDate": None,
            "endDate": None
        },
        "sortAndOrder": "DATE_DOCU_ASC"
    }
    params['year'] = str(year)
    params['leg'] = str(term)
    params["currentPage"] = 1
    params["nbRows"] = 10

    # page through the search results until a page comes back empty
    res = fetch_raw(url, asjson=params, res=True).json()
    while res.get('documents'):
        for d in res.get('documents'):
            if d.get("fragDocu") != "RCV": continue  # roll-call votes only
            for f in d.get('formatDocs', []):
                if f.get('typeDoc', '') != 'text/xml': continue
                if f['url'] in seen: continue
                seen.add(f['url'])
                payload = dict(kwargs)
                payload['url'] = f['url']
                add_job('pvote', payload=payload)
        params["currentPage"] += 1
        res = fetch_raw(url, asjson=params, res=True).json()
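
A hypothetical driver for crawl(), assuming the 9th parliamentary term (2019-2024); add_job() then receives one payload per roll-call vote XML:

# Hypothetical invocation: queue vote-scraping jobs for every year of term 9.
for year in range(2019, 2025):
    crawl(year, term=9)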
Example #7
def _get(template, term, _date):
    url = template % (_date.strftime("%Y/%m-%d"), term, _date.strftime("(%Y)%m-%d"))
    try:
        raw = fetch_raw(url, binary=True)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404: return None, None
        log(1, "failed to fetch xml from url: %s" % url)
        raise
    try:
        xml = fromstring(raw)
    except Exception:
        log(1, "failed to parse xml from url: %s" % url)
        raise
    return url, xml
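
_get() expects a URL template with three %s slots (a date path, the term, and a date suffix). An illustrative call; the template below is made up, the real one is supplied by the caller:

import datetime

tpl = 'https://example.invalid/doceo/%s/PV-%s%s-RCV_EN.xml'  # illustrative only
url, xml = _get(tpl, 9, datetime.date(2024, 3, 13))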
Example #8
def getraw(pdf):
    (fd, fname)=mkstemp()
    fd=os.fdopen(fd, 'wb')
    fd.write(fetch_raw(pdf, binary=True))
    fd.close()
    # note: '-nopgbrk' is deliberately not passed here, so the page breaks
    # survive for unpaginate() below
    text=pdftotext('-layout',
                   fname,
                   '-')
    os.unlink(fname)
    # remove pagebreaks and footers
    return unpaginate(text,pdf)
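
unpaginate() is not shown. Since '-nopgbrk' is left out above, pdftotext keeps the form-feed characters (\x0c) between pages, and the helper presumably uses them to drop page breaks and repeated footers. A rough sketch under that assumption (the real helper likely does more; the url argument hints at per-document handling):

# Rough, assumed reconstruction of unpaginate(); not the real implementation.
def unpaginate(text, url):
    pages = [p for p in text.split('\x0c') if p.strip()]
    lines = [p.rstrip().splitlines() for p in pages]
    # drop a trailing footer line if the same line closes every page
    if len(lines) > 1 and len({l[-1].strip() for l in lines}) == 1:
        lines = [l[:-1] for l in lines]
    return '\n'.join('\n'.join(l) for l in lines)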
Example #9
def scrape(id, **kwargs):
    # we ignore the /meps/en/<id>/<name>/home path, since we can get all info also from other pages
    url = "http://www.europarl.europa.eu/meps/en/%s/name/cv" % id
    # patch up the returned html: it contains some bad tags ("</br>") which
    # make the lxml soup parser drop some branches of the DOM
    xml = fetch_raw(url)
    xml = xml.replace("</br>", "<br/>")
    root = fromstring(xml)
    sidebar_check(root, url)

    mep = {
        'UserID': id,
        'Name': mangleName(
            unws(' '.join(root.xpath('//span[@class="sln-member-name"]/text()'))),
            id),
        'Photo': "https://www.europarl.europa.eu/mepphoto/%s.jpg" % id,
        'meta': {'url': url},
        'Twitter': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Twitter"]/@href')
        ],
        'Homepage': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Website"]/@href')
        ],
        'Facebook': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Facebook"]/@href')
        ],
        'Instagram': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Instagram"]/@href')
        ],
        'Mail': [
            deobfus_mail(x) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="E-mail"]/@href')
        ],
        'Addresses': parse_addr(root),
        'active': False,
    }

    mep = addchangednames(mep)

    birthdate = root.xpath('//time[@id="birthDate"]/text()')
    if birthdate:
        mep['Birth'] = {
            'date': datetime.strptime(unws(birthdate[0]), "%d-%m-%Y")
        }
        place = root.xpath('//time[@id="birthDate"]/following-sibling::text()')
        if place:
            tmp = unws(' '.join(place))
            if tmp.startswith(", "): tmp = tmp[2:]
            mep['Birth']['place'] = tmp

    death = root.xpath('//time[@id="deathDate"]/text()')
    if death:
        mep['Death'] = datetime.strptime(unws(death[0]), "%d-%m-%Y")

    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]

    if body.xpath('.//h1[text()="Curriculum vitae "]'):
        if not body.xpath('.//h3[@id="no_cv_available"]'):
            updated = unws(body.xpath(
                './/p[@class="small"]/strong[contains(text(),"Updated: ")]/text()'
            )[0])
            mep['CV'] = {'updated': datetime.strptime(updated, "Updated: %d/%m/%Y")}
            # one CV section per h4 heading; the items are the entries of the
            # ul that follows it
            mep['CV'].update({
                unws(''.join(title.xpath(".//text()"))): [
                    unws(''.join(item.xpath(".//text()"))).replace("-...", "- ...")
                    for item in title.xpath("following-sibling::ul/li")
                ]
                for title in body.xpath('.//h4')
            })

    # assistants
    url = "http://www.europarl.europa.eu/meps/en/%s/name/assistants" % id
    root = fetch(url)
    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Assistants":
        for h4 in body.xpath('.//h4'):
            title = unws(''.join(h4.xpath(".//text()")))
            assistants = [
                unws(''.join(item.xpath(".//text()")))
                for item in h4.xpath("../div//span")
            ]
            if title in ['Accredited assistants', 'Local assistants']:
                if 'assistants' not in mep: mep['assistants'] = {}
                title = title.lower().split()[0]
                if assistants: mep['assistants'][title] = assistants
            elif title in [
                    'Accredited assistants (grouping)',
                    'Local assistants (grouping)', 'Service providers',
                    'Trainees', 'Paying agents (grouping)', 'Paying agents',
                    'Assistants to the Vice-Presidency/to the Quaestorate'
            ]:
                if 'assistants' not in mep: mep['assistants'] = {}
                title = title.lower()
                if assistants: mep['assistants'][title] = assistants
            else:
                log(2,
                    'unknown title for assistants "{}" {}'.format(title, url))
                raise ValueError('unknown assistants title: "%s"' % title)

    # declarations
    root = fetch("http://www.europarl.europa.eu/meps/en/%s/name/declarations" %
                 id)
    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Declarations":
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key == 'Declaration of financial interests':
                key = 'Financial Declarations'
                mep[key] = []
                for pdf in title.xpath('./following-sibling::ul/li/a'):
                    url = pdf.xpath('./@href')[0]
                    try:
                        mep[key].append(findecl.scrape(url))
                    except Exception:
                        log(1, "failed to extract findecl from %s" % url)
            elif key == 'Declarations of participation by Members in events organised by third parties':
                key = 'Declarations of Participation'
                mep[key] = []
                # reversed order, otherwise newer ones get prepended and
                # mess up the diff
                for pdf in title.xpath('./following-sibling::ul/li/a')[::-1]:
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            elif key in [
                    'Declaration of good conduct',
                    'Voluntary confirmation on the use of the General Expenditure Allowance'
            ]:
                mep[key] = []
                # reversed order, otherwise newer ones get prepended and
                # mess up the diff
                for pdf in title.xpath('./following-sibling::ul/li/a')[::-1]:
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            else:
                log(
                    2,
                    'unknown type of declaration: "%s" http://www.europarl.europa.eu/meps/en/%s/name/declarations'
                    % (key, id))
                raise ValueError('unknown declaration type: "%s"' % key)

    # history
    parse_history(id, root, mep)
    process(mep,
            id,
            db.mep,
            'ep_meps',
            mep['Name']['full'],
            nopreserve=(['Addresses'], ['assistants']),
            onchanged=onchanged)

    # when the module is run directly (debugging), hand the record back;
    # in normal scraper runs process() has already stored it
    if __name__ == '__main__':
        return mep
    del mep
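
deobfus_mail() is another helper that is not shown. EP pages obfuscate mailto links; a common scheme substitutes '[at]'/'[dot]' and reverses the address, so a plausible (unverified) reconstruction is:

# Guess at deobfus_mail(); the exact obfuscation scheme is an assumption.
def deobfus_mail(href):
    addr = href.split('mailto:', 1)[-1]
    return addr.replace('[at]', '@').replace('[dot]', '.')[::-1]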